In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder,normalize,MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve
import seaborn as sns

In [None]:
import tensorflow as tf

# GPU device check (informational only).
# No other visible cell in this notebook references `tf` or `device_name`,
# so a missing GPU is reported instead of raised: raising here would abort
# "Run All" on a CPU-only machine before any data is even loaded.
device_name = tf.test.gpu_device_name()  # empty string when no GPU is found
if device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found; continuing without a TensorFlow GPU.')

In [None]:
import torch

# Pick the PyTorch device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

Read data

In [None]:
# Reading data
# Load the labelled training set and the competition test set from the
# zipped CSV files in the dataset input directory.
train = pd.read_csv('../input/higgs-boson/training.zip')
test = pd.read_csv('../input/higgs-boson/test.zip')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

train shape: 100 rows × 33 columns

test shape: 5 rows × 31 columns

In [None]:
# List the column names of both frames so their schemas can be compared;
# the extra '\n' argument only adds a blank line between the two listings.
print(train.columns.values,'\n')
print(test.columns.values)

In [None]:
# Remove the 'Weight' column from the training frame so it is not used as a
# feature. errors='ignore' keeps this cell re-runnable: once the column is
# gone, running the cell again is a no-op instead of a KeyError.
train = train.drop(columns=['Weight'], errors='ignore')

In [None]:
# Class balance of the target: print the per-label counts, then draw the
# same counts as a bar chart.
label_counts = train['Label'].value_counts()
print(label_counts)

rcParams['figure.figsize'] = 10,5
sb.barplot(x=label_counts.index, y=label_counts.values)
plt.title('Label counts')
plt.show()

Data Preparation

In [None]:
# Encode the target column as integers (label encoding, not dummy/one-hot
# columns). The submission cell later maps 1 back to 's' and 0 back to 'b'.

enc = LabelEncoder()

# Overwrites train['Label'] in place with the integer codes.
train['Label'] = enc.fit_transform(train['Label'])
train.head()

In [None]:
# Target vector: the encoded Label column.
y = train["Label"]
# NOTE: these are plain assignments, not copies. X is the same object as
# `train` and X_test is the same object as `test`, so any in-place operation
# on X / X_test also changes train / test.
X = train
X_test = test

In [None]:
# Use EventId as the row index of both frames so it is not treated as a
# feature. The call is in place, and X / X_test were assigned from train /
# test without copying, so train and test are re-indexed as well.
for frame in (X, X_test):
    frame.set_index(['EventId'], inplace=True)

# Separate the features from the target: X becomes a new frame without Label.
X = X.drop(columns=['Label'])

X.head()

In [None]:
# Preview the test features (now indexed by EventId).
X_test.head()

In [None]:
# Summary statistics of the training frame. At this point `train` is indexed
# by EventId (set in place through the X alias) and still contains Label.
train.describe()

In [None]:
# #Normalizing

# no = 1

# X["PRI_jet_all_pt"]=((X["PRI_jet_all_pt"]-X["PRI_jet_all_pt"].min())/(X["PRI_jet_all_pt"].max()-X["PRI_jet_all_pt"].min()))*no
# X_test["PRI_jet_all_pt"]=((X_test["PRI_jet_all_pt"]-X_test["PRI_jet_all_pt"].min())/(X_test["PRI_jet_all_pt"].max()-X_test["DER_mass_MMC"].min()))*no

# X["PRI_jet_subleading_pt"]=((X["PRI_jet_subleading_pt"]-X["PRI_jet_subleading_pt"].min())/(X["PRI_jet_subleading_pt"].max()-X["PRI_jet_subleading_pt"].min()))*no
# X_test["PRI_jet_subleading_pt"]=((X_test["PRI_jet_subleading_pt"]-X_test["PRI_jet_subleading_pt"].min())/(X_test["PRI_jet_subleading_pt"].max()-X_test["PRI_jet_subleading_pt"].min()))*no

# X["PRI_jet_leading_pt"]=((X["PRI_jet_leading_pt"]-X["PRI_jet_leading_pt"].min())/(X["PRI_jet_leading_pt"].max()-X["PRI_jet_leading_pt"].min()))*no
# X_test["PRI_jet_leading_pt"]=((X_test["PRI_jet_leading_pt"]-X_test["PRI_jet_leading_pt"].min())/(X_test["PRI_jet_leading_pt"].max()-X_test["PRI_jet_leading_pt"].min()))*no

# X["PRI_met_sumet"]=((X["PRI_met_sumet"]-X["PRI_met_sumet"].min())/(X["PRI_met_sumet"].max()-X["PRI_met_sumet"].min()))*no
# X_test["PRI_met_sumet"]=((X_test["PRI_met_sumet"]-X_test["PRI_met_sumet"].min())/(X_test["PRI_met_sumet"].max()-X_test["PRI_met_sumet"].min()))*no

# X["DER_sum_pt"]=((X["DER_sum_pt"]-X["DER_sum_pt"].min())/(X["DER_sum_pt"].max()-X["DER_sum_pt"].min()))*no
# X_test["DER_sum_pt"]=((X_test["DER_sum_pt"]-X_test["DER_sum_pt"].min())/(X_test["DER_sum_pt"].max()-X_test["DER_sum_pt"].min()))*no

# X["DER_mass_jet_jet"]=((X["DER_mass_jet_jet"]-X["DER_mass_jet_jet"].min())/(X["DER_mass_jet_jet"].max()-X["DER_mass_jet_jet"].min()))*no
# X_test["DER_mass_jet_jet"]=((X_test["DER_mass_jet_jet"]-X_test["DER_mass_jet_jet"].min())/(X_test["DER_mass_jet_jet"].max()-X_test["DER_mass_jet_jet"].min()))*no

# X["DER_pt_h"]=((X["DER_pt_h"]-X["DER_pt_h"].min())/(X["DER_pt_h"].max()-X["DER_pt_h"].min()))*no
# X_test["DER_pt_h"]=((X_test["DER_pt_h"]-X_test["DER_pt_h"].min())/(X_test["DER_pt_h"].max()-X_test["DER_pt_h"].min()))*no

# X["DER_mass_vis"]=((X["DER_mass_vis"]-X["DER_mass_vis"].min())/(X["DER_mass_vis"].max()-X["DER_mass_vis"].min()))*no
# X_test["DER_mass_vis"]=((X_test["DER_mass_vis"]-X_test["DER_mass_vis"].min())/(X_test["DER_mass_vis"].max()-X_test["DER_mass_vis"].min()))*no

# X["DER_mass_transverse_met_lep"]=((X["DER_mass_transverse_met_lep"]-X["DER_mass_transverse_met_lep"].min())/(X["DER_mass_transverse_met_lep"].max()-X["DER_mass_transverse_met_lep"].min()))*no
# X_test["DER_mass_transverse_met_lep"]=((X_test["DER_mass_transverse_met_lep"]-X_test["DER_mass_transverse_met_lep"].min())/(X_test["DER_mass_transverse_met_lep"].max()-X_test["DER_mass_transverse_met_lep"].min()))*no

# X["DER_mass_MMC"]=((X["DER_mass_MMC"]-X["DER_mass_MMC"].min())/(X["DER_mass_MMC"].max()-X["DER_mass_MMC"].min()))*no
# X_test["DER_mass_MMC"]=((X_test["DER_mass_MMC"]-X_test["DER_mass_MMC"].min())/(X_test["DER_mass_MMC"].max()-X_test["DER_mass_MMC"].min()))*no


# X.head()

In [None]:
# # normalize the data attributes
# X = X.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))

# X_test = X_test.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))


# X.head()

In [None]:
#Normalizing
# sklearn's normalize() with default arguments rescales each ROW (sample) to
# unit L2 norm and returns a NumPy array, so X and X_test stop being
# DataFrames here (index and column names are lost).
# NOTE(review): per-feature scaling (e.g. MinMaxScaler, already imported) may
# have been the intent -- confirm. Any change must be mirrored where the
# competition test set is normalized before prediction.

from sklearn.preprocessing import normalize

X = normalize(X)
X_test = normalize(X_test)

In [None]:
# print(X.isnull().sum(),'\n')
# print(X_test.isnull().sum())

In [None]:
#X = X.replace(-999.000,np.nan)
#X.head()

In [None]:
#X_test = X_test.replace(-999.000,np.nan)

In [None]:
#X_test.head()

In [None]:
#X = X.replace(-999.000,0)
#X_test = X_test.replace(-999.000,0)
#X.head()

In [None]:
#print(X.isnull().sum(),'\n')
#print(X_test.isnull().sum())

In [None]:
#X.fillna(X.median(), inplace=True)
#X_test.fillna(X_test.median(), inplace=True)

#X.head()

In [None]:
#X.tail(1000)

In [None]:
# Hold out 20% of the labelled rows for evaluation (shuffled, seeded split).
# NOTE: this rebinds X_test, which until now held the normalized competition
# test features; from here on X_test / y_test are the labelled hold-out split.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 10,test_size=0.2,shuffle =True)

1- Logistic Regression Model

In [None]:
# Fit a logistic regression with default settings on the training split.
logistic_regression= LogisticRegression()
logistic_regression.fit(X_train,y_train)
# Hold-out predictions; y_pred is reassigned for every model in the
# evaluation cell before it is read again.
y_pred=logistic_regression.predict(X_test)

2- Random Forest Model

In [None]:
# Fit a random forest on the training split only (X_train / y_train); the
# hold-out split is kept for evaluation.
# random_state is fixed to the same seed used for the train/test split and
# the decision tree, so the reported scores are reproducible across runs.
random_forest = RandomForestClassifier(random_state=10)

random_forest.fit(X_train, y_train)

3- Decision Tree Model

In [None]:
# Decision tree configuration: entropy split criterion, unlimited depth,
# best-split strategy, fixed seed.
tree_settings = {
    'criterion': 'entropy',
    'max_depth': None,
    'splitter': 'best',
    'random_state': 10,
}
decisionTreeModel = DecisionTreeClassifier(**tree_settings)

# Train on the training split.
decisionTreeModel.fit(X_train,y_train)

4- Gradient Boosting Model

In [None]:
# gradientBoostingModel = GradientBoostingClassifier(loss = 'deviance',
#                                                    learning_rate = 0.01,
#                                                    n_estimators = 100,
#                                                    max_depth = 30,
#                                                    random_state=10)

# gradientBoostingModel.fit(X_train,y_train)

5- Nearest Neighbors Model

In [None]:
# k-nearest-neighbours configuration: 7 neighbours, distance-weighted votes,
# brute-force neighbour search.
knn_settings = {
    'n_neighbors': 7,
    'weights': 'distance',
    'algorithm': 'brute',
}
KNeighborsModel = KNeighborsClassifier(**knn_settings)

# Train on the training split.
KNeighborsModel.fit(X_train,y_train)

6- Stochastic Gradient Descent Model

In [None]:
# SGDClassifier = SGDClassifier(loss = 'hinge', 
#                               penalty = 'l1',
#                               learning_rate = 'optimal',
#                               random_state = 10, 
#                               max_iter=100)

# SGDClassifier.fit(X_train,y_train)

7- Support Vector Machine Model

In [None]:
# SVClassifier = SVC(kernel= 'linear',
#                    degree=3,
#                    max_iter=10000,
#                    C=2, 
#                    random_state = 55)

# SVClassifier.fit(X_train,y_train)

8- Bernoulli Naive Bayes Model

In [None]:
# Bernoulli naive Bayes with smoothing alpha=0.1, fitted on the training split.
# NOTE(review): BernoulliNB models binary features; with its default
# binarization the continuous inputs are presumably thresholded before
# fitting -- confirm this is intended.
bernoulliNBModel = BernoulliNB(alpha=0.1)
bernoulliNBModel.fit(X_train,y_train)

9- Gaussian Naive Bayes Model

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training split.
gaussianNBModel = GaussianNB()
gaussianNBModel.fit(X_train,y_train)

10- XGBoost Model

In [None]:
# XGBoost classifier with default hyper-parameters, fitted on the training
# split. This is the model used for the final predictions further down.
XGB_Classifier = XGBClassifier()
XGB_Classifier.fit(X_train, y_train)

**Models evaluation**

In [None]:
# Evaluation details: for every fitted model print its score on the training
# split, its score on the hold-out split, and the F1 score of its hold-out
# predictions.
models = [
    logistic_regression,
    random_forest,
    decisionTreeModel,
    KNeighborsModel,
    bernoulliNBModel,
    gaussianNBModel,
    XGB_Classifier,
]

for model in models:
    model_name = type(model).__name__

    train_score = model.score(X_train, y_train)
    print(model_name, ' Train Score is   : ', train_score)

    test_score = model.score(X_test, y_test)
    print(model_name, ' Test Score is    : ', test_score)

    # y_pred is left holding the predictions of the last model in the list.
    y_pred = model.predict(X_test)
    f1_value = f1_score(y_test, y_pred)
    print(model_name, ' F1 Score is      : ', f1_value)
    print('--------------------------------------------------------------------------')

The XGBClassifier model is used for the final predictions.

Prediction

In [None]:
# Hold-out predictions of the chosen XGBoost model.
y_pred = XGB_Classifier.predict(X_test)

In [None]:
import seaborn as sn

# Cross-tabulate actual vs. predicted labels on the hold-out split.
# The table is stored as `confusion_table` so that it does not overwrite the
# `confusion_matrix` function imported from sklearn.metrics at the top.
confusion_table = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the integer counts in full instead of the default compact
# float format, which abbreviates large counts.
sn.heatmap(confusion_table, annot=True, fmt='d')
plt.title('XGBClassifier confusion matrix (hold-out split)')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# Accuracy as a percentage with two decimals. Scaling first and rounding last
# avoids float artefacts such as 83.78999999999999, and the builtin round()
# also works when accuracy_score returns a plain Python float (which has no
# .round method).
print(round(accuracy_score(y_test,y_pred)*100, 2),'\n')

# Raw confusion counts (rows: actual, columns: predicted).
print(pd.crosstab(y_test,y_pred),'\n')

# Per-class precision / recall / F1.
print(classification_report(y_test,y_pred),'\n')

In [None]:
# Shape of the hold-out feature matrix (X_test was rebound by train_test_split).
X_test.shape

In [None]:
# Shape of the competition test frame.
test.shape

In [None]:
# Normalize the competition test features with the same normalize() call used
# for the training features.
# Relies on `test` already having EventId as its index (set in place earlier
# through the X_test alias), so EventId is not passed in as a feature.
test_to_pred = normalize(test)

In [None]:
# Predicted class (encoded as 0/1) for every competition test row.
test_predict = XGB_Classifier.predict(test_to_pred)

In [None]:
# Move EventId from the index back to a regular column so it can be used in
# the submission. In place: running this cell a second time would reset the
# index again rather than being a no-op.
test.reset_index(inplace = True)
test.head()

In [None]:
# Start the submission from the EventId column of the test frame.
predict = test['EventId']

In [None]:
# Wrap the predictions in an (unnamed) Series so they can be concatenated.
test_predict = pd.Series(test_predict)

In [None]:
# Put EventId and the predictions side by side. The unnamed prediction Series
# becomes column 0, which the next cell renames to "Class".
predict = pd.concat([predict,test_predict], axis=1)

In [None]:
# Give the prediction column (currently named 0) its submission name.
predict.rename(columns={0: "Class"},inplace=True)

In [None]:
# Map the encoded predictions back to the submission labels: 1 -> 's', 0 -> 'b'.
# The replacement is limited to the Class column; replacing on the whole
# frame would also rewrite any EventId that happens to equal 0 or 1.
predict['Class'] = predict['Class'].replace({1: 's', 0: 'b'})

In [None]:
# Build RankOrder as a permutation of 1..N: the first argsort gives the
# sorting order of the Class labels, the second turns that order into ranks.
# Class holds the strings 'b' / 's' at this point, so rows labelled 'b' rank
# below rows labelled 's'.
# NOTE(review): the order inside each class comes from the sort's handling of
# ties, not from a model score -- confirm this is acceptable.
predict['RankOrder'] = predict['Class'].argsort().argsort() + 1 # +1 to start at 1

In [None]:
# Keep only the submission columns, in the order EventId, RankOrder, Class.
predict = predict[['EventId', 'RankOrder','Class']]

In [None]:
# Write the submission file to the working directory, without the row index.
predict.to_csv("submission.csv",index=False)

In [None]:
# Inspect the last 200 rows of the submission frame.
predict.tail(200)

In [None]:
# Sanity check: print the smallest and largest RankOrder value
# (RankOrder is built to start at 1).
print(predict.RankOrder.min())
print(predict.RankOrder.max())

In [None]:
# Distribution of the predicted classes in the submission.
# The Series is passed as `x=`: current seaborn versions take the first
# positional argument as `data`, so a bare Series is not read as the variable
# to count.
sb.countplot(x=predict['Class'])
plt.title('Predicted class counts')
plt.show()