# Flavours of Physics: Finding τ → μμμ (Kernels Only) with EDA & ML (SVM+KNN+GNB+XGB)

<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/10014/logos/header.png?t=2018-06-20-19-58-34" width="1000px">


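### Best scores obtained
* Accuracy XGBClassifier: 0.9999227202472952
* Precision XGBClassifier: 0.9998466492869191
* Recall XGBClassifier: 1.0
* F1 Score XGBClassifier: 0.9999233187638984

**Caveat:** these figures were measured on a hold-out split drawn from the same rows the XGBoost model was fitted on, so they overstate the performance to expect on unseen data.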





### Data Description
In this competition, you are given a list of collision events and their properties. You will then predict whether a τ → 3μ decay happened in each collision. The τ → 3μ decay is currently assumed by scientists not to occur, and the goal of this competition is to find out whether it happens more frequently than current theory can explain.


It is challenging to design a machine learning problem for something you have never observed before. Scientists at CERN developed the following designs to achieve the goal.


* training.csv
This is a labelled dataset (the label ‘signal’ being ‘1’ for signal events, ‘0’ for background events) to train the classifier. Signal events have been simulated, while background events are real data.

This real data is collected by the LHCb detectors observing collisions of accelerated particles with a specific mass range in which τ → 3μ can’t happen. We call these events “background” and label them 0.

* test.csv
The test dataset has all the columns that training.csv has, except mass, production, min_ANNmuon, and signal. 

Dataset link:

[Kaggle competition data page](https://www.kaggle.com/c/flavours-of-physics-kernels-only/data)


In [None]:
!pip install sweetviz

In [None]:
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the labelled training set and the unlabelled competition test set
# straight from the zipped CSV files in the Kaggle input directory.
df_train = pd.read_csv("../input/flavours-of-physics-kernels-only/training.csv.zip")
df_test = pd.read_csv("../input/flavours-of-physics-kernels-only/test.csv.zip")

In [None]:
# Show the training frame as loaded (all columns, before any selection).
df_train

In [None]:
# Show the test frame as loaded.
df_test

In [None]:
# Restrict the training frame to the eight features used in this notebook
# plus the 'signal' label; every other column is dropped from here on.
train_columns = [
    'IP', 'IPSig', 'isolatione', 'iso',
    'ISO_SumBDT', 'p0_IsoBDT', 'p1_IsoBDT', 'SPDhits',
    'signal',
]
df_train = df_train[train_columns]

In [None]:
# Feature-only view of the test set: the same eight columns, in the same
# order, as the training selection (the test file has no 'signal' label).
test_columns = [
    'IP', 'IPSig', 'isolatione', 'iso',
    'ISO_SumBDT', 'p0_IsoBDT', 'p1_IsoBDT', 'SPDhits',
]
df2 = df_test[test_columns]

In [None]:
# Show the training frame after the column selection.
df_train

In [None]:
# Show the selected test features.
df2

In [None]:
# Column dtypes and non-null counts for the reduced training frame.
df_train.info()

In [None]:
# Column dtypes and non-null counts for the full (unreduced) test frame.
df_test.info()

In [None]:
# Number of rows per label value (0 = background, 1 = signal).
df_train["signal"].value_counts()

In [None]:
import sweetviz as sv
# Profile the reduced training frame with sweetviz.
Tr_report1 = sv.analyze(df_train)
# Render the report inside the notebook output ...
Tr_report1.show_notebook(w="90%", h="full")
# ... and also export it to a standalone HTML file.
Tr_report1.show_html('Tr_report1.html')

In [None]:
# Profile the selected test features with sweetviz.
# A separate variable and output file are used so that the training report
# (Tr_report1 / 'Tr_report1.html') is not overwritten by the test report.
Te_report1 = sv.analyze(df2)
Te_report1.show_notebook(w="90%", h="full")
Te_report1.show_html('Te_report1.html')

In [None]:
# Show the selected test features again.
df2

In [None]:
# Distribution of the 'IP' feature (red histogram); the trailing ';' suppresses the text echo of the return value.
sns.distplot(df_train['IP'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Distribution of the 'IPSig' feature (red histogram).
sns.distplot(df_train['IPSig'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Distribution of the 'isolatione' feature (red histogram).
sns.distplot(df_train['isolatione'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Distribution of the 'iso' feature (red histogram).
sns.distplot(df_train['iso'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Distribution of the 'ISO_SumBDT' feature (red histogram).
sns.distplot(df_train['ISO_SumBDT'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Distribution of the 'p0_IsoBDT' feature (red histogram).
sns.distplot(df_train['p0_IsoBDT'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Distribution of the 'p0_IsoBDT' feature (red histogram).
# NOTE(review): this repeats the previous cell exactly; every selected feature
# already has its own plot, so this cell looks redundant.
sns.distplot(df_train['p0_IsoBDT'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Distribution of the 'p1_IsoBDT' feature (red histogram).
sns.distplot(df_train['p1_IsoBDT'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Distribution of the 'SPDhits' feature (red histogram).
sns.distplot(df_train['SPDhits'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "r"});

In [None]:
# Draw every column of the training frame as its own red line subplot,
# stacked in one tall figure.
df_train.plot(figsize = (11, 25), subplots = True, linewidth = 0.8, color = "r")
# Blank out the x-axis label of the current axes.
plt.xlabel('')
plt.show()

In [None]:
# Class balance of the label: absolute counts, percentages, then a bar chart.
# A notebook cell only echoes its final expression, so the two tables are
# passed to display() explicitly; otherwise they are computed and discarded.
display(df_train['signal'].value_counts())

display(df_train['signal'].value_counts() * 100 / len(df_train))


sns.countplot(x='signal', data=df_train, palette='viridis')

In [None]:
from sklearn.utils import resample

# Split the frame by label. The fraud / not_fraud names do not describe this
# dataset: `not_fraud` holds the signal == 0 rows and `fraud` the signal == 1 rows.
not_fraud = df_train[df_train['signal']== 0]
fraud = df_train[df_train['signal'] == 1]

#upsample minority
# NOTE(review): this draws len(not_fraud) rows from the signal == 1 class with
# replacement. Nothing here checks that signal == 1 is actually the smaller
# class - confirm against the value_counts() output; if it is the larger one,
# this step shrinks it (with duplicates) instead of up-sampling.
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

#combine majority and upsampled minority
# NOTE(review): df_train is overwritten with the resampled data, and the
# train/test split is taken from it later. Rows duplicated by the sampling
# with replacement can therefore land on both sides of that split.
df_train = pd.concat([not_fraud, fraud_upsampled])

# check new class counts
df_train['signal'].value_counts()

In [None]:
# Bar chart of the label counts after the resampling step.
sns.countplot(x='signal', data=df_train, palette='viridis')

In [None]:
# Number of missing values per column.
df_train.isnull().sum()

In [None]:
# Summary statistics for every column of the training frame.
df_train.describe()

In [None]:
# Correlation of every column with the 'signal' label, strongest positive first.
df_train.corr()['signal'].sort_values(ascending=False)

In [None]:
# Bar chart of each feature's correlation with the 'signal' label.
z = df_train.drop('signal', axis=1)
z.corrwith(df_train['signal']).plot(kind='bar', figsize=(15,10), color=['g'])
# The chart shows one bar per feature, not a matrix, so the title says so.
plt.title("Correlation of each feature with signal")
plt.xticks(size=15)
plt.yticks(size=15)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
# (label included), with values rounded to one decimal place.
corr_matrix = df_train.corr()
plt.figure(figsize=(15,10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cbar=False,
    fmt='.1f',
    cmap='summer',
)
plt.show()

In [None]:
# One box per column of the training frame, all on a shared axis.
plt.figure(figsize=(15,10))
sns.boxplot(data=df_train)
# Turn the column names sideways so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Show the training frame after resampling.
df_train

In [None]:
# Box plot of the 'ISO_SumBDT' feature.
sns.boxplot(x=df_train["ISO_SumBDT"], data=df_train)
# tight_layout() only adjusts axes that already exist, so it must come after
# the plotting call rather than before it.
plt.tight_layout()
plt.show()

In [None]:
# Box plot of the 'p0_IsoBDT' feature.
sns.boxplot(x=df_train["p0_IsoBDT"], data=df_train)
# tight_layout() only adjusts axes that already exist, so it must come after
# the plotting call rather than before it.
plt.tight_layout()
plt.show()

In [None]:
# Box plot of the 'SPDhits' feature.
sns.boxplot(x=df_train["SPDhits"], data=df_train)
# tight_layout() only adjusts axes that already exist, so it must come after
# the plotting call rather than before it.
plt.tight_layout()
plt.show()

In [None]:
# Box plot of the 'IPSig' feature.
sns.boxplot(x=df_train["IPSig"], data=df_train)
# tight_layout() only adjusts axes that already exist, so it must come after
# the plotting call rather than before it.
plt.tight_layout()
plt.show()

In [None]:
# Skewness of every column of the training frame.
display(df_train.skew())

In [None]:
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the label.
x1 = df_train.drop('signal', axis=1)
# Target vector: the 'signal' label column.
y = df_train['signal']

In [None]:
# Show the feature matrix.
x1

In [None]:
# Show the target vector.
y

In [None]:
from imblearn.over_sampling import SMOTE

# Feature matrix and label vector taken from the (already resampled) frame.
x1 = df_train.drop('signal', axis=1)
y = df_train['signal']

# setting up testing and training sets
x_train, x_test, y_train, y_test = train_test_split(x1, y, test_size=0.25, random_state=27)

sm = SMOTE(random_state=27)

# Oversample the training split only. Fitting SMOTE on the full (x1, y) would
# synthesise points from the held-out rows and leave the split that the models
# are trained on unbalanced; x_test / y_test must stay untouched so the
# evaluation reflects unseen data.
x_train, y_train = sm.fit_resample(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report, roc_curve, roc_auc_score, auc, precision_recall_curve, precision_score, recall_score


In [None]:
from sklearn.svm import SVC
# Support-vector classifier with library-default settings (only the seed is
# fixed), fitted on the training split.
svm= SVC(random_state=2)
svm.fit(x_train, y_train)
# Predicted labels for the held-out split.
y_pred_svm = svm.predict(x_test)

In [None]:
# Recomputes the same held-out predictions that the fitting cell already produced.
y_pred_svm = svm.predict(x_test)

In [None]:
# Report the four hold-out metrics for the SVC, one line per metric.
svc_scores = (
    ("Accuracy SVC:", accuracy_score),
    ("Precision SVC:", precision_score),
    ("Recall SVC:", recall_score),
    ("F1 Score SVC:", f1_score),
)
for caption, scorer in svc_scores:
    print(caption, scorer(y_test, y_pred_svm))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 100 closest training rows
# (Euclidean distance, uniform weights).
KNN = KNeighborsClassifier(algorithm='auto', leaf_size=300, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=100, p=2,
                     weights='uniform')
# Fit on the training split only: x_test is a subset of x1, so fitting on
# (x1, y) would let the model see the very rows it is scored on.
KNN.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out split from the KNN model.
y_pred_KNN = KNN.predict(x_test)

In [None]:
# Report the four hold-out metrics for the KNN model, one line per metric.
knn_scores = (
    ("Accuracy KNeighborsClassifier:", accuracy_score),
    ("Precision KNeighborsClassifier:", precision_score),
    ("Recall KNeighborsClassifier:", recall_score),
    ("F1 Score KNeighborsClassifier:", f1_score),
)
for caption, scorer in knn_scores:
    print(caption, scorer(y_test, y_pred_KNN))

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with class priors estimated from the data, fitted on
# the training split.
GNB = GaussianNB(priors=None, var_smoothing=1e-08)
GNB.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out split from the naive Bayes model.
y_pred_GNB = GNB.predict(x_test)

In [None]:
# Report the four hold-out metrics for the naive Bayes model, one line per metric.
gnb_scores = (
    ("Accuracy GaussianNB:", accuracy_score),
    ("Precision GaussianNB:", precision_score),
    ("Recall GaussianNB:", recall_score),
    ("F1 Score GaussianNB:", f1_score),
)
for caption, scorer in gnb_scores:
    print(caption, scorer(y_test, y_pred_GNB))

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted tree classifier: 1000 trees of depth up to 40, learning
# rate 0.3, L1 penalty 2 and L2 penalty 1; the remaining arguments are spelled
# out explicitly.
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=40,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=1000, n_jobs=100, num_parallel_tree=1, random_state=0,
              reg_alpha=2, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
# Fit on the training split only: x_test is a subset of x1, so fitting on
# (x1, y) makes the hold-out metrics measure memorisation, not generalisation.
xgb.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out split from the XGBoost model.
y_pred_xgb = xgb.predict(x_test)


In [None]:
# Report the four hold-out metrics for the XGBoost model, one line per metric.
xgb_scores = (
    ("Accuracy XGBClassifier:", accuracy_score),
    ("Precision XGBClassifier:", precision_score),
    ("Recall XGBClassifier:", recall_score),
    ("F1 Score XGBClassifier:", f1_score),
)
for caption, scorer in xgb_scores:
    print(caption, scorer(y_test, y_pred_xgb))

In [None]:
y_pred_df_tset = xgb.predict(x1)
y_pred_df_tset

In [None]:
sub = pd.read_csv('../input/flavours-of-physics-kernels-only/sample_submission.csv.zip')
sub[:51758]

In [None]:
sub[:51758]['prediction'] = y_pred_df_tset
sub[:51758].to_csv('submission.csv', index=False)