#### Load Data

In [1]:
import pandas as pd

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases'
                 '/magic/magic04.data', header=None, error_bad_lines=False)

#### Import Libraries

In [2]:
# Ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Import ML libraries to be used
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Import other necessary libraries

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

# Pipeline
from sklearn.pipeline import Pipeline

# Numpy
import numpy as np

# Plotting
import matplotlib.pyplot as plt

#### Split Data

In [5]:
df.shape

(19020, 11)

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [7]:
# Preprocess and split dataset
data = df.iloc[:, 0:-1].values
labels = df.iloc[:, -1].values

# Convert labels from ['h', 'g'] to [1, 0]
le = LabelEncoder()
labels = le.fit_transform(labels)

X_train, X_test, y_train, y_test = \
    train_test_split(data, labels, test_size=0.25, random_state=42)

In [8]:
np.savetxt('./data/Gamma_Telescope_X_test.csv', X_test, delimiter=',')
np.savetxt('./data/Gamma_Telescope_y_test.csv', y_test, delimiter=',')

In [9]:
np.savetxt('./data/Gamma_Telescope_X_train.csv', X_train, delimiter=',')
np.savetxt('./data/Gamma_Telescope_y_train.csv', y_train, delimiter=',')

#### Decision Tree

In [10]:
# Pipeline for decision tree classifier
pipe_dt = Pipeline([('scl', StandardScaler()),
                    ('clf', DecisionTreeClassifier(splitter='best', 
                                                   random_state=42))])

In [11]:
%%time
scorer = make_scorer(roc_auc_score)

max_depth_range = [1, 3, 6, 9, 12, 15, 18, 21]

param_grid = [{'clf__criterion': ['gini', 'entropy'],
               'clf__max_depth': max_depth_range}]

gs = GridSearchCV(estimator=pipe_dt,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8162961168185047
{'clf__criterion': 'gini', 'clf__max_depth': 12}
Wall time: 8.03 s


#### Neural Network (MLP)

In [12]:
# Pipeline for multi-layer perceptron classifier
pipe_mlp = Pipeline([('scl', StandardScaler()),
                     ('clf', MLPClassifier(max_iter=600, 
                                           solver='adam',
                                           random_state=42))])

In [13]:
%%time
scorer = make_scorer(roc_auc_score)

hidden_layer_sizes_range = [10, 20, 40, 60, 80, 120, 140, 160]

param_grid = [{'clf__activation': ['tanh', 'relu'],
               'clf__hidden_layer_sizes': hidden_layer_sizes_range}]

gs = GridSearchCV(estimator=pipe_mlp,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8438402145118563
{'clf__hidden_layer_sizes': 160, 'clf__activation': 'relu'}
Wall time: 2min 34s


#### Adaptive Boosting

In [14]:
# Pipeline for adaptive boosting random forest classifier
tree = DecisionTreeClassifier(splitter='best')

pipe_ada = Pipeline([('scl', StandardScaler()),
                     ('clf', AdaBoostClassifier(base_estimator=tree,
                                                algorithm='SAMME',
                                                random_state=42))])

In [15]:
%%time
scorer = make_scorer(roc_auc_score)

n_estimators_range = [50, 100, 150, 200, 300, 400, 500]
max_depth_range = [2, 3, 4, 5, 6, 7, 8]
learning_rate_range = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

param_grid = [{'clf__base_estimator__criterion': ['gini'],
               'clf__base_estimator__max_depth': max_depth_range,
               'clf__n_estimators': n_estimators_range,
               'clf__learning_rate': learning_rate_range}]

gs = GridSearchCV(estimator=pipe_ada,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8586765846094205
{'clf__base_estimator__max_depth': 8, 'clf__n_estimators': 500, 'clf__learning_rate': 0.5, 'clf__base_estimator__criterion': 'gini'}
Wall time: 2h 41min 4s


#### Support Vector Machine

In [16]:
# Pipeline for support vector machine classifier (using linear kernel)
pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(decision_function_shape='ovr', 
                                 random_state=42))])

In [17]:
%%time
scorer = make_scorer(roc_auc_score)

c_gamma_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{'clf__C': c_gamma_range,
               'clf__gamma': c_gamma_range,
               'clf__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svm,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8402561865994702
{'clf__C': 100.0, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}
Wall time: 15min 35s


In [18]:
%%time
scorer = make_scorer(roc_auc_score)

c_gamma_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{'clf__C': c_gamma_range,
               'clf__kernel': ['linear']}]

gs = GridSearchCV(estimator=pipe_svm,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.7464162951476384
{'clf__C': 10.0, 'clf__kernel': 'linear'}
Wall time: 3min 29s


#### K-Nearest Neighbors

In [19]:
# Pipeline for k nearest neighbors classifier
pipe_knn = Pipeline([('scl', StandardScaler()),
                     ('clf', KNeighborsClassifier(algorithm='auto', 
                                                  p=2))])

In [20]:
%%time
scorer = make_scorer(roc_auc_score)

n_neighbors_range = [1, 5, 10, 15, 20, 40, 50, 100]

param_grid = [{'clf__weights': ['uniform'],
               'clf__n_neighbors': n_neighbors_range}]

gs = GridSearchCV(estimator=pipe_knn,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.7936891839503781
{'clf__weights': 'uniform', 'clf__n_neighbors': 5}
Wall time: 24.5 s
