#### Load Data

In [1]:
import pandas as pd

# Spambase from the UCI repository. The file has no header row, so
# header=None makes pandas assign integer column labels.
spambase_url = ('https://archive.ics.uci.edu/ml/machine-learning-databases'
                '/spambase/spambase.data')

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
# of `on_bad_lines`. Try the current spelling first and fall back for older
# pandas, where the unknown keyword raises TypeError at call time.
try:
    df = pd.read_csv(spambase_url, header=None, on_bad_lines='skip')
except TypeError:
    df = pd.read_csv(spambase_url, header=None, error_bad_lines=False)

#### Import Libraries

In [2]:
# Ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Import ML libraries to be used
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Import other necessary libraries

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

# Pipeline
from sklearn.pipeline import Pipeline

# Numpy
import numpy as np

# Plotting
import matplotlib.pyplot as plt

#### Split Data

In [5]:
# Check the loaded frame's dimensions as (rows, columns).
df.shape

(4601, 58)

In [6]:
# Preview the first five rows; columns carry integer labels because the
# file was read with header=None.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [7]:
# Every column except the last is a feature; the last column is the label.
data, labels = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Hold out a quarter of the rows for testing. The fixed seed keeps the
# split repeatable between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(data, labels,
                                     random_state=42, test_size=0.25)

In [8]:
import os

# np.savetxt does not create missing directories, so make sure ./data exists
# before writing; exist_ok keeps the cell safe to re-run.
os.makedirs('./data', exist_ok=True)

# Persist the held-out test split as comma-separated text.
np.savetxt('./data/Spambase_X_test.csv', X_test, delimiter=',')
np.savetxt('./data/Spambase_y_test.csv', y_test, delimiter=',')

In [9]:
import os

# np.savetxt does not create missing directories, so make sure ./data exists
# before writing; exist_ok keeps the cell safe to re-run.
os.makedirs('./data', exist_ok=True)

# Persist the training split as comma-separated text.
np.savetxt('./data/Spambase_X_train.csv', X_train, delimiter=',')
np.savetxt('./data/Spambase_y_train.csv', y_train, delimiter=',')

#### Decision Tree

In [21]:
# Decision-tree pipeline: standardise the features ('scl'), then fit a
# seeded DecisionTreeClassifier ('clf'). Criterion and depth are set by
# the grid search.
pipe_dt = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', DecisionTreeClassifier(random_state=42, splitter='best')),
])

In [11]:
%%time
scorer = make_scorer(roc_auc_score)

max_depth_range = [1, 3, 6, 9, 12, 15, 18, 21]

param_grid = [{'clf__criterion': ['gini', 'entropy'],
               'clf__max_depth': max_depth_range}]

gs = GridSearchCV(estimator=pipe_dt,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9194871249359065
{'clf__max_depth': 15, 'clf__criterion': 'entropy'}
Wall time: 3.44 s


#### Neural Network (MLP)

In [12]:
# MLP pipeline: standardise the features ('scl'), then train a seeded
# multi-layer perceptron ('clf') with the Adam solver, capped at 600
# iterations. Activation and hidden-layer size are set by the grid search.
pipe_mlp = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', MLPClassifier(solver='adam', max_iter=600, random_state=42)),
])

In [13]:
%%time
scorer = make_scorer(roc_auc_score)

hidden_layer_sizes_range = [10, 20, 40, 60, 80, 120, 140, 160]

param_grid = [{'clf__activation': ['tanh', 'relu'],
               'clf__hidden_layer_sizes': hidden_layer_sizes_range}]

gs = GridSearchCV(estimator=pipe_mlp,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9422682078441097
{'clf__activation': 'relu', 'clf__hidden_layer_sizes': 120}
Wall time: 2min 10s


#### Adaptive Boosting

In [22]:
# AdaBoost pipeline: standardise the features ('scl'), then boost
# decision trees with the SAMME algorithm ('clf').
# The base tree's criterion and depth are set by the grid search through
# the 'clf__base_estimator__*' parameters.
tree = DecisionTreeClassifier(splitter='best')

pipe_ada = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', AdaBoostClassifier(base_estimator=tree,
                               random_state=42,
                               algorithm='SAMME')),
])

In [23]:
%%time
scorer = make_scorer(roc_auc_score)

n_estimators_range = [50, 100, 150, 200, 300, 400, 500]
max_depth_range = [2, 3, 4, 5, 6, 7, 8]
learning_rate_range = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

param_grid = [{'clf__base_estimator__criterion': ['entropy'],
               'clf__base_estimator__max_depth': max_depth_range,
               'clf__n_estimators': n_estimators_range,
               'clf__learning_rate': learning_rate_range}]

gs = GridSearchCV(estimator=pipe_ada,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9540066514390906
{'clf__learning_rate': 0.8, 'clf__n_estimators': 300, 'clf__base_estimator__max_depth': 6, 'clf__base_estimator__criterion': 'entropy'}
Wall time: 1h 1min 16s


#### Support Vector Machine

In [16]:
# SVM pipeline: standardise the features ('scl'), then fit a seeded SVC
# ('clf') with a one-vs-rest decision function shape.
# No kernel is fixed here; the grid searches set 'clf__kernel' themselves.
pipe_svm = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', SVC(random_state=42, decision_function_shape='ovr')),
])

In [17]:
%%time
scorer = make_scorer(roc_auc_score)

c_gamma_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{'clf__C': c_gamma_range,
               'clf__gamma': c_gamma_range,
               'clf__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svm,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9273959041249727
{'clf__kernel': 'rbf', 'clf__gamma': 0.001, 'clf__C': 100.0}
Wall time: 1min 24s


In [18]:
%%time
scorer = make_scorer(roc_auc_score)

c_gamma_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{'clf__C': c_gamma_range,
               'clf__kernel': ['linear']}]

gs = GridSearchCV(estimator=pipe_svm,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9225870855301017
{'clf__kernel': 'linear', 'clf__C': 10.0}
Wall time: 51.5 s


#### K-Nearest Neighbors

In [19]:
# k-NN pipeline: standardise the features ('scl'), then classify by
# nearest neighbours ('clf') with p=2 and automatic choice of the
# neighbour-search algorithm. The neighbour count is set by the grid search.
pipe_knn = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', KNeighborsClassifier(p=2, algorithm='auto')),
])

In [20]:
%%time
scorer = make_scorer(roc_auc_score)

n_neighbors_range = [1, 5, 10, 15, 20, 40, 50, 100]

param_grid = [{'clf__weights': ['uniform'],
               'clf__n_neighbors': n_neighbors_range}]

gs = GridSearchCV(estimator=pipe_knn,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8971147259461891
{'clf__n_neighbors': 5, 'clf__weights': 'uniform'}
Wall time: 16.2 s
