## Libraries

In [None]:
import numpy
import numpy as np
import scipy
import matplotlib
import pandas
import sklearn
import matplotlib.pyplot as plt
from pandas import read_csv
from pandas import set_option
from time import time
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn import manifold, metrics, preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score, learning_curve, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier


## Load Dataset

In [None]:
# Locations of the training and test CSV files.
train_file = '../input/higgs-boson-data/higgs_train_10k.csv'
test_file = '../input/higgs-boson-data/higgs_test_5k.csv'

# Column labels: the target column first, followed by the features x1 .. x28.
names = ['response'] + ['x%d' % position for position in range(1, 29)]

In [None]:
# Parse both CSV files with the explicit column labels, then report the
# (rows, columns) shape of each resulting frame.
train_csv, test_csv = (
    read_csv(csv_path, names=names) for csv_path in (train_file, test_file)
)
for frame in (train_csv, test_csv):
    print(frame.shape)

In [None]:
# Preview the first 10 rows of the training frame.
train_csv.head(10)

In [None]:
# Preview the first 10 rows of the test frame.
test_csv.head(10)

## Exploring The Data

### Data Types

In [None]:
# Per-column dtypes of the training frame.
types = train_csv.dtypes
print(types)

### Some statistics

In [None]:
# Widen the console output before printing the summary statistics.
set_option('display.width', 100)
# Use the fully-qualified option name. The bare 'precision' pattern also
# matches 'styler.format.precision' on newer pandas releases, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
set_option('display.precision', 5)
# count / mean / std / min / quartiles / max for every column.
description = train_csv.describe()
print(description)

### Class Distribution

In [None]:
# class distribution for train and test
train_data_class = train_csv.groupby('response').size()
print(train_data_class)
test_data_class = test_csv.groupby('response').size()
print(test_data_class)

### Correlation of Classes

In [None]:
# Pairwise Pearson correlation between all columns of the training frame.
correlations = train_csv.corr(method='pearson')

# Size the figure when it is created. Assigning rcParams['figure.figsize']
# after plt.figure() does not resize the existing figure (it only changes the
# default for figures created later), so the 25pt tick labels used to be
# drawn on a default-sized canvas.
fig = plt.figure(figsize=(40, 40))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# Take tick positions and labels from the matrix itself rather than a
# hard-coded 29 and the global `names`, so they always match what is plotted.
column_labels = list(correlations.columns)
ticks = numpy.arange(len(column_labels))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(column_labels, size=25)
ax.set_yticklabels(column_labels, size=25)
plt.show()

### Box Plots

In [None]:
# Default size for the figures created in this cell.
plt.rcParams['figure.figsize'] = (12,12)

# One box plot per column on a 6x6 grid with independent axes: first the
# training frame, then the test frame.
for frame in (train_csv, test_csv):
    frame.plot(kind='box', subplots=True, layout=(6,6), sharex=False, sharey=False)
    plt.show()

## Train & Test Data

In [None]:
# Split each frame into a feature matrix and a target vector.
# Column 0 is the response; every remaining column (x1 .. x28) is a feature.
# The previous slice [:, 1:28] stopped one column short and silently dropped
# x28 from both the training and the test features.
train_arr = train_csv.values
X_train = train_arr[:, 1:]
Y_train = train_arr[:, 0]
test_arr = test_csv.values
X_test = test_arr[:, 1:]
Y_test = test_arr[:, 0]

## Manifold Visualization

In [None]:
def _show_embedding(embedding, title, color):
    """Scatter a 2-D embedding on its own figure, colored by `color`.

    A fresh figure/axes pair is created for every plot so the tick
    formatters are applied to the axes that actually hold the scatter
    (previously they were set on a subplot of an unrelated, older figure).
    """
    _, axes = plt.subplots()
    axes.scatter(embedding[:, 0], embedding[:, 1], c=color,
                 cmap=plt.cm.Spectral)
    axes.set_title(title)
    # Embedding coordinates carry no units, so hide the tick labels.
    axes.xaxis.set_major_formatter(NullFormatter())
    axes.yaxis.set_major_formatter(NullFormatter())
    axes.axis('tight')
    plt.show()


# Locally-linear-embedding variants and the plot title used for each.
methods = ['standard', 'ltsa', 'hessian', 'modified']
labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE']

n_neighbors = 10
n_components = 2
color = Y_train

# The loop used to ignore `method` and fit the same Isomap four times while
# titling the plots LLE / LTSA / Hessian LLE / Modified LLE. Fit the variant
# that each title names. Estimator arguments are passed by keyword because
# current scikit-learn rejects them positionally.
for method, label in zip(methods, labels):
    lle = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                          n_components=n_components,
                                          eigen_solver='auto',
                                          method=method)
    Ytransformed = lle.fit_transform(X_train)
    _show_embedding(Ytransformed, label, color)

# Isomap (what the loop above actually computed before), plotted once under
# its own name.
t0 = time()
isomap = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
Ytransformed = isomap.fit_transform(X_train)
t1 = time()
_show_embedding(Ytransformed, "Isomap (%.2g sec)" % (t1 - t0), color)

# Multi-dimensional scaling.
t0 = time()
mds = manifold.MDS(n_components=n_components, max_iter=100, n_init=1)
Ytransformed = mds.fit_transform(X_train)
t1 = time()
_show_embedding(Ytransformed, "MDS (%.2g sec)" % (t1 - t0), color)

# Spectral embedding.
t0 = time()
se = manifold.SpectralEmbedding(n_components=n_components,
                                n_neighbors=n_neighbors)
Ytransformed = se.fit_transform(X_train)
t1 = time()
_show_embedding(Ytransformed, "SpectralEmbedding (%.2g sec)" % (t1 - t0), color)

# t-SNE with PCA initialisation and a fixed seed.
t0 = time()
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
Ytransformed = tsne.fit_transform(X_train)
t1 = time()
_show_embedding(Ytransformed, "t-SNE (%.2g sec)" % (t1 - t0), color)

## PCA Feature Transformation

In [None]:

# Set the default size before any figure is created; assigning it after the
# scatter call (as before) left the first plot at the previous default.
plt.rcParams['figure.figsize'] = (8, 8)

# Project the training features onto the first two principal components.
# fit_transform already fits the model, so the separate fit() call that used
# to precede it only repeated the decomposition.
pca = PCA(n_components=2)
projected = pca.fit_transform(X_train)

plt.scatter(projected[:, 0], projected[:, 1],
            c=Y_train, edgecolor='none', alpha=0.5)
plt.xlabel('PCA component 1')
plt.ylabel('PCA component 2')
plt.colorbar()
plt.show()

# Refit with 25 components to see how much variance is retained as
# components are added.
pca = PCA(n_components=25)
fit = pca.fit(X_train)
plt.plot(numpy.cumsum(fit.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)

## Model Creation, Tuning Hyperparameters and Validation using Train Data

In [None]:
# using roc AUC as scoring
# Metric used for every cross-validation run in this cell. (The previous
# comment said ROC AUC, but the value has always been accuracy.)
scoring = 'accuracy'


def _run_grid_search(estimator, param_grid):
    """Fit a GridSearchCV for `estimator` on the training data and return it.

    Uses the module-level `scoring`, `X_train` and `Y_train`.
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=scoring)
    search.fit(X_train, Y_train)
    return search


# Naive Bayes: nothing to tune, so report the mean 3-fold CV score.
naiveBayes = GaussianNB()
nbscore = cross_val_score(naiveBayes, X_train, Y_train, cv=3, scoring=scoring)
print('Naive Bayes CV score =', np.mean(nbscore))


# Logistic regression: search over C and the iteration cap (penalty fixed to l2).
penalties = numpy.array(['l2'])
c_values = numpy.array([1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
iters = numpy.array([100, 150])
LR_param_grid = {'penalty': penalties, 'C': c_values, 'max_iter': iters}

gridLogisticRegression = LogisticRegression()
grid = _run_grid_search(gridLogisticRegression, LR_param_grid)
print('LR CVScore ', grid.best_score_)
print('LR Penalty', grid.best_estimator_.penalty)
print('LR C', grid.best_estimator_.C)
print('LR Max Iterations', grid.best_estimator_.max_iter)


# Perceptron: search over alpha and the iteration cap.
# NOTE(review): alpha multiplies the regularization term, and Perceptron is
# constructed here without a penalty - presumably alpha has no effect on the
# fitted model in that case; confirm and add a penalty if tuning is intended.
alphas = numpy.array([0.001, 0.0001, 0.00001, 0.000001])
pereptorn_param_grid = {'alpha': alphas, 'max_iter': iters}
grid = _run_grid_search(Perceptron(), pereptorn_param_grid)
print('Perceptron CVScore ', grid.best_score_)
print('Perceptron alpha', grid.best_estimator_.alpha)
print('Perceptron Max Iterations', grid.best_estimator_.max_iter)

# LDA: search over tol. The candidate list used to contain 0.001 twice, which
# only repeated an identical cross-validation run; the duplicate is removed.
tols = numpy.array([0.001, 0.00001])
lda_param_grid = {'tol': tols}
grid = _run_grid_search(LinearDiscriminantAnalysis(), lda_param_grid)
print('LDA CVScore ', grid.best_score_)
print('LDA tol', grid.best_estimator_.tol)

## SVM Grid Search

In [None]:
# gamma parameter in SVM
gammas = numpy.array([1, 0.1, 0.01, 0.001])
# C for logistic regression
c_values = numpy.array([100, 1, 0.1, 0.01])
svm_param_grid = {'gamma': gammas, 'C': c_values}
svm = SVC(kernel='rbf')
scoring = 'accuracy'
grid = GridSearchCV(estimator=svm, param_grid=svm_param_grid, scoring=scoring)
grid.fit(X_train, Y_train)
print(grid.best_score_)
print(grid.best_estimator_.gamma)
print(grid.best_estimator_.C)

## Pipeline with Feature Reduction Selection, Logistic Regression using Grid Search

In [None]:
# Rescale every feature with a MinMaxScaler fitted on the training matrix
# (chi2 scoring requires non-negative inputs).
min_max_scaler = preprocessing.MinMaxScaler()
scaler = min_max_scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Two-step pipeline; the 'reduce_dim' step is swapped out by the grid below.
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', LogisticRegression())
])

N_FEATURES_OPTIONS = [10, 15, 20]
C_OPTIONS = [0.001, 0.1, 1, 10, 100, 1000]
max_iter_OPTIONS = [100, 150]
# One sub-grid per reducer, in the same order as `reducer_labels`.
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=10)],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS,
        'classify__max_iter': max_iter_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS,
        'classify__max_iter': max_iter_OPTIONS
    },
]
reducer_labels = ['PCA', 'KBest(chi2)']

grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
grid.fit(X_train_scaled, Y_train)

mean_scores = np.array(grid.cv_results_['mean_test_score'])
# Candidates are enumerated sub-grid by sub-grid (one per reducer). Inside a
# sub-grid the keys are sorted alphabetically and the last key varies fastest,
# so the flat score vector has the layout
#   (reducer, classify__C, classify__max_iter, n_features).
# The previous reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS)) treated C
# as the outermost axis and ignored max_iter, so the plotted bars mixed scores
# from different reducers and settings.
mean_scores = mean_scores.reshape(
    len(reducer_labels),
    len(C_OPTIONS),
    len(max_iter_OPTIONS),
    len(N_FEATURES_OPTIONS))
# Best score per (reducer, n_features) over every C / max_iter combination.
mean_scores = mean_scores.max(axis=(1, 2))
# Left edge of each group of bars (one group per feature count).
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)

plt.figure()
COLORS = ['tomato', 'darkolivegreen', 'lightsteelblue']
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])

plt.title("Comparing feature reduction techniques")
plt.xlabel('Reduced number of features')
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
plt.ylabel('Classification accuracy')
plt.ylim((0, 1))
plt.legend(loc='upper left')
plt.show()

## Learning Curves on Training and Validation Data

In [None]:
def plot_learning_curve(estimator, name, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation learning curves on a new figure.

    Args:
        estimator: model handed to ``learning_curve``.
        name: text appended to the figure title.
        X: feature matrix.
        y: target vector.
        ylim: optional ``(low, high)`` pair applied to the y axis.
        cv: forwarded to ``learning_curve``.
        n_jobs: forwarded to ``learning_curve``.
        train_sizes: forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can show the figure.
    """
    plt.figure()
    plt.title('Learning Curves for ' + name)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("No. Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    # (mean, std, colour, legend label) per curve, averaged over the CV folds.
    curves = (
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(val_scores, axis=1), np.std(val_scores, axis=1),
         "b", "Cross-validation score"),
    )

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for centre, spread, shade, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=shade)
    for centre, _, shade, caption in curves:
        plt.plot(sizes, centre, 'o-', color=shade, label=caption)

    plt.legend(loc="best")
    return plt


# Set the default figure size before any figure is created. It used to be
# assigned after plot_learning_curve() had already opened its figure, so the
# first curve was drawn at whatever size an earlier cell had left behind.
plt.rcParams['figure.figsize'] = (7, 7)

# Learning curves for the tuned logistic regression and the tuned SVM.
for estimator, title in (
        (LogisticRegression(C=0.1, penalty='l2', max_iter=100),
         'Tuned Logistic Regression'),
        (SVC(C=100, gamma=0.01, kernel='rbf'),
         'Tuned SVM')):
    plot_learning_curve(estimator, title, X_train, Y_train)
    plt.show()

## Model Selection on Whole Dataset 

In [None]:

def train_and_test(clf):
    """Fit `clf` on the training split and evaluate it on the test split.

    Reads the module-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test``. Prints the estimator, both timings, the accuracy and a
    classification report.

    Returns:
        Tuple ``(description, accuracy, train_seconds, test_seconds)`` where
        the description is the text of ``str(clf)`` before its first ``(``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    # Time the fit.
    started = time()
    clf.fit(X_train, Y_train)
    train_time = time() - started
    print("train time: {:0.3f}s".format(train_time))

    # Time the prediction on the held-out data.
    started = time()
    pred = clf.predict(X_test)
    test_time = time() - started
    print("test time:  {:0.3f}s".format(test_time))

    score = metrics.accuracy_score(Y_test, pred)
    print("accuracy:   {:0.3f}".format(score))
    print("classification report:")
    print(metrics.classification_report(Y_test, pred))
    print()

    return str(clf).split('(')[0], score, train_time, test_time


# (estimator, display name) pairs, each built with its tuned hyperparameters.
# The first label used to be misspelled "Logistic Regressin".
classifiers = (
    (LogisticRegression(C=0.1, penalty='l2', max_iter=100),
     "Logistic Regression"),
    (Perceptron(alpha=0.001, max_iter=100), "Perceptron"),
    (LinearDiscriminantAnalysis(tol=0.001), "LDA"),
    (GaussianNB(), "Naive Bayes"),
    (SVC(C=100, gamma=0.01, kernel='rbf'), "SVM"),
)

# Train every model on the training split and score it on the test split.
results = []
for classifier, name in classifiers:
    print('=' * 80)
    print(name)
    results.append(train_and_test(classifier))


indices = np.arange(len(results))
# Transpose the per-model tuples into four parallel lists:
# descriptions, accuracies, train times, test times.
results = [[row[field] for row in results] for field in range(4)]

## Plots of ROC Curves

In [None]:
# Fit the tuned logistic regression and take the predicted probability of
# the second class (column 1 of predict_proba) as the ROC score.
lr = LogisticRegression(C=0.1, penalty='l2', max_iter=150)
lr.fit(X_train, Y_train)
lrpreds = lr.predict_proba(X_test)[:, 1]
lr_fpr, lr_tpr, _ = metrics.roc_curve(Y_test, lrpreds)

# Set the size before the figure is created so it applies to this plot.
plt.rcParams['figure.figsize'] = (5, 5)
plt.figure()
lw = 2
# Both curves carry a label: plt.legend() used to be called with no labelled
# artists, which produced an empty legend and a warning.
plt.plot(lr_fpr, lr_tpr, color='darkorange',
         lw=lw, label='Logistic Regression')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--',
         label='Chance')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

## Other Models & Results

In [None]:
# Non-linear and ensemble models to compare. n_estimators is passed by
# keyword: current scikit-learn makes it keyword-only for AdaBoostClassifier,
# so the positional form raises TypeError there.
models = [
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('Bag', BaggingClassifier(DecisionTreeClassifier(), n_estimators=100,
                              random_state=7)),
    ('RF', RandomForestClassifier(n_estimators=100, max_features=5)),
    ('Bo', AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=100,
                              random_state=7)),
]
# Soft-voting ensemble of logistic regression and Gaussian naive Bayes.
estimators = [
    ('logistic', LogisticRegression()),
    ('NB', GaussianNB()),
]
models.append(('ELE', VotingClassifier(estimators, voting='soft')))


# evaluate each model in turn
results = []
# NOTE(review): this rebinds the module-level `names` that earlier cells use
# as the CSV column labels; re-running those cells afterwards will see the
# model names instead.
names = []
scoring = 'accuracy'
# replace with 'roc_auc', 'neg_log_loss',.. based on the need
# A single seeded splitter is reused for every model (it was previously
# rebuilt with the same arguments on each iteration).
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


In [None]:
# Box plot of the per-fold CV scores, one box per evaluated model.
fig, ax = plt.subplots()
fig.suptitle('Linear and Non-Linear Algorithm Comparison on Cross-Validation')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()