In [1]:
import numpy as np
import pandas as pd

## Load the data

In [2]:
def _read_whitespace_table(path):
    """Read a headerless, whitespace-delimited text file into a NumPy array."""
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated and emits a FutureWarning on pandas >= 2.2.
    return pd.read_csv(path, header=None, sep=r'\s+').values


trainX = _read_whitespace_table('./train/X_train.txt')
trainy = _read_whitespace_table('./train/y_train.txt')
testX = _read_whitespace_table('./test/X_test.txt')
testy = _read_whitespace_table('./test/y_test.txt')
# DataFrame.values is always 2-D; take column 0 so the label arrays are 1-D.
trainy, testy = trainy[:, 0], testy[:, 0]
print(trainX.shape, trainy.shape, testX.shape, testy.shape)

(7352, 561) (7352,) (2947, 561) (2947,)


## Import models

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Non-linear model generators

In [4]:
def non_linear_models(models=None):
    """Add the baseline non-linear classifiers to ``models`` and return it.

    Parameters
    ----------
    models : dict, optional
        Mapping of display name -> unfitted estimator. It is extended in
        place. A new empty dict is created when the argument is omitted.

    Returns
    -------
    dict
        The same mapping with the 'K-NN', 'Decision Tree',
        'Support Vector Machine' and 'Naive Bayes' entries set.
    """
    # A `dict()` default is evaluated once at definition time and then shared
    # by every call, so entries added through one call would leak into the
    # next. Create the dict per call instead.
    if models is None:
        models = {}
    models['K-NN'] = KNeighborsClassifier(n_neighbors=7)
    models['Decision Tree'] = DecisionTreeClassifier()
    models['Support Vector Machine'] = SVC(gamma='auto')
    models['Naive Bayes'] = GaussianNB()
    return models

## Super learner (ensemble) models

In [5]:
def super_learner_models(models=None):
    """Add the ensemble classifiers to ``models`` and return it.

    Parameters
    ----------
    models : dict, optional
        Mapping of display name -> unfitted estimator. It is extended in
        place. A new empty dict is created when the argument is omitted.

    Returns
    -------
    dict
        The same mapping with the 'Bagging', 'Random Forest', 'Extra Trees'
        and 'Gradient Boost' entries set (each built with 100 estimators).
    """
    # Avoid a shared mutable default: a `dict()` in the signature would be
    # reused across calls and accumulate entries from earlier callers.
    if models is None:
        models = {}
    models['Bagging'] = BaggingClassifier(n_estimators=100)
    models['Random Forest'] = RandomForestClassifier(n_estimators=100)
    models['Extra Trees'] = ExtraTreesClassifier(n_estimators=100)
    models['Gradient Boost'] = GradientBoostingClassifier(n_estimators=100)
    return models

In [6]:
# Build the baseline models first, then let the ensemble builder add its
# entries to that same dict.
models = non_linear_models()
models = super_learner_models(models)
len(models)

8

## Prediction accuracy for a model

In [7]:
from sklearn.metrics import accuracy_score
def get_accuracy(model, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit ``model`` and return its test-set accuracy as a percentage.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_train, y_train : array-like, optional
        Training features and labels. Default to the notebook-level
        ``trainX`` / ``trainy``.
    X_test, y_test : array-like, optional
        Evaluation features and labels. Default to the notebook-level
        ``testX`` / ``testy``.

    Returns
    -------
    float
        ``accuracy_score(y_test, predictions) * 100.0``.
    """
    # Fall back to the notebook globals so existing `get_accuracy(model)`
    # calls behave exactly as before, while letting callers pass other data.
    if X_train is None:
        X_train = trainX
    if y_train is None:
        y_train = trainy
    if X_test is None:
        X_test = testX
    if y_test is None:
        y_test = testy

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred) * 100.0

## Record the execution time and accuracy for each model

In [8]:
from datetime import datetime, timedelta
from time import perf_counter

# Maps model name -> (accuracy in percent, elapsed fit + predict time).
accuracies = dict()
for name, model in models.items():
    print('getting accuracy for %s' % name)
    # perf_counter() is a monotonic, high-resolution clock. Wall-clock
    # datetime.now() can jump (NTP sync, DST) and distort the measurement.
    started = perf_counter()
    accuracy = get_accuracy(model)
    # Wrap in timedelta so the stored value and its printed H:MM:SS.ffffff
    # form stay the same as before.
    duration = timedelta(seconds=perf_counter() - started)
    accuracies[name] = (accuracy, duration)
    print('%s took %s' % (name, duration))

getting accuracy for K-NN
K-NN took 0:00:10.465225
getting accuracy for Decision Tree
Decision Tree took 0:00:03.107593
getting accuracy for Support Vector Machine
Support Vector Machine took 0:00:12.002902
getting accuracy for Naive Bayes
Naive Bayes took 0:00:00.105891
getting accuracy for Bagging
Bagging took 0:03:34.768426
getting accuracy for Random Forest
Random Forest took 0:00:08.017591
getting accuracy for Extra Trees
Extra Trees took 0:00:01.568931
getting accuracy for Gradient Boost
Gradient Boost took 0:02:04.597392


## Sort and print results

In [9]:
# Flatten each name -> (accuracy, duration) entry into a
# (name, accuracy, duration) row, ordered from highest to lowest accuracy.
sorted_accuracies = sorted(
    ((name, acc, elapsed) for name, (acc, elapsed) in accuracies.items()),
    key=lambda row: -row[1],
)

In [10]:
# One line per model: name, accuracy, then elapsed time in parentheses.
for row in sorted_accuracies:
    print('%s: %f (%s)' % row)

Extra Trees: 94.129623 (0:00:01.568931)
Support Vector Machine: 94.027825 (0:00:12.002902)
Gradient Boost: 93.892094 (0:02:04.597392)
Random Forest: 93.179505 (0:00:08.017591)
K-NN: 90.329148 (0:00:10.465225)
Bagging: 90.261283 (0:03:34.768426)
Decision Tree: 85.951815 (0:00:03.107593)
Naive Bayes: 77.027486 (0:00:00.105891)
