# Initialization


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
# Load the MNIST training CSV straight from its zip archive.
# NOTE(review): nrows=1000 caps the read at the first 1000 data rows, so the
# shape displayed by this cell should have 1000 rows; the recorded
# (60000, 785) output presumably comes from an earlier run without the cap.
train = pd.read_csv('../data/mnist-in-csv/mnist_train.csv.zip', compression='zip', nrows=1000)
train.shape

(60000, 785)

In [4]:
# Preview five random rows; the fixed seed keeps the preview identical on every re-run.
train.sample(5, random_state=0)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
36551,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58771,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48966,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18788,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42983,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Feature selection

In [5]:
# Separate inputs from the target: every column except `label` goes into the
# feature frame X, and the `label` column becomes the target series y.
X = train.drop(columns='label')
y = train['label']

# Define models

# NOTE(review): this snippet has no execution prompt and looks like a leftover
# regression setup. `linear_model` and `XGBRegressor` are not imported by the
# import cell at the top of the notebook, so it would raise NameError if run,
# and the `models` list built here is reassigned by the classifier cell that
# follows. Confirm whether it should be kept.
models = []
models.append(('LR', linear_model.LinearRegression()))
models.append(('BR', linear_model.BayesianRidge(n_iter=1000)))
models.append(('Huber', linear_model.HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, 
                                                    max_iter=100, tol=1e-05, warm_start=False)))
models.append(('LarsCV', linear_model.LarsCV(max_iter=1000, eps=1.35, cv=10)))
models.append(('LassoCV', linear_model.LassoCV(max_iter=1000, cv=5)))
models.append(('RF', ensemble.RandomForestRegressor(n_estimators=10)))
models.append(('XGB', XGBRegressor()))

In [6]:
# Candidate classifiers to compare, as (display name, unfitted estimator) pairs.
# Estimators whose training involves randomness get a fixed random_state
# (the same value used for the train/validation split) so that a fresh
# Restart & Run All reproduces the reported scores.
models = [
    ('KNN', KNeighborsClassifier(3)),
    ('Linear SVM', SVC(kernel="linear", C=0.025)),
    ('RBF SVM', SVC(gamma=2, C=1)),
    ('Gaussian', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('DT', DecisionTreeClassifier(max_depth=5, random_state=0)),
    ('RF', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                                  random_state=0)),
    ('Neural Net', MLPClassifier(alpha=1, random_state=0)),
    ('Ada', AdaBoostClassifier(random_state=0)),
    ('NB', GaussianNB()),
    ('QDA', QuadraticDiscriminantAnalysis()),
    ('XGB', XGBClassifier(random_state=0)),
]


# Model fitting and prediction

In [7]:
def model_score(name, model, X, y, val_y, y_pred, icv):
    """Print and return the cross-validation and hold-out scores of one model.

    Parameters
    ----------
    name
        Label used in the printed report.
    model
        Estimator handed to ``cross_val_score`` together with ``X`` and ``y``.
    X, y
        Features and target used for cross-validation.
    val_y, y_pred
        True and predicted targets of the hold-out split, compared with
        ``r2_score``.
    icv
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    tuple
        ``(cvscore, r2)``: the scores returned by ``cross_val_score`` and the
        ``r2_score`` of the hold-out predictions.
    """
    print('Running CV score')
    cvscore = cross_val_score(model, X, y, cv=icv)
    # Evaluate the hold-out metric once and reuse it for the report and the
    # return value instead of computing it twice.
    # NOTE(review): r2_score is a regression metric; if the models passed in
    # are classifiers, an accuracy-style metric may be more meaningful.
    r2 = r2_score(val_y, y_pred)
    print('Model {0} score: {1:.4f} and std dev: {2:.4f}'.format(name, cvscore.mean(), cvscore.std()))
    print('Variance score: %.4f' % r2)
    return cvscore, r2

In [None]:
def cv_fit_and_predict(models, X, y, cv=5, random_state=0):
    """Fit every model on a train split, predict the hold-out split and score it.

    Parameters
    ----------
    models
        Iterable of ``(name, estimator)`` pairs.
    X, y
        Features and target. They are split once with ``train_test_split``
        and also passed unsplit to ``model_score`` for cross-validation.
    cv
        Cross-validation setting forwarded to ``model_score`` (default 5,
        the value that used to be hard-coded).
    random_state
        Seed for ``train_test_split`` (default 0, the value that used to be
        hard-coded).

    Returns
    -------
    list
        One ``(name, cv_scores, r2)`` tuple per model, in input order.
    """
    results = []
    # A single split is shared by all models so their hold-out scores are comparable.
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=random_state)
    for name, model in models:
        print("Fitting model {}".format(model))
        model.fit(train_X, train_y)
        print('Running prediction')
        y_pred = model.predict(val_X)
        score, r2 = model_score(name, model, X, y, val_y, y_pred, cv)
        results.append((name, score, r2))
    print("Done")
    return results


In [None]:
# Fit, predict and score every candidate model; `results` receives the list
# of (name, cv scores, r2) tuples returned by cv_fit_and_predict.
results = cv_fit_and_predict(models, X, y)

Fitting model KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
Running prediction


# Validation and results

In [None]:
def model_validation(results):
    """Plot a box plot of CV scores and a bar chart of hold-out scores per model.

    Parameters
    ----------
    results
        Sequence of ``(name, cv_scores, r2)`` rows. Both a plain list of
        tuples and a 2-D object array with three columns are accepted,
        because the columns are unpacked row by row instead of with
        ``results[:, k]`` indexing (which only works on arrays).

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If ``results`` contains no rows.
    """
    rows = [tuple(row) for row in results]
    if not rows:
        raise ValueError('results is empty: nothing to plot')

    names = [str(row[0]) for row in rows]
    # Coerce to plain floats so matplotlib never has to interpret object-dtype data.
    cv_scores = [np.asarray(row[1], dtype=float) for row in rows]
    r2_values = [float(row[2]) for row in rows]

    fig, ax = plt.subplots()
    ax.boxplot(cv_scores)
    ax.set_xticklabels(names)
    ax.set_title('Model CV score')
    plt.show()

    plt.figure(figsize=(8, 5))
    plt.bar(names, r2_values)
    plt.xlabel('Models')
    plt.ylabel('Var score')
    plt.title('Models variance score')
    plt.show()

    return 0

In [None]:
# Each row mixes a string, an array of fold scores and a float, so request an
# object array explicitly: recent NumPy releases raise ValueError instead of
# silently building an object array from such ragged input.
results = np.array(results, dtype=object)

In [None]:
# Draw both comparison charts; model_validation returns 0, stored here as a status value.
status = model_validation(results)