In [2]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell runs, so edits to local helper modules are picked up live without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import copy
import pickle
from time import sleep
from time import perf_counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.decomposition import FastICA, PCA
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.manifold import TSNE

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old scikit-learn installs that do not ship standalone joblib.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import umap

from utilities import drawSketch, computeLearningCurve, plotScoreCurve, plotTimeCurve, plotIterationCurve, plotConfusionMatrix, getDrawData, drawIncorrectSketches, heatmap, get_pca_components_for_var, plot_pca_components, df_to_heatmap, get_reconstruction_error, get_max_along_df_axis


# Define settings

In [4]:
# Import shared settings
from settings import random_state, n_jobs, test_size, pRows, pCols, pColNames, pScale, nSamples, names, accuracyMin, accuracyMax, max_iter
# Override settings
#...

In [5]:
# Identifiers for this experiment. `casename` is the common prefix for the
# artifacts this notebook writes (e.g. the "_gs.pkl" and "_results.csv" files).
dim_red_name = "nn_all_features"
casename = "quickdraw_{}".format(dim_red_name)
# scoreText = "Score (Classification Accuracy)"

# Load and curate data

In [6]:
# Load the sketch data for every category in `names`. getDrawData returns a
# dict; its 'df' entry is the DataFrame of samples and its 'nameDict' entry
# maps each category name to its class number.
dataDict = getDrawData(names, nSamples=nSamples)
data = dataDict['df']
nameDict = dataDict['nameDict']

print("")
print("Loaded the following named categories, with classNumbers shown")
print(nameDict)

# Scale pixel data: divide every pixel column by pScale in one vectorized
# assignment instead of looping over the columns one at a time in Python.
# list(...) guarantees pandas treats the names as a collection of column labels.
pixel_cols = list(pColNames)
data[pixel_cols] = data[pixel_cols] / pScale

Loading 2500 samples for baseball from ./data/full_numpy_bitmap_baseball.npy
Loading 2500 samples for basketball from ./data/full_numpy_bitmap_basketball.npy

Loaded the following named categories, with classNumbers shown
{'baseball': 0, 'basketball': 1}


In [7]:
# Assemble the feature matrix (pixel columns only) and the label vector, then
# hold out a stratified test set.

X_full = data.loc[:, pColNames].values
print(f'X_full.shape = {X_full.shape}')
y_full = data['classNumber'].values
print(f'y_full.shape = {y_full.shape}')

# stratify=y_full keeps the class proportions of the full data in both splits;
# random_state makes the split repeatable.
X, X_test, y, y_test = train_test_split(
    X_full,
    y_full,
    test_size=test_size,
    random_state=random_state,
    stratify=y_full,
)

# NOTE: the number printed on the "Ratio" lines below is the fraction of
# samples labelled 1 (the mean of `label == 1`), not the literal 1:0 ratio.
print('')
print(f'X.shape = {X.shape}')
print(f'y.shape = {y.shape}')
print(f'Ratio of y=1 to y=0 for Training data: {(y == 1).mean()} (should be 0.5)')


print('')
print(f'X_test.shape = {X_test.shape}')
print(f'y_test.shape = {y_test.shape}')
print(f'Ratio of y_test=1 to y_test=0 for Test data: {(y_test == 1).mean()} (should be 0.5)')

X_full.shape = (5000, 784)
y_full.shape = (5000,)

X.shape = (4000, 784)
y.shape = (4000,)
Ratio of y=1 to y=0 for Training data: 0.5 (should be 0.5)

X_test.shape = (1000, 784)
y_test.shape = (1000,)
Ratio of y_test=1 to y_test=0 for Test data: 0.5 (should be 0.5)


# Fit an MLP classifier using grid search (GridSearchCV)

In [7]:
# Model under test: standardize the features, then fit an MLP classifier.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(random_state=random_state, max_iter=max_iter)),
])

# Candidate architectures: one hidden layer with 2, 4, 6 or 10 units.
hidden_layer_sizes = [(n_units,) for n_units in (2, 4, 6, 10)]
param_grid = {'mlp__hidden_layer_sizes': hidden_layer_sizes}

# 5-fold cross-validated search over the candidates; train scores are kept so
# they can be compared against the validation scores afterwards.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=n_jobs,
    return_train_score=True,
    verbose=10,
)

In [8]:
# Run the grid search on the training split only; the test split is not used here.
gs.fit(X, y=y)

# Persist the fitted search object next to the notebook. Left as the last
# expression so the cell displays the list of file names joblib wrote.
joblib.dump(gs, f"{casename}_gs.pkl")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:  1.4min remaining:  2.2min
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  1.8min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  14 out of  20 | elapsed:  1.9min remaining:   49.5s
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:  2.2min remaining:   22.7s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.2min finished


['quickdraw_nn_all_features_gs.pkl']

In [9]:
# Tabulate the cross-validation results: one row per parameter candidate.
results = pd.DataFrame(gs.cv_results_)

# index=False: the frame only carries a default integer index. Writing it out
# makes a plain pd.read_csv(...) return it as an extra "Unnamed: 0" column,
# and each further save/reload cycle stacks another one ("Unnamed: 0.1", ...).
results.to_csv(casename + "_results.csv", index=False)

In [8]:
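# Optional: uncomment to reload previously saved grid-search results from CSV
# instead of re-running the (slow) grid search.
# results = pd.read_csv(casename + "_results.csv")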

In [9]:
# Display the grid-search results table (fit/score times plus per-split and
# mean train/test scores for each hidden_layer_sizes candidate).
results

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0,59.198,5.401217,0.035601,0.021061,"(2,)","{'mlp__hidden_layer_sizes': (2,)}",0.75875,0.74375,0.75875,...,0.752,0.005895,3,0.929688,0.939688,0.9325,0.933438,0.938438,0.93475,0.003752
1,1,42.829801,3.085544,0.0304,0.015639,"(4,)","{'mlp__hidden_layer_sizes': (4,)}",0.7375,0.725,0.78125,...,0.75025,0.018914,4,0.977187,0.983125,0.977812,0.984375,0.982812,0.981062,0.002962
2,2,35.5878,2.222834,0.034799,0.013865,"(6,)","{'mlp__hidden_layer_sizes': (6,)}",0.7575,0.76375,0.785,...,0.77,0.009454,1,0.986563,0.985625,0.990625,0.983125,0.985938,0.986375,0.002425
3,3,26.711514,4.056098,0.052199,0.018871,"(10,)","{'mlp__hidden_layer_sizes': (10,)}",0.76,0.76375,0.78125,...,0.76625,0.009186,2,0.999062,0.99875,0.99875,0.999062,0.998437,0.998812,0.000234
