In [1]:
%load_ext autoreload
%autoreload 2

In [11]:
import os
import copy
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.decomposition import PCA, FastICA

from time import perf_counter

import umap

from utilities import get_reconstruction_error, get_pca_components_for_var, df_to_heatmap

In [12]:
# Import shared settings
from settings import random_state, n_jobs, nSamples, max_iter, test_size, class_sep
# Override settings
#...

In [13]:
# Used for plot names
dim_red_name = "nn_all_features"
casename = f"synthetic_{dim_red_name}"
# scoreText = "Score (Classification Accuracy)"

# Generate Data

In [14]:
X_full, y_full = make_classification(n_samples=nSamples, n_features=40, n_informative=10, n_redundant=5, n_classes=6, random_state=random_state, class_sep=class_sep)

X, X_test, y, y_test = train_test_split(X_full, y_full, test_size=test_size, 
                                      random_state=random_state, stratify=y_full)

print('')
print(f'X.shape = {X.shape}')
print(f'y.shape = {y.shape}')
print("Fraction of training data for each label = ")
for label in np.unique(y):
    print(f'\tData labelled {label} = {(y == label).sum() / float(len(y))}')

print('')
print(f'X_test.shape = {X_test.shape}')
print(f'y_test.shape = {y_test.shape}')
print("Fraction of testing data for each label = ")
for label in np.unique(y):
    print(f'\tData labelled {label} = {(y == label).sum() / float(len(y))}')


X.shape = (2000, 40)
y.shape = (2000,)
Fraction of training data for each label = 
	Data labelled 0 = 0.165
	Data labelled 1 = 0.169
	Data labelled 2 = 0.17
	Data labelled 3 = 0.1635
	Data labelled 4 = 0.166
	Data labelled 5 = 0.1665

X_test.shape = (500, 40)
y_test.shape = (500,)
Fraction of testing data for each label = 
	Data labelled 0 = 0.165
	Data labelled 1 = 0.169
	Data labelled 2 = 0.17
	Data labelled 3 = 0.1635
	Data labelled 4 = 0.166
	Data labelled 5 = 0.1665


# Fit using MLP through GS

In [15]:
pipe = Pipeline([('scaler', StandardScaler()),
                  ('mlp', MLPClassifier(max_iter=max_iter, random_state=random_state))])

hidden_layer_sizes = [(2, ), (6, ), (10, ), (15,)]
param_grid = {
             'mlp__hidden_layer_sizes': hidden_layer_sizes,
             }
gs = GridSearchCV(pipe, param_grid=param_grid, verbose=10, cv=3, return_train_score=True, 
                  n_jobs=n_jobs)

In [16]:
gs.fit(X, y)
joblib.dump(gs, casename + "_gs.pkl")

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:  1.3min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:  1.3min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  1.5min remaining:  1.0min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  1.5min remaining:   30.7s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.8min finished


['synthetic_nn_all_features_gs.pkl']

In [17]:
results = pd.DataFrame(gs.cv_results_)
results.to_csv(casename + "_results.csv")

In [19]:
results.loc[:, ["param_mlp__hidden_layer_sizes", "mean_test_score", "mean_train_score"]]

Unnamed: 0,param_mlp__hidden_layer_sizes,mean_test_score,mean_train_score
0,"(2,)",0.6345,0.662744
1,"(6,)",0.8055,0.891742
2,"(10,)",0.7795,0.968255
3,"(15,)",0.781,0.9985
