In [None]:
# File handling
import os
import pickle

# General functions
import numpy as np
import pandas as pd

# Sci-kit learn
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, PredefinedSplit
from sklearn.neural_network import MLPClassifier
import sklearn.pipeline
import sklearn.metrics
# import sklearn.linear_model
# import sklearn.model_selection as skms
# import sklearn.feature_selection
# from sklearn.utils import shuffle
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay

# Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Import our custom functions
from load_data import load_data 

Load training and validation data:

x data is an array of N*784 pixels (N = 2102 for tr, 600 for va)

y is a dataframe of index, class_name and class_id

In [None]:
# Read each split from disk: a pixel array plus a label frame per split.
x_tr, y_tr_df = load_data('x_train.csv', 'y_train.csv')
x_va, y_va_df = load_data('x_valid.csv', 'y_valid.csv')
# Only the first returned item is kept for the test split.
# NOTE(review): 'y_valid.csv' is passed here alongside 'x_test.csv' - presumably
# because load_data needs some label file; confirm this is intentional.
x_te = load_data('x_test.csv', 'y_valid.csv')[0]


# Report the shape of the labelled splits as a quick sanity check.
for split_name, pixels in zip(('train', 'valid'), (x_tr, x_va)):
    print("Contents of %s_x.csv: arr of shape %s" % (split_name, str(pixels.shape)))

Plot N random images (N = 3 in the cell below) from the validation data for each category

In [None]:
# Unseeded generator, so a different sample is drawn on every run.
# Swap in np.random.RandomState(0) for a reproducible figure.
prng = np.random.RandomState()
N = 3 # num examples of each class to show
class_labels = ['dress', 'pullover', 'top', 'trouser', 'sandal', 'sneaker']
fig, axgrid = plt.subplots(N, 6, figsize=(6*3, N*2.5))

# One column per class, N randomly chosen validation images per column.
for col, label in enumerate(class_labels):
    match_df = y_va_df[y_va_df['class_name'] == label]
    # Sampled with replacement, so the same image can appear more than once.
    match_ids_N = prng.choice(match_df.index, size=N)
    for ax, row_id in zip(axgrid[:, col], match_ids_N):
        # Each row of x_va is a flattened 28x28 image.
        ax.imshow(x_va[row_id].reshape((28,28)), vmin=0, vmax=255, cmap='gray')
        ax.set_xticks([])
        ax.set_yticks([])
    # Only the top image of each column carries the class name.
    axgrid[0, col].set_title(label, fontsize=16)
plt.subplots_adjust(left=0.01, right=0.99, wspace=.2, hspace=.01)
plt.tight_layout();
plt.show();

# Data Exploration 1a

In [None]:
# Count how many examples of each class appear in the training and validation labels.
tr_class_dist, val_class_dist = (
    labels['class_name'].value_counts() for labels in (y_tr_df, y_va_df)
)

print('Training class distribution:', str(tr_class_dist), sep='\n')
print('Validation class distribution:', str(val_class_dist), sep='\n')


The training data is heavily skewed towards sandals and sneakers, with only one training image for each of 2 classes. A challenge will be weighting the classes evenly so that the classifier does not always favour the dominant classes.

# Model

Run the search with a predefined split so that only the validation set is used for hyperparameter selection

In [None]:
# Combine training and validation datasets
x_all = np.vstack((x_tr,x_va))
y_all_df = pd.concat([y_tr_df,y_va_df])

print("Training X shape: %s\nValidation X shape: %s\nCombined X shape: %s\n" % (x_tr.shape, x_va.shape, x_all.shape))
print("Training Y shape: %s\nValidation Y shape: %s\nCombined Y shape: %s\n" % (y_tr_df.shape, y_va_df.shape, y_all_df.shape))

valid_indicators = np.hstack([
    -1 * np.ones(y_tr_df.shape[0]), # -1 means never include this example in any test split
    0  * np.ones(y_va_df.shape[0]), #  0 means include in the first test split (we count starting at 0 in python)
    ])

# Define custom splitter to use only the validation dataset for hyperparameter selection
print("Splitter dimensions: %i" % (valid_indicators.shape[0]))
my_splitter = PredefinedSplit(valid_indicators)

display(y_all_df)

# Random search
Load previous model or run new model

In [None]:
# Older run with some unnecessary parameters (learning_rate, learning_rate_init and
# early_stopping are not used by the lbfgs solver per the scikit-learn docs).
flag = 'stop'  # set to 'run' to force a fresh search even when a cached result exists
filename = '1B_rand_search_400.sav'

if os.path.isfile("./" + filename) and flag != 'run':
    # Reuse the cached search instead of re-running 400 fits.
    # NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
    with open(filename, 'rb') as model_file:
        rand_400_1b_model = pickle.load(model_file)
else:
    param_dist = dict(activation=['relu', 'logistic', 'identity', 'tanh'], learning_rate_init=np.logspace(-5, 5, 100), learning_rate = ['constant','adaptive'], hidden_layer_sizes=[(10,),(20,),(50,),(100,),(200,),(500,)])

    # refit=False: the search only scores candidates; no final estimator is refit.
    rand_400_1b_model = sklearn.pipeline.Pipeline([
        ('rand_search', RandomizedSearchCV(sklearn.neural_network.MLPClassifier( solver='lbfgs', random_state = 0, shuffle=True, early_stopping = True), param_dist, scoring='balanced_accuracy', error_score='raise', random_state=0, return_train_score=True, n_iter=400, cv= my_splitter, n_jobs = -1, refit=False))
    ])

    # Fit on x_all as the custom splitter will divide this into tr and val
    rand_400_1b_model.fit(x_all, y_all_df['class_name'])
    # Context manager guarantees the cache file is flushed and closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(rand_400_1b_model, model_file)

In [None]:
# Removed following parameters that don't apply to lbfgs:
# batch size only for non-lbfgs.
# Learning Rate = const, adaptive, etc... ONLY for sgd
# learning_rate_init only used for sgd or adam
# early_stopping, n_iter_no_change and validation_fraction only for sgd/adam.

flag = 'stop'  # set to 'run' to force a fresh search even when a cached result exists
filename = '1B_rand_search_600.sav'

if os.path.isfile("./" + filename) and flag != 'run':
    # Reuse the cached search instead of re-running 600 fits.
    # NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
    with open(filename, 'rb') as model_file:
        rand_600_1b_model = pickle.load(model_file)
else:
    rand_param_dist = dict(hidden_layer_sizes=[(10,),(20,),(50,),(100,),(200,),(500,)], activation=('identity', 'logistic', 'tanh', 'relu'), max_iter = [1, 2, 3, 4, 5, 10, 20, 50, 100, 200, 300], alpha = np.logspace(-5,5,50))

    # refit=False: the search only scores candidates; no final estimator is refit.
    rand_600_1b_model = sklearn.pipeline.Pipeline([
        ('rand_search', RandomizedSearchCV(MLPClassifier(solver='lbfgs', shuffle=True, random_state=0), rand_param_dist, scoring='balanced_accuracy', error_score='raise', return_train_score=True, n_iter=600, cv= my_splitter, n_jobs = -1, refit= False, random_state=0))
    ])

    # Fit on x_all as the custom splitter will divide this into tr and val
    rand_600_1b_model.fit(x_all, y_all_df['class_name'])
    # Context manager guarantees the cache file is flushed and closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(rand_600_1b_model, model_file)

In [None]:
# Pull the search step out of the pipeline once and read its results from there.
search_600 = rand_600_1b_model['rand_search']
randRes = search_600.cv_results_
bestIdx = search_600.best_index_

# Indices noted as tied ("idx 12, 16, 44 all scored the same"); not used below.
# NOTE(review): presumably from an earlier run - confirm they still apply.
bestIdxs = [12,16,44]

# Every cv_results_ column for the winning candidate, stored under key 0.
bestParams = {0: {k: v[bestIdx] for k, v in randRes.items()}}

# Summary of the best candidate: index, parameters, rank and mean validation score.
print(bestIdx)
print(randRes['params'][bestIdx])
print(randRes["rank_test_score"][bestIdx])
display(randRes["mean_test_score"][bestIdx])
print(bestParams[0])

Using the custom splitter, it's saying the following:

best idx = 264
best params = {'max_iter': 100, 'hidden_layer_sizes': (200,), 'alpha': 1456.3484775012444, 'activation': 'relu'}
test score = 0.775

However, when we take those parameters, build a new model and fit it on the training set, it performs much worse on the validation set.

In [None]:
# Earlier configuration, kept for reference:
# bestMLP1b = sklearn.neural_network.MLPClassifier( solver='lbfgs', random_state = 0, shuffle=True, early_stopping = True, learning_rate_init=1e-05,learning_rate ='constant',hidden_layer_sizes=(500,),activation='identity')
# bestMLP1b.fit(x_tr,y_tr_df["class_name"])

# Hyperparameters copied by hand from the random-search result.
best_params_1b = dict(
    max_iter=100,
    hidden_layer_sizes=(200,),
    alpha=1456.3484775012444,
    activation='relu',
)
bestMLP1b = MLPClassifier(solver='lbfgs', shuffle=True, random_state=0, **best_params_1b)

# Fit on the training split only; the validation split stays held out.
bestMLP1b.fit(x_tr, y_tr_df["class_name"])

In [None]:
# Balanced accuracy of the refitted model on the training and validation splits.
tr_acc, va_acc = (
    sklearn.metrics.balanced_accuracy_score(labels['class_name'], bestMLP1b.predict(pixels))
    for pixels, labels in ((x_tr, y_tr_df), (x_va, y_va_df))
)
print("Training balanced accuracy: %f\nValidation balanced accuracy: %f" % (tr_acc, va_acc))

In [None]:
# Plain (unweighted) accuracy of the refitted model on the training and validation splits.
# Note: this overwrites tr_acc / va_acc from the balanced-accuracy cell.
tr_acc = sklearn.metrics.accuracy_score(y_tr_df['class_name'], bestMLP1b.predict(x_tr))
va_acc = sklearn.metrics.accuracy_score(y_va_df['class_name'], bestMLP1b.predict(x_va))
# The label now matches the metric actually computed (accuracy_score, not balanced accuracy).
print("Training accuracy: %f\nValidation accuracy: %f" % (tr_acc, va_acc))

In [None]:
# Predicted class names for the training, validation and test splits (in that order).
pred_tr, pred_va, pred_te = (
    bestMLP1b.predict(pixels) for pixels in (x_tr, x_va, x_te)
)

# Persist the test-set predictions, one per line.
np.savetxt('yhat_test.txt', pred_te, delimiter='\n', fmt='%s')

In [None]:
# Show the first rows*cols test images, each titled with its predicted class.
rows, cols = 3, 7
fig, axgrid = plt.subplots(rows, cols, layout="constrained")

# axgrid.flat walks the grid row by row, matching the image order in x_te.
for imageID, ax in enumerate(axgrid.flat):
    ax.imshow(x_te[imageID].reshape((28,28)), vmin=0, vmax=255, cmap='gray')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(pred_te[imageID], fontsize=16)
# plt.tight_layout();
fig.suptitle("Sample of test data predictions by model 1B")
plt.show();

# Grid Search

In [None]:
# TODO refine search using gridsearchcv
# NOTE(review): the draft below will not run as written if uncommented.
# Per the scikit-learn API, GridSearchCV takes a `param_grid` and does not accept
# `random_state` or `n_iter` (those belong to RandomizedSearchCV). A 100-value
# learning_rate_init grid would also be exhaustively enumerated.
# param_dist = dict(activation=['relu', 'logistic', 'identity', 'tanh'], learning_rate_init=np.logspace(-5, 5, 100), learning_rate = ['constant','adaptive'], hidden_layer_sizes=[(20,),(50,),(100,),(200,),(500,)])

# fashion_pipes = sklearn.pipeline.Pipeline([
#     ('rand_search', GridSearchCV(sklearn.neural_network.MLPClassifier( solver='lbfgs', random_state = 0, shuffle=True, early_stopping = True), param_dist, scoring='balanced_accuracy', error_score='raise', random_state=0, return_train_score=True, n_iter=100, cv= my_splitter, n_jobs = -1, refit=False))
# ])

# fashion_pipes.fit(x_all, y_all_df['class_name'])

In [None]:
# When refit=True the search exposes the refitted model via the best_estimator_ attribute.
# With refit=False that attribute is not available, so the chosen parameters must be
# fitted manually on the training set instead.

# best_est_1 = fashion_pipes['rand_search'].best_estimator_

# pred_tr = best_est_1.predict(x_tr)
# pred_va = best_est_1.predict(x_va)
# pred_te = best_est_1.predict(x_te)

# # Save output of prediction on test data to a file.
# np.savetxt('yhat_test.txt', pred_te, delimiter='\n', fmt='%s')
# sklearn.neural_network.MLPClassifier( solver='lbfgs', random_state = 0, shuffle=True, early_stopping = True)

Ignore for the time being - this was done with refit = true, which fits the final model from randomizedSearchCV onto the entire dataset (tr+val).

![Balanced acc of 1.0 on tr and val](best_estimator_1.png)


# Repeat with data normalized

In [None]:
# normalize data - necessary
# Divide by 255 (the vmax used when plotting) - presumably the maximum pixel value.
x_tr_norm, x_va_norm = (pixels / 255 for pixels in (x_tr, x_va))


In [None]:
# Stack the normalized training rows on top of the normalized validation rows.
x_all_norm = np.concatenate([x_tr_norm, x_va_norm], axis=0)
y_all_df = pd.concat([y_tr_df, y_va_df])

print("Training X shape: %s\nValidation X shape: %s\nCombined X shape: %s\n" % (x_tr_norm.shape, x_va_norm.shape, x_all_norm.shape))
print("Training Y shape: %s\nValidation Y shape: %s\nCombined Y shape: %s\n" % (y_tr_df.shape, y_va_df.shape, y_all_df.shape))

# Fold indicator per row of the stacked data:
#   -1 -> never include this example in any test split (training rows)
#    0 -> include in the first (and only) test split (validation rows)
n_tr = y_tr_df.shape[0]
n_va = y_va_df.shape[0]
valid_indicators = np.concatenate([np.full(n_tr, -1.0), np.zeros(n_va)])

# Custom splitter so that only the validation dataset is used for hyperparameter selection.
print("Splitter dimensions: %i" % (valid_indicators.shape[0]))
my_splitter = PredefinedSplit(valid_indicators)


In [None]:
# Random search on the normalized data, cached on disk like the un-normalized runs.
flag = 'stop'  # set to 'run' to force a fresh search even when a cached result exists
filename = '1Bnorm_rand_search_100.sav'

if os.path.isfile("./" + filename) and flag != 'run':
    # Reuse the cached search instead of re-running 100 fits.
    # NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
    with open(filename, 'rb') as model_file:
        fashion_pipes_norm = pickle.load(model_file)
else:
    param_dist = dict(activation=['relu', 'logistic', 'identity', 'tanh'], learning_rate_init=np.logspace(-5, 5, 100), learning_rate = ['constant','adaptive'], hidden_layer_sizes=[(20,),(50,),(100,),(200,),(500,)])

    # refit=False: the search only scores candidates; no final estimator is refit.
    fashion_pipes_norm = sklearn.pipeline.Pipeline([
        ('rand_search', RandomizedSearchCV(sklearn.neural_network.MLPClassifier( solver='lbfgs', random_state = 0, shuffle=True, early_stopping = True), param_dist, scoring='balanced_accuracy', error_score='raise', random_state=0, return_train_score=True, n_iter=100, cv= my_splitter, n_jobs = -1, refit=False))
    ])

    # Fit on x_all_norm; the predefined splitter divides it into tr and val.
    fashion_pipes_norm.fit(x_all_norm, y_all_df['class_name'])
    # Dump the model that was just fitted (the original dumped the undefined name
    # `fashion_pipes`, which raised NameError after the whole search had run).
    with open(filename, 'wb') as model_file:
        pickle.dump(fashion_pipes_norm, model_file)

In [None]:
# Quick check of whether a cached search result exists next to the notebook.
filename = '1D_initial_RanSearch.sav'
candidate_path = "./" + filename
print(candidate_path)

os.path.isfile(candidate_path)

In [None]:
# Assess the same procedure with normalized input data - same results and same accuracy found in this case.

# Read from the normalized search (`fashion_pipes_norm`); the original referenced
# `fashion_pipes`, which is never defined on a fresh kernel.
norm_search = fashion_pipes_norm['rand_search']
gridRes = norm_search.cv_results_

display(gridRes)
bestIdxNorm = norm_search.best_index_ # idx 12, 16, 44 all scored the same

# Collect every cv_results_ column for each of the tied candidates, keyed 0..2.
bestIdxs = [12,16,44]
bestParams = {
    pos: {k: v[idx] for k, v in gridRes.items()}
    for pos, idx in enumerate(bestIdxs)
}

display(bestParams)
# print(gridRes["mean_test_score"])
# display(gridRes["mean_test_score"])

In [None]:
# Hand-picked configuration for the normalized-data model.
norm_mlp_kwargs = dict(
    solver='lbfgs',
    random_state=0,
    shuffle=True,
    early_stopping=True,
    learning_rate_init=1e-05,
    learning_rate='constant',
    hidden_layer_sizes=(500,),
    activation='identity',
)
bestMLP1bNorm = sklearn.neural_network.MLPClassifier(**norm_mlp_kwargs)

# Fit on the normalized training split only.
bestMLP1bNorm.fit(x_tr_norm, y_tr_df["class_name"])

In [None]:
# Balanced accuracy of the normalized-data model on the training and validation splits.
tr_acc, va_acc = (
    sklearn.metrics.balanced_accuracy_score(labels['class_name'], bestMLP1bNorm.predict(pixels))
    for pixels, labels in ((x_tr_norm, y_tr_df), (x_va_norm, y_va_df))
)
print("Training balanced accuracy: %f\nValidation balanced accuracy: %f" % (tr_acc, va_acc))

For some reason the validation accuracy seems to be lower on normalized data