In [None]:
# File handling
import os
import pickle

# General functions
import numpy as np
import pandas as pd

# Sci-kit learn
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, PredefinedSplit
from sklearn.neural_network import MLPClassifier
import sklearn.pipeline
import sklearn.metrics
# import sklearn.linear_model
# import sklearn.model_selection as skms
# import sklearn.feature_selection
# from sklearn.utils import shuffle
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay

# Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Import our custom functions
from load_data import load_data 

In [None]:
def load_data1(x_file, y_file):
    data_dir = os.path.abspath("data_fashion/")

    # Load data
    x_df = pd.read_csv(os.path.join(data_dir, x_file)).to_numpy()
    y_df = pd.read_csv(os.path.join(data_dir, y_file))
    
    return x_df, y_df

In [None]:
# Load both splits with the same loader so the training and validation data
# share one format (features as a NumPy array, labels as a DataFrame).
x_tr, y_tr_df = load_data1('x_train.csv', 'y_train.csv')
x_va, y_va_df = load_data1('x_valid.csv', 'y_valid.csv')

# Cheap sanity checks: every row has a label and both splits have the same features.
assert x_tr.shape[0] == y_tr_df.shape[0], "training features/labels row mismatch"
assert x_va.shape[0] == y_va_df.shape[0], "validation features/labels row mismatch"
assert x_tr.shape[1] == x_va.shape[1], "train/validation feature count mismatch"

# The real code is shown as stylised markdown where needed; the equivalent objects are set up in the code cells.

```python
# Stack the training rows on top of the validation rows so that a single fit call
# receives both; the predefined split below decides how each row is used.
x_all = np.vstack((x_tr, x_va))
y_all_df = pd.concat([y_tr_df, y_va_df])

# One fold label per stacked row:
#   -1 -> never include this example in any test split (training rows)
#    0 -> include in the first test split, counting from 0 (validation rows)
valid_indicators = np.concatenate((
    np.full(y_tr_df.shape[0], -1.0),
    np.zeros(y_va_df.shape[0]),
))

# Custom splitter that scores hyperparameters on the validation dataset only
print("Splitter dimensions: %i" % (valid_indicators.shape[0]))
my_splitter = PredefinedSplit(valid_indicators)
```

In [None]:
# Rebuild the predefined split over the stacked (training, validation) rows:
#   -1 -> row is never placed in a test split (training rows)
#    0 -> row is placed in the single test split (validation rows)
# PredefinedSplit expects one fold label per sample, so passing ``(2102,)`` would
# describe one sample assigned to fold 2102 rather than a split over 2102 rows.
valid_indicators = np.hstack([
    np.full(y_tr_df.shape[0], -1),
    np.zeros(y_va_df.shape[0], dtype=int),
])
my_splitter = PredefinedSplit(valid_indicators)

The next code cell loads the pre-run model. For context, the randomized search CV call that generated the saved file is shown below.

```python
# Hyperparameter values sampled by the randomized search
rand_param_dist = dict(
    hidden_layer_sizes=[(10,), (20,), (50,), (100,), (200,), (500,)],
    activation=('identity', 'logistic', 'tanh', 'relu'),
    max_iter=[1, 2, 3, 4, 5, 10, 20, 50, 100, 200, 300],
    alpha=np.logspace(-5, 5, 50),
)

rand_600_1b_model = sklearn.pipeline.Pipeline([
    ('rand_search', RandomizedSearchCV(
        MLPClassifier(solver='lbfgs', shuffle=True, random_state=0),
        rand_param_dist,
        scoring='balanced_accuracy',
        error_score='raise',
        return_train_score=True,
        n_iter=600,
        cv=my_splitter,
        n_jobs=-1,
        refit=False,
        random_state=0,
    ))
])

# Fit on x_all as the custom splitter will divide this into tr and val
rand_600_1b_model.fit(x_all, y_all_df['class_name'])

# Persist the fitted search; the context manager closes the file once written.
filename = '1B_rand_search_600.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rand_600_1b_model, model_file)
```

In [None]:
filename = '1B_rand_search_600.sav'

# SECURITY: pickle.load can execute arbitrary code, so only load files produced by this project.
# The context manager closes the file handle as soon as the model has been read.
with open(filename, 'rb') as model_file:
    rand_600_1b_model = pickle.load(model_file)

# NOTE score is set to balanced accuracy

In [None]:
# Pull the cross-validation results table and the winning candidate's position
# out of the search step of the loaded pipeline.
randRes = rand_600_1b_model['rand_search'].cv_results_
bestIdx = rand_600_1b_model['rand_search'].best_index_  # idx 12, 16, 44 all scored the same

# Every cv_results_ entry for the best candidate, stored under key 0
bestParams = {0: {key: values[bestIdx] for key, values in randRes.items()}}

display(bestParams)

print("Best Index: %i" % bestIdx)

# print(randRes['params'][bestIdx])
# print(randRes["rank_test_score"][bestIdx])
best_test_score = randRes["mean_test_score"][bestIdx]
print("RandomizedSearchCV reported test balanced accuracy: %f" % (best_test_score))


In [None]:
# Remake the best run with the same settings

# Take the tuned hyperparameters straight from the loaded search instead of
# hard-coding them, so this model always matches the candidate whose score is
# reported by the search (the values were previously written out by hand as
# max_iter=100, hidden_layer_sizes=(200,), alpha=1456.3484775012444, activation='relu').
# The fixed arguments mirror the base estimator used in the search.
bestMLP1b = MLPClassifier(
    solver='lbfgs',
    shuffle=True,
    random_state=0,
    **rand_600_1b_model['rand_search'].best_params_,
)

# Fit on the training split only; validation data is kept for scoring.
bestMLP1b.fit(x_tr, y_tr_df["class_name"])

In [None]:
# Balanced accuracy of the rebuilt model on the training split, then the validation split
tr_acc, va_acc = (
    sklearn.metrics.balanced_accuracy_score(labels['class_name'], bestMLP1b.predict(features))
    for features, labels in ((x_tr, y_tr_df), (x_va, y_va_df))
)
print("Training balanced accuracy: %f\nValidation balanced accuracy: %f" % (tr_acc, va_acc))

The balanced accuracy on the validation dataset reported here is much lower than the score reported by RandomizedSearchCV above.