# Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torchvision
import torchvision.transforms as transforms
import math

from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier

In [2]:
# Fix the random seed for reproducibility
# !! Important !! : do not change this
seed = 1234
np.random.seed(seed)

import warnings
warnings.filterwarnings("ignore") # for future tuning

# Loading Data

In [3]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 128

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=4)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=4)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [4]:
trainset_np = np.asarray(trainset.data)
trainset_labels = np.asarray(trainset.targets)
testset_np = np.asarray(testset.data)
testset_labels = np.asarray(testset.targets)

X_train = trainset_np
y_train = trainset_labels

validset_np = trainset_np[40000:50000]
validset_labels = trainset_labels[40000:50000]
trainset_np = trainset_np[0:40000]
trainset_labels = trainset_labels[0:40000]

print('Train Set')
print(trainset_np.shape)
print(trainset_labels.shape)
print('-')
print('Validation Set')
print(validset_np.shape)
print(validset_labels.shape)
print('-')
print('Test Set')
print(testset_np.shape)
print(testset_labels.shape)
print('-')
print(f'X_train {X_train.shape}')
print(f'y_train {y_train.shape}')

Train Set
(40000, 32, 32, 3)
(40000,)
-
Validation Set
(10000, 32, 32, 3)
(10000,)
-
Test Set
(10000, 32, 32, 3)
(10000,)
-
X_train (50000, 32, 32, 3)
y_train (50000,)


# Flattening Data

In [5]:
# For use in manual tuning
trainset_flattened = np.empty((trainset_np.shape[0], 3072))
validset_flattened = np.empty((validset_np.shape[0], 3072))
testset_flattened = np.empty((testset_np.shape[0], 3072))

# For use in cross-validation
X_train_flattened = np.empty((X_train.shape[0], 3072))

In [6]:
for i in range(trainset_np.shape[0]):
    trainset_flattened[i] = trainset_np[i].flatten()

for i in range(validset_np.shape[0]):
    validset_flattened[i] = validset_np[i].flatten()

for i in range(testset_np.shape[0]):
    testset_flattened[i] = testset_np[i].flatten()


for i in range(X_train.shape[0]):
    X_train_flattened[i] = X_train[i].flatten()

# Tuning MLP

### Base Classifier with Default Parameters

In [7]:
clf = MLPClassifier(random_state=seed)
clf.fit(trainset_flattened, trainset_labels)
y_pred = clf.predict(validset_flattened)
num = (y_pred == validset_labels).sum() # number of correct labels
print(f' Accuracy based on Validation Set {num / len(validset_labels)}') # accuracy 
# print(clf.score(testset_flattened, testset_labels))

 Accuracy based on Validation Set 0.0952


### Using GridSearchCV for hyperparameter tuning

#### 100 Datapoints

In [11]:
import time # for elapsed time

cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

param_grid = {
    'hidden_layer_sizes':[5, 10, 15, (5, 5), (5, 10)],
    'activation': ['identity','logistic','tanh','relu'],
    'solver': ['lbfgs','sgd','adam'],
    'learning_rate':['constant','invscaling','adaptive']
    'random_state':[seed]}

gridSearch = GridSearchCV(MLPClassifier(), param_grid, cv=cv,verbose=2)
start = time.time()
gridSearch.fit(X_train_flattened[:100], y_train[:100])
print('Score: ', gridSearch.best_score_)
print('Parameters: ', gridSearch.best_params_)
end = time.time()

print(f'Elapsed Time: {end - start}')

Fitting 1 folds for each of 180 candidates, totalling 180 fits
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=lbfgs; total time=   0.1s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=sgd; total time=   0.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=adam; total time=   0.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=lbfgs; total time=   0.2s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=sgd; total time=   0.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=adam; total time=   0.1s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=adaptive, solver=lbfgs; total time=   0.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=adaptive, solver=sgd; total time=   0.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=a

##### Displaying Hyperparameter Tuning Results

In [12]:
results_df = pd.DataFrame(gridSearch.cv_results_)
print(results_df)

     mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0         0.203080           0.0         0.001004             0.0   
1         0.014840           0.0         0.001018             0.0   
2         0.120437           0.0         0.000000             0.0   
3         0.276190           0.0         0.001507             0.0   
4         0.130112           0.0         0.000000             0.0   
..             ...           ...              ...             ...   
175       0.017333           0.0         0.000000             0.0   
176       0.207999           0.0         0.001012             0.0   
177       0.026511           0.0         0.001270             0.0   
178       0.125922           0.0         0.001002             0.0   
179       0.194782           0.0         0.001094             0.0   

    param_activation param_hidden_layer_sizes param_learning_rate  \
0           identity                        5            constant   
1           identity             

#### Variable Amount of Datapoints

In [13]:
vary_datapoints = [1000, 5000, 10000, 50000] # 50000 is the original total
results = {} # num_datapoint -> (gridSearch_obj, elapsed_time)

cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

param_grid = {
    'hidden_layer_sizes':[5, 10, 15, (5, 5), (5, 10)],
    'activation': ['identity','logistic','tanh','relu'],
    'solver': ['lbfgs','sgd','adam'],
    'learning_rate':['constant','invscaling','adaptive']
    'random_state':[seed]}

for num_datapoint in vary_datapoints:
    gridSearch_obj = GridSearchCV(MLPClassifier(), param_grid, cv=cv,verbose=2)
    start = time.time()
    gridSearch_obj.fit(X_train_flattened[:num_datapoint], y_train[:num_datapoint])
    end = time.time()
    elapsed = end - start
    results[num_datapoint] = (gridSearch_obj, elapsed)

Fitting 1 folds for each of 180 candidates, totalling 180 fits
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=lbfgs; total time=   0.1s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=sgd; total time=   0.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=adam; total time=   0.2s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=lbfgs; total time=   0.1s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=sgd; total time=   0.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=adam; total time=   0.3s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=adaptive, solver=lbfgs; total time=   0.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=adaptive, solver=sgd; total time=   0.4s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=a

KeyboardInterrupt: 

Individual Additions to `results` dictionary.

In [None]:
num_datapoint = 10000
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

gridSearch_obj = GridSearchCV(MLPClassifier(), param_grid, cv=cv,verbose=2)
start = time.time()
gridSearch_obj.fit(X_train_flattened[:num_datapoint], y_train[:num_datapoint])
end = time.time()
elapsed = end - start
results[num_datapoint] = (gridSearch_obj, elapsed)

In [30]:
num_datapoint = 50000
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

gridSearch_obj = GridSearchCV(MLPClassifier(), param_grid, cv=cv,verbose=2)
start = time.time()
gridSearch_obj.fit(X_train_flattened[:num_datapoint], y_train[:num_datapoint])
end = time.time()
elapsed = end - start
results[num_datapoint] = (gridSearch_obj, elapsed)

Fitting 1 folds for each of 180 candidates, totalling 180 fits
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=lbfgs; total time=  57.2s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=sgd; total time= 2.2min
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=constant, solver=adam; total time=  41.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=lbfgs; total time=  46.2s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=sgd; total time= 2.1min
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=adam; total time=  31.3s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=adaptive, solver=lbfgs; total time=  47.4s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=adaptive, solver=sgd; total time= 2.1min
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=a

KeyboardInterrupt: 

### Consolidating Data

In [29]:
for num_datapoints, value in results.items():
    print(f'For {num_datapoints} datapoints:')
    print(f'Accuracy: {value[0].best_score_}')
    print(f'was achieved in {value[1]} seconds with the following parameters:')
    print(f'{value[0].best_params_}')

For 1000 datapoints:
Accuracy: 0.255
was achieved in 116.40941214561462 seconds with the following parameters:
{'activation': 'identity', 'hidden_layer_sizes': 15, 'learning_rate': 'adaptive', 'solver': 'adam'}
For 5000 datapoints:
Accuracy: 0.322
was achieved in 777.1471126079559 seconds with the following parameters:
{'activation': 'identity', 'hidden_layer_sizes': (5, 10), 'learning_rate': 'adaptive', 'solver': 'adam'}
For 10000 datapoints:
Accuracy: 0.362
was achieved in 1749.1434381008148 seconds with the following parameters:
{'activation': 'identity', 'hidden_layer_sizes': 10, 'learning_rate': 'invscaling', 'solver': 'adam'}


### Hyperparameter Tuning (Part 2)
Now we should consider using different hidden layer sizes paired with two types of learning rates.

In [33]:
num_datapoint = 5000

cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

param_grid = {
    'hidden_layer_sizes':range(1, 100, 2),
    'activation': ['identity'],
    'solver': ['adam'],
    'learning_rate':['invscaling','adaptive'],
    'random_state':[seed]}


grid_search = GridSearchCV(MLPClassifier(), param_grid, cv=cv,verbose=2)
start = time.time()
grid_search.fit(X_train_flattened[:num_datapoint], y_train[:num_datapoint])
end = time.time()
elapsed = end - start
print(f'Elapsed Time: {elapsed}')

Fitting 1 folds for each of 100 candidates, totalling 100 fits
[CV] END activation=identity, hidden_layer_sizes=1, learning_rate=invscaling, solver=adam; total time=   7.1s
[CV] END activation=identity, hidden_layer_sizes=1, learning_rate=adaptive, solver=adam; total time=   8.2s
[CV] END activation=identity, hidden_layer_sizes=3, learning_rate=invscaling, solver=adam; total time=  10.7s
[CV] END activation=identity, hidden_layer_sizes=3, learning_rate=adaptive, solver=adam; total time=  10.8s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=invscaling, solver=adam; total time=  13.0s
[CV] END activation=identity, hidden_layer_sizes=5, learning_rate=adaptive, solver=adam; total time=   4.4s
[CV] END activation=identity, hidden_layer_sizes=7, learning_rate=invscaling, solver=adam; total time=   5.9s
[CV] END activation=identity, hidden_layer_sizes=7, learning_rate=adaptive, solver=adam; total time=   9.4s
[CV] END activation=identity, hidden_layer_sizes=9, learning_rate

In [34]:
# Results:

print(f'For {num_datapoints} datapoints:')
print(f'Accuracy: {grid_search.best_score_}')
print(f'was achieved in {elapsed} seconds with the following parameters:')
print(f'{grid_search.best_params_}')

For 10000 datapoints:
Accuracy: 0.323
was achieved in 958.3015356063843 seconds with the following parameters:
{'activation': 'identity', 'hidden_layer_sizes': 3, 'learning_rate': 'adaptive', 'solver': 'adam'}


# Final Model

In [36]:
clf = MLPClassifier(activation='identity', solver='adam', hidden_layer_sizes=3, learning_rate='adaptive', random_state=seed)

clf.fit(trainset_flattened, trainset_labels)

print(f' Training Set Accuracy: {clf.score(trainset_flattened, trainset_labels)}')
print(f' Validation Set Accuracy: {clf.score(validset_flattened, validset_labels)}')
print(f' Testing Set Accuracy: {clf.score(testset_flattened, testset_labels)}')

 Training Set Accuracy: 0.321375
 Validation Set Accuracy: 0.3112
 Testing Set Accuracy: 0.3031


### Random Manual Tuning (Initial)

The following classifiers were built to guage parameters with random values.

In [None]:
clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=seed, learning_rate='adaptive')

clf.fit(trainset_flattened, trainset_labels)

y_pred = clf.predict(testset_flattened)
print(clf.score(testset_flattened, testset_labels))

In [None]:
clf = MLPClassifier(solver='sgd', alpha=1e-5, random_state=seed, learning_rate='adaptive')

clf.fit(trainset_flattened, trainset_labels)

# y_pred = clf.predict(testset_flattened)
print(clf.score(testset_flattened, testset_labels))

In [None]:
clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=seed)

clf.fit(trainset_flattened, trainset_labels)

# y_pred = clf.predict(testset_flattened)
print(clf.score(testset_flattened, testset_labels))