**COMP3670/6670 Tutorial Week 10**
---

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Task 0: Digits Dataset (8x8 MNIST-like digits from scikit-learn, Recap)

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
# With return_X_y=True, load_digits returns a (data, target) pair.
# X holds the flattened images (reshaped to 8x8 for display below), y the labels.
X, y = datasets.load_digits(return_X_y=True)
# Keep the descriptive names bound to the same arrays as X and y.
digits, labels = X, y

def print_heatmap(data, l=8, h=8):
    """Display a flattened image as a grayscale heatmap.

    Parameters
    ----------
    data : array with ``l * h`` entries
        Flattened pixel values; must provide a ``reshape`` method.
    l : int, optional
        Number of rows of the reshaped image (default 8).
    h : int, optional
        Number of columns of the reshaped image (default 8).
    """
    image = data.reshape(l, h)
    plt.matshow(image, cmap=plt.cm.gray)

print_heatmap(X[42]) # display sample 42 of the dataset as a grayscale heatmap


In [None]:
# Fraction of the samples held out for testing.
TEST_SIZE = 0.5

# random_state=0 fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=0,
)

# Report the shape of the full dataset and of each split.
print("Total dataset", np.shape(X))
print("Split dataset (Train)", np.shape(X_train))
print("Split dataset (Test)", np.shape(X_test))

# MLP

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPClassifier

# The MLP's input layer receives 64 pixel values (one 8x8 image); its output layer predicts 10 labels.
# Hidden layers are layers of perceptrons connected between the input and output layers.
# For example, (64,32) denotes a first hidden layer with 64 perceptrons connected to the input layer,
# followed by another hidden layer with 32 perceptrons connected to the first hidden layer
# and to the output layer. How do these choices matter?
# hidden_layers = (64,32)
hidden_layers = (64,)

# Max iterations is set to 10. Can you set it higher? Will a higher value ensure every iteration is used?
max_iterations = 10

mlp = MLPClassifier(
    hidden_layer_sizes=hidden_layers,
    activation='logistic',
    max_iter=max_iterations,
    random_state=42,
    verbose=1,
)

# Fit inside a warnings context that ignores sklearn's ConvergenceWarning;
# the filter is restored when the `with` block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
    mlp.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on, and on the held-out data.
print("Training set score: %f" % (mlp.score(X_train, y_train),))
print("Test set score: %f" % (mlp.score(X_test, y_test),))



In [None]:
#Visualising hidden layer weights
import math

# Every weight matrix of the fitted model except the last one feeds a hidden
# layer; the last one feeds the output layer and is not drawn here. Iterating
# over the fitted matrices (instead of the `hidden_layers` setting) keeps the
# figure consistent with the model that was actually trained.
for layer_weights in mlp.coefs_[:-1]:
    # One column per unit in this layer: column j holds that unit's incoming weights.
    n_units = layer_weights.shape[1]
    # squeeze=False always returns a 2-D array of Axes, so a layer with a
    # single unit no longer breaks the axes.ravel() call below.
    fig, axes = plt.subplots(1, n_units, figsize=(30, 3), squeeze=False)
    # Share one colour scale across all units of the layer so panels are comparable.
    vmin, vmax = layer_weights.min(), layer_weights.max()
    for coef, ax in zip(layer_weights.T, axes.ravel()):
        #display square subset
        mat_side = math.floor(math.sqrt(len(coef)))
        #For visualising only showing the largest square matrix the weights represent
        ax.matshow(
            coef[:mat_side * mat_side].reshape(mat_side, mat_side),
            cmap=plt.cm.gray,
            vmin=vmin,
            vmax=vmax,
        )
        ax.set_xticks(())
        ax.set_yticks(())

plt.show()

# Task: Test score vs max iterations

In [None]:
#Can you try the same MLP with different max_iterations up to 1000? What is the test score mlp.score(X_test, y_test)?
#Try to plot the test score against max_iterations

# Task: Test score vs architecture

In [None]:
#For this task set the maximum iterations to be 1000.
#Is there a 'best' architecture? 
#What about wider networks (64) or (128)?
#What about deeper networks (64,64) or (64,32) or (32,32,32)?
#Can you check the test score for different architectures?


# Probabilities of Detections

In [None]:
def plot_detection_probabilities(entry):
    """Plot the class probabilities the fitted ``mlp`` predicts for one sample.

    Parameters
    ----------
    entry : array-like
        A single sample (one flattened image). It is wrapped in a list so
        that ``predict_proba`` receives a batch containing one row.

    Notes
    -----
    One bar is drawn per predicted probability and the ticks are labelled
    with ``mlp.classes_``, so the plot also works when the model was fitted
    on a number of classes other than 10.
    """
    probabilities = mlp.predict_proba([entry])[0]
    # print(np.sum(probabilities)) #What do you think this should be? 
    # Size the x positions from the probability vector instead of assuming 10 classes.
    positions = np.arange(len(probabilities))
    plt.bar(positions, probabilities, color='k')
    # Label each bar with the class it refers to.
    plt.xticks(positions, mlp.classes_)
    plt.ylabel("probability")
    plt.xlabel("digit labels")
    plt.show()


In [None]:
# Plot the predicted class probabilities for test sample 42, then show the image itself.
plot_detection_probabilities(X_test[42])
print_heatmap(X_test[42])

# Task: High confidence vs low confidence classifications

In [None]:
#Try different entries from the test set to see if you can find a low confidence example
#What would such an example look like in the probability distribution plot?