## Week 5 - PCA and MNIST

### Heather Tweedie, 16/02/23

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import math

# TensorFlow and tf.keras
import tensorflow as tf
import keras
import keras.layers
import keras.datasets.mnist

from scipy import linalg

import matplotlib.style #Some style nonsense
import matplotlib as mpl #Some more style nonsense

# Notebook-wide matplotlib defaults.
# A fixed figure size of [12.0, 8.0] inches was tried here and left disabled.
mpl.rcParams.update({
    "legend.frameon": False,  # draw legends without a surrounding frame
    "figure.dpi": 200,        # dots per inch
})


In [None]:
# Fetch the MNIST images and labels, already split into training and testing sets
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Divide every pixel value by 255 so the network inputs are rescaled
train_images = train_images / 255.0
test_images = test_images / 255.0

# Sanity-check the sizes of what was loaded
print("Shape of training images:", train_images.shape)
print("Length of training set labels:", len(train_labels))
print("First label:", train_labels[0])
print("Shape of testing images:", test_images.shape)
print("Length of testing set labels:", len(test_labels))

# Extent of a single image along its two axes
image_x = train_images.shape[1]
image_y = train_images.shape[2]


Define, compile and train model on MNIST dataset:

In [None]:
# Fully connected classifier: flatten each 28x28 image, pass it through one
# hidden ReLU layer, and emit one logit per class. MNIST labels are the ten
# digits 0-9, so the output layer needs exactly 10 units.
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10),
])

# The output layer has no softmax, so the loss is told to expect raw logits.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.SGD(learning_rate=1.0),
    metrics=['accuracy'],
)

# Train on the full training set; the test set is evaluated after every epoch.
history = model.fit(
    train_images,
    train_labels,
    batch_size=100,
    epochs=15,
    validation_data=(test_images, test_labels),
)

Define functions to be used in exercise:

In [None]:
def get_components(images):
    """
    Decomposes a set of images into PCA components.

    Each image is flattened to a vector, the mean image is subtracted, and the
    covariance matrix of the centred data is diagonalised.

    Params:
        images: the set of images to be decomposed, as an array whose first
            axis indexes the images (e.g. shape (num_images, 28, 28))

    Returns:
        vecs: eigenvectors of the covariance matrix stored column-wise, such
            that vecs[:,i] is the eigenvector corresponding to vals[i]
        vals: 1-D array of the eigenvalues (variances) in descending order
        x: the flattened, mean-subtracted images, shape (num_images, num_pixels)
        mu: the mean flattened image, shape (num_pixels,)
    """

    images = np.asarray(images)
    num_images = images.shape[0]

    # One row per image; -1 infers the pixel count so any image size works
    X = np.reshape(images, (num_images, -1))
    mu = np.mean(X, axis=0)
    x = X - mu
    rho = np.cov(x, rowvar=False)

    # Get the eigenvalues and vectors.
    # vals is a 1-D array of the eigenvalues in ascending order, vecs is a
    # columnwise array of the normalised eigenvectors such that vecs[:,i] is
    # the eigenvector corresponding to vals[i]
    vals, vecs = linalg.eigh(rho)

    # Reorder to descending variance. Only the column order of vecs may be
    # reversed: flipping the rows as well (np.flip with no axis) would reverse
    # the pixel order inside every eigenvector and break the decomposition.
    vecs = vecs[:, ::-1]
    vals = vals[::-1]

    return vecs, vals, x, mu

Decompose MNIST dataset into PCA components:

In [None]:
# PCA of the training images
vecs_train, vals_train, x_train, mu_train = get_components(train_images)

# One eigenvalue per PCA component; plot each against its position in the ordering
print(vals_train.shape)
index = np.arange(len(vals_train))

fig, ax = plt.subplots()
ax.plot(index, np.real(vals_train), ".")
ax.set(
    xlabel="PCA Index",
    ylabel="Variance",
    title='Change in variance with cumulative PCA components',
)
ax.grid()

Test the model against a series of test datasets, each reconstructed from only the first N PCA components, and plot the test accuracy as a function of N.

In [None]:
# PCA of the test images: eigenvectors, eigenvalues, mean-subtracted flattened
# images and the mean image, all computed from the test set itself
vecs_test, vals_test, x_test, mu_test = get_components(test_images)

In [None]:
def test_accuracy(model):
    """
    Tests a model against test images rebuilt from a limited number of PCA
    components, then plots test accuracy against the number of components.

    For each N in a fixed list, the test images are reconstructed from only
    their first N PCA components (using the module-level x_test, vecs_test and
    mu_test) and the model is evaluated on them against test_labels.

    Params:
        model: the model to be tested; its evaluate() must return
            (loss, accuracy)
    """

    N = [2, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 784]

    # The projection onto the PCA basis does not depend on N, so do it once
    P = np.dot(x_test, vecs_test)
    # Per-image shape, so reconstructions are fed in the same layout as the
    # original (and the reshaped training) images rather than as flat vectors
    image_shape = tuple(test_images.shape[1:])

    test_accuracies = np.empty(len(N))
    for i, n_components in enumerate(N):
        # Rebuild from the first n_components only, then add the mean back
        new_test = np.dot(P[:, :n_components], vecs_test.T[:n_components, :]) + mu_test
        new_test = np.reshape(new_test, (-1,) + image_shape)
        _, test_acc = model.evaluate(new_test, test_labels, verbose=2)
        test_accuracies[i] = test_acc

    fig, ax = plt.subplots()
    ax.plot(N, test_accuracies)
    ax.set_xlabel('Number of PCA components used (first N)')
    ax.set_ylabel('Test accuracy')
    ax.set_title('Number of PCA components vs test accuracy')
    ax.grid()

In [None]:
# Evaluate the model trained on the unmodified training images
test_accuracy(model)

Retrain network on best 100 PCA components:

In [None]:
# create new dataset: project the centred training images onto the PCA basis,
# rebuild them from the first 100 components only, and add the mean back
P = np.dot(x_train, vecs_train)
new_training_100 = (np.dot(P[:,0:100], vecs_train.T[0:100,:])) + mu_train
# restore the original image layout (taken from the data, not hard-coded)
new_training_100 = np.reshape(new_training_100, train_images.shape)

# define, compile and train new model on new training dataset
# (one output logit per class: MNIST labels are the ten digits 0-9)
model_100 = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10),
])

# the output layer has no softmax, so the loss is told to expect raw logits
model_100.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.SGD(learning_rate=1.0),
    metrics=['accuracy'],
)

# validation after each epoch uses the unmodified test images
history = model_100.fit(
    new_training_100,
    train_labels,
    batch_size=100,
    epochs=15,
    validation_data=(test_images, test_labels),
)

test_accuracy(model_100)

Retrain network on best 20 PCA components:

In [None]:
# create new dataset: project the centred training images onto the PCA basis,
# rebuild them from the first 20 components only, and add the mean back
P = np.dot(x_train, vecs_train)
new_training_20 = (np.dot(P[:,0:20], vecs_train.T[0:20,:])) + mu_train
# restore the original image layout (taken from the data, not hard-coded)
new_training_20 = np.reshape(new_training_20, train_images.shape)

# define, compile and train new model
# (one output logit per class: MNIST labels are the ten digits 0-9)
model_20 = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10),
])

# the output layer has no softmax, so the loss is told to expect raw logits
model_20.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.SGD(learning_rate=1.0),
    metrics=['accuracy'],
)

# validation after each epoch uses the unmodified test images
history = model_20.fit(
    new_training_20,
    train_labels,
    batch_size=100,
    epochs=15,
    validation_data=(test_images, test_labels),
)

test_accuracy(model_20)