# MNIST

In [1]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

## Global config

In [None]:
# TODO

## Load data

In [2]:
mnist_28x28_train = np.load("mnist_28x28_train.npy")
mnist_8x8_train = np.load("mnist_8x8_train.npy")
train_labels = np.load("train_labels.npy")

mnist_28x28_test = np.load("mnist_28x28_test.npy")
mnist_8x8_test = np.load("mnist_8x8_test.npy")

## Helpers

In [3]:
# TODO
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

def k_fold_fit_and_evaluate(X, y, model, scoring_method, n_splits=5):
    # define evaluation procedure
    cv = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    # evaluate model
    scores = cross_validate(model, X, y, scoring=scoring_method, cv=cv, n_jobs=-1)
    
       
    return scores["test_score"]

scoring_method = make_scorer(lambda prediction, true_target: f1_score(prediction, true_target, average="weighted"))

In [20]:
models = {
    "GaussianNB": GaussianNB(),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=None, min_samples_leaf=2, random_state=42),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=3, weights="distance"),
    "SVM": SVC(C=10, kernel="poly", random_state=42),
    "LogisticRegression": LogisticRegression(C=10, random_state=42, max_iter=1000)
}

##  Data exploration

### Question 1
Hint: `plt.imshow`

In [None]:
print(np.unique(train_labels))
print("Train 8 shape: ", mnist_8x8_train.shape)
print("Train 28 shape: ", mnist_28x28_train.shape)
print("Test 8 shape: ", mnist_8x8_test.shape)
print("Test 28 shape: ", mnist_28x28_test.shape)
print("Labels size: ", len(train_labels))

In [None]:
# TODO
i = 0
for m in mnist_8x8_test:
    if i < 10:
        plt.imshow(m)
        plt.show()
        i += 1
    else:
        break
    

In [None]:
# TODO
i = 0
for m in mnist_28x28_train:
    if i < 10:
        plt.imshow(m)
        plt.show()
        i += 1
    else:
        break

##  Data Preparations

### Question 1

In [9]:
# Scale all values to between 0 and 1
# Didn't make any difference
scale_train_8 = mnist_8x8_train / 255.0
scale_test_8 = mnist_8x8_test / 255.0
scale_train_28 = mnist_28x28_train / 255.0
scale_test_28 = mnist_28x28_test / 255.0

In [10]:
from sklearn.preprocessing import Normalizer

nsam_train, nxx, nyy = scale_train_8.shape
tmp_train_8 = scale_train_8.reshape(nsam_train, nxx * nyy)
X_fit_trans_8_train = Normalizer().fit_transform(tmp_train_8)
X_reshape_8_train = X_fit_trans_8_train.reshape(nsam_train, nxx, nyy)

nsam_train, nxx, nyy = scale_test_8.shape
tmp_test_8 = scale_test_8.reshape(nsam_train, nxx * nyy)
X_fit_trans_8_test = Normalizer().fit_transform(tmp_test_8)
X_reshape_8_test = X_fit_trans_8_test.reshape(nsam_train, nxx, nyy)

nsam_train, nxx, nyy = scale_train_28.shape
tmp_train_28 = scale_train_28.reshape(nsam_train, nxx * nyy)
X_fit_trans_28_train = Normalizer().fit_transform(tmp_train_28)
X_reshape_28_train = X_fit_trans_28_train.reshape(nsam_train, nxx, nyy)

nsam_train, nxx, nyy = scale_test_28.shape
tmp_test_28 = scale_test_28.reshape(nsam_train, nxx * nyy)
X_fit_trans_28_test = Normalizer().fit_transform(tmp_test_28)
X_reshape_28_test = X_fit_trans_28_test.reshape(nsam_train, nxx, nyy)

## Experiments

### Question 1

In [11]:
# Split data
from sklearn.model_selection import train_test_split

X8_train, X8_test, y8_train, y8_test = train_test_split(X_reshape_8_train, 
                                train_labels, test_size=0.1, random_state=42, shuffle=True, stratify=train_labels)

X28_train, X28_test, y28_train, y28_test = train_test_split(X_reshape_28_train, 
                                train_labels, test_size=0.1, random_state=42, shuffle=True, stratify=train_labels)

### Question 2

In [15]:
from sklearn.metrics import f1_score, accuracy_score
from tabulate import tabulate
# TODO
def fit_predict(X_train, X_test, y_train, y_test):
    table = []
    for name, model in models.items():
        nsamples_train, nx, ny = X_train.shape
        X_flat_train = X_train.reshape((nsamples_train,nx*ny))

        nsamples_test, nx, ny = X_test.shape
        X_flat_test = X_test.reshape((nsamples_test,nx*ny))

        model.fit(X_flat_train, y_train)
        pred = model.predict(X_flat_test)
        f = f1_score(pred, y_test, average='weighted')
        acc = accuracy_score(pred, y_test)
        
        table.append([name, f, acc])
    return X_flat_train, table

In [16]:
# k-fold
def kfold_fit_eval(X_kfold_train, y_train):
    table2 = []
    for name, model in models.items():
        tmp_acc = k_fold_fit_and_evaluate(X_kfold_train, y_train, model, scoring_method)
        m_acc = np.mean(tmp_acc)
        std_acc = np.std(tmp_acc)
        table2.append([name, m_acc, std_acc])
    return table2

In [21]:
X_kfold_train, table = fit_predict(X8_train, X8_test, y8_train, y8_test)
table2 = kfold_fit_eval(X_kfold_train, y8_train)
print(tabulate(table, headers=['Name', 'F1', 'Accuracy']))
print(tabulate(table, headers=['Name', 'Mean', 'Std_Acc']))

Name                          F1    Accuracy
----------------------  --------  ----------
GaussianNB              0.663124    0.626667
DecisionTreeClassifier  0.762462    0.762667
KNeighborsClassifier    0.930339    0.930667
SVM                     0.95991     0.96
LogisticRegression      0.898203    0.898667
Name                        Mean    Std_Acc
----------------------  --------  ---------
GaussianNB              0.663124   0.626667
DecisionTreeClassifier  0.762462   0.762667
KNeighborsClassifier    0.930339   0.930667
SVM                     0.95991    0.96
LogisticRegression      0.898203   0.898667


In [22]:
X_kfold_train_28, table = fit_predict(X28_train, X28_test, y28_train, y28_test)
table2 = kfold_fit_eval(X_kfold_train_28, y28_train)
print(tabulate(table, headers=['Name', 'F1', 'Accuracy']))
print(tabulate(table, headers=['Name', 'Mean', 'Std_Acc']))

Name                          F1    Accuracy
----------------------  --------  ----------
GaussianNB              0.623049    0.581333
DecisionTreeClassifier  0.723819    0.72
KNeighborsClassifier    0.936158    0.936
SVM                     0.949388    0.949333
LogisticRegression      0.89589     0.896
Name                        Mean    Std_Acc
----------------------  --------  ---------
GaussianNB              0.623049   0.581333
DecisionTreeClassifier  0.723819   0.72
KNeighborsClassifier    0.936158   0.936
SVM                     0.949388   0.949333
LogisticRegression      0.89589    0.896


### Question 3

### Question 4 

In [None]:
#TODO

### Question 5

In [None]:
#TODO

### Question 6

In [None]:
#TODO
prediction = np.array([-1] * len(mnist_8x8_test)) #TODO replace this with you own prediction
pd.DataFrame(prediction).to_csv("GROUP_classes_problem_mnist.txt", index=False, header=False)

In [None]:
# Normalize data - the pixels are already well seperated, quality improves by a lot in 28x28 compared to 8x8. 
# A simple normaliztion should be sufficient.
from sklearn.preprocessing import Normalizer

# sc = NDNormalizer()
# x8_train = sc.transform(sc.fit(mnist_8x8_train))
# x28_train = sc.transform(sc.fit(mnist_28x28_train))

sc = Normalizer()
X8_transform_train = np.full_like(mnist_8x8_train, fill_value=0)
X8_transform_test = np.full_like(mnist_8x8_test, fill_value=0)
# x8_test = np.full_like(mnist_8x8_test, fill_value=0)
# x28_test = np.full_like(mnist_28x28_test, fill_value=0)

for i in range(mnist_8x8_train.shape[1]):
    X8_transform_train[:, i, :] = sc.fit_transform(mnist_8x8_train[:, i, :]) 

for i in range(mnist_8x8_test.shape[1]):
    X8_transform_test[:, i, :] = sc.transform(mnist_8x8_test[:, i, :])
        
# for i in range(mnist_28x28_train.shape[1]):
#     x28_train[:, i, :] = sc.fit_transform(mnist_28x28_train[:, i, :]) 

# for i in range(mnist_28x28_test.shape[1]):
#     x28_test[:, i, :] = sc.transform(mnist_28x28_test[:, i, :])

i = 0
for m in X8_transform_train:
    if i < 10:
        plt.imshow(m)
        plt.show()
        i += 1
    else:
        pass

In [None]:
from sklearn.base import TransformerMixin
from sklearn.preprocessing import Normalizer


class NDNormalizer(TransformerMixin):
    def __init__(self, **kwargs):
        self._scaler = Normalizer(copy=True, **kwargs)
        self._orig_shape = None

    def fit(self, X, **kwargs):
        X = np.array(X)
        # Save the original shape to reshape the flattened X later
        # back to its original shape
        if len(X.shape) > 1:
            self._orig_shape = X.shape[1:]
        X = self._flatten(X)
        self._scaler.fit(X, **kwargs)
        return self

    def transform(self, X, **kwargs):
        X = np.array(X)
        X = self._flatten(X)
        X = self._scaler.transform(X, **kwargs)
        X = self._reshape(X)
        return X

    def _flatten(self, X):
        # Reshape X to <= 2 dimensions
        if len(X.shape) > 2:
            n_dims = np.prod(self._orig_shape)
            X = X.reshape(-1, n_dims)
        return X

    def _reshape(self, X):
        # Reshape X back to it's original shape
        if len(X.shape) >= 2:
            X = X.reshape(-1, *self._orig_shape)
        return X

In [None]:
def remove_constant_pixels(pixels_df):
    """Removes from the images the pixels that have a constant intensity value,
    either always black (0) or white (255)
    Returns the cleared dataset & the list of the removed pixels (columns)"""

    #Remove the pixels that are always black to compute faster
    changing_pixels_df = pixels_df.loc[:]
    dropped_pixels_b = []

    #Pixels with max value =0 are pixels that never change
    for col in pixels_df:
        if changing_pixels_df[col].max() == 0:
            changing_pixels_df.drop(columns=[col], inplace=True)
            dropped_pixels_b.append(col)
    print("Constantly black pixels that have been dropped: {}".format(dropped_pixels_b))


    #Same with pixels with min=255 (white pixels)
    dropped_pixels_w = []
    for col in changing_pixels_df:
        if changing_pixels_df[col].min() == 255:
            changing_pixels_df.drop(columns=[col], inplace=True)
            dropped_pixel_w.append(col)
    print("\n Constantly white pixels that have been dropped: {}".format(dropped_pixels_b))

    print(changing_pixels_df.head())
    print("Remaining pixels: {}".format(len(changing_pixels_df.columns)))
    print("Pixels removed: {}".format(784-len(changing_pixels_df.columns)))
    
    return changing_pixels_df, dropped_pixels_b + dropped_pixels_w

In [None]:
rm_const_train_8, dropped_pixels = remove_constant_pixels(mnist_8x8_train)
rm_const_train_28, dropped_pixels = remove_constant_pixels(mnist_28x28_train)
rm_const_test_8, dropped_pixels = remove_constant_pixels(mnist_8x8_test)
rm_const_test_28, dropped_pixels = remove_constant_pixels(mnist_28x28_test)

In [None]:
# Rescale and convert to black and white
np.seterr(divide='ignore', invalid='ignore')

def rescale_decolourize(image):
    #print(image)
    pmin, pmax = image.min(), image.max()
    rescaled_image = 255 * (image - pmin) / (pmax - pmin)
    rescaled_pixels = rescaled_image
    print(rescaled_pixels)

    i = 0
    for m in rescaled_pixels:
        if i < 10:
            plt.imshow(m)
            plt.show()
            i += 1
        else:
            pass

    # Only black or white pixels
    for i in range(len(image[0])):
        for j, x in enumerate(rescaled_pixels[i]):
            image[i][j] = 0 if x < 128 else 255
    return image
    # return image.apply(lambda x: 0 if x<128 else 255)
    # bw_image = image.values.reshape((8,8))
    # return bw_image

In [None]:
resize_8_train = np.full_like(mnist_8x8_train, fill_value=0)
for i, pic in enumerate(mnist_8x8_train):
    print(mnist_8x8_train[0])
    resize_8_train[i] = rescale_decolourize(pic)
    
resize_8_test = np.full_like(mnist_8x8_test, fill_value=0)
for i, pic in enumerate(mnist_8x8_test):
    resize_8_test[i] = rescale_decolourize(pic)  
    
# i = 0
# for m in resize_8_train:
#     if i < 10:
#         plt.imshow(m)
#         plt.show()
#         i += 1
#     else:
#         pass