<a href="https://colab.research.google.com/github/sdgroeve/EuBIC2022_workshop_ML/blob/main/mnist_digit_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MNIST digit classification


In [None]:
# Importing modules
import pandas as pd
import numpy as np
#import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

### Load the MNIST dataset with Scikit-learn

In [None]:
# Importing data
from sklearn.datasets import fetch_openml

# as_frame=True: the cells below index `mnist.data` / `mnist.target` with
# `.iloc`, which requires pandas objects rather than NumPy arrays.
# version=1: pin the dataset version instead of relying on the "active" default.
# cache=False: do not keep the downloaded files in scikit-learn's data home.
mnist = fetch_openml('mnist_784', version=1, as_frame=True, cache=False)

### The features: pixel values

In [None]:
# Display the feature table returned by fetch_openml.
mnist.data

In [None]:
# Lay the first sample out as a 28x28 grid and print every pixel value in its cell.
first_sample = mnist.data.iloc[0]
img = np.resize(np.array(first_sample), (28, 28))

heatmap_options = dict(annot=True, fmt='.1f', square=True, cmap="YlGnBu")
plt.figure(figsize=(20, 20))
sns.heatmap(img, **heatmap_options)
plt.show()

In [None]:
# Viewing the images
import math

index_to_view = 0       # position of the first sample to display
number_of_images = 6    # how many consecutive samples to display
images_per_row = 3      # maximum number of images on one grid row


def view_image(ds, index_to_view, cmap="gray"):
    """Draw one row of `ds` as a 28x28 image on the current matplotlib axes.

    Parameters
    ----------
    ds : object supporting positional `.iloc[...]` row access
        Table whose rows hold the pixel values of one image each.
    index_to_view : int
        Position of the row to draw.
    cmap : str, default "gray"
        Colormap handed to `plt.imshow`.
    """
    image = np.array(ds.iloc[index_to_view])
    image = np.resize(image, (28, 28))
    plt.imshow(image, cmap=cmap)


# Size the grid so that every requested image gets a slot: fill rows of at
# most `images_per_row` images and round the row count up.
n_cols = max(1, min(images_per_row, number_of_images))
n_rows = math.ceil(number_of_images / n_cols)

plt.figure(figsize=(10, 10))
for i in range(number_of_images):
    plt.subplot(n_rows, n_cols, i + 1)
    view_image(mnist.data, index_to_view + i)
    # Show the target value stored for this sample above its image.
    plt.title(str(mnist.target.iloc[index_to_view + i]))

plt.tight_layout()

### Feature normalization

In [None]:
# Same heatmap as for the raw pixels, but with every value divided by 255 first.
scaled_pixels = np.array(mnist.data.iloc[0]) / 255
img = np.resize(scaled_pixels, (28, 28))

heatmap_options = dict(annot=True, fmt='.1f', square=True, cmap="YlGnBu")
plt.figure(figsize=(20, 20))
sns.heatmap(img, **heatmap_options)
plt.show()

### Splitting the data

In [None]:
# The first `n_train` rows form the training set and the remaining rows the
# test set; pixel values are divided by 255 in both parts.
n_train = 60000
X_train_ds, X_test_ds = mnist.data.iloc[:n_train, :] / 255, mnist.data.iloc[n_train:, :] / 255
y_train_ds, y_test_ds = mnist.target.iloc[:n_train], mnist.target.iloc[n_train:]

### Exploring the labels

In [None]:
# Count how often each label occurs in the training targets and plot the
# counts as bars, ordered by label.
y_train_valuec = y_train_ds.value_counts().sort_index()
ax = y_train_valuec.plot.bar()
ax.set_xlabel("Label")
ax.set_ylabel("Count")

### Let's focus on binary classification

In [None]:
# Binary target: 1 where the training label equals `to_predict`, 0 otherwise.
to_predict = '8'

y_train_label = [int(label == to_predict) for label in y_train_ds]

### Fit a logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# `fit` returns the estimator itself, so construction and training are chained.
# (Another classifier, e.g. a RandomForestClassifier, could be swapped in here.)
model = LogisticRegression().fit(X_train_ds, y_train_label)
print(model)

In [None]:
# Predict a binary label for every test sample with the fitted model.
predictions = model.predict(X_test_ds)
print(predictions)

In [None]:
from sklearn.metrics import accuracy_score

# Encode the test targets as 1 where the label equals `to_predict`, else 0.
y_test_label = [int(label == to_predict) for label in y_test_ds]

test_accuracy = accuracy_score(y_test_label, predictions)
print(test_accuracy)

### How well does a naive baseline that always predicts "not 8" perform?

In [None]:
# Baseline: predict the negative class (0) for every test sample.
predictions_baseline = [0 for _prediction in predictions]
baseline_accuracy = accuracy_score(y_test_label, predictions_baseline)
print(baseline_accuracy)

### Better metrics for evaluation

In [None]:
# Class probability estimates for the test samples; column 1 is used as the
# score for the ROC and precision-recall curves in the next cell.
predictions_proba = model.predict_proba(X_test_ds)
print(predictions_proba)

In [None]:
from sklearn.metrics import (
    PrecisionRecallDisplay,
    RocCurveDisplay,
    precision_recall_curve,
    roc_curve,
)

# Score each test sample with the probability in column 1 and treat
# `model.classes_[1]` as the positive label for both curves.
y_score = predictions_proba[:, 1]
positive_label = model.classes_[1]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

# Left panel: ROC curve.
fpr, tpr, _ = roc_curve(y_test_label, y_score, pos_label=positive_label)
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr)
roc_display.plot(ax=ax1)

# Right panel: precision-recall curve.
prec, recall, _ = precision_recall_curve(y_test_label, y_score, pos_label=positive_label)
pr_display = PrecisionRecallDisplay(precision=prec, recall=recall)
pr_display.plot(ax=ax2)

plt.show()

In [None]:
import sklearn.metrics as metrics

# Print recall, then precision, then (after a blank line) the F1 score of the
# binary predictions.
for scorer in (metrics.recall_score, metrics.precision_score):
    print(scorer(y_test_label, predictions))
print()
print(metrics.f1_score(y_test_label, predictions))

### The model parameter values

In [None]:
# Arrange the first row of fitted coefficients as a 28x28 grid so the weights
# can be viewed as an image.
weights = model.coef_[0]
image = np.resize(weights, (28, 28))
plt.imshow(image, cmap="gray")
plt.show()

### Prediction performance for all classes

In [None]:
from sklearn.base import clone

# One-vs-rest F1 score for every digit.
# Each digit is fitted on its own unfitted copy of `model` and uses loop-local
# variable names, so the fitted binary model and the `y_train_label`,
# `y_test_label` and `predictions` values from the earlier cells stay intact
# and those cells can be re-run with consistent results.
for label in range(10):
    digit = str(label)  # targets are compared as strings, like `to_predict`
    y_train_digit = [1 if x == digit else 0 for x in y_train_ds]
    y_test_digit = [1 if x == digit else 0 for x in y_test_ds]
    digit_model = clone(model).fit(X_train_ds, y_train_digit)
    digit_predictions = digit_model.predict(X_test_ds)
    print("%i %f" % (label, metrics.f1_score(y_test_digit, digit_predictions)))

### Multi-class classification

In [None]:
# Refit the same estimator on the original multi-valued targets (this replaces
# its previous fit), then predict a label for every test sample.
predictions = model.fit(X_train_ds, y_train_ds).predict(X_test_ds)
print(predictions)

In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of `y_true` versus `y_pred` as an annotated heatmap.

    Rows correspond to true labels and columns to predicted labels. Both axes
    are ticked with the actual label values rather than positional indices.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as `y_true`.
    """
    # Sorted union of all labels that occur in either input. It is passed to
    # confusion_matrix explicitly so matrix rows/columns and tick labels are
    # guaranteed to be in the same order.
    labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )
    mtx = confusion_matrix(y_true, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(
        mtx,
        annot=True,
        fmt='d',
        linewidths=.5,
        cbar=False,
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    # Label the axes object that was drawn on instead of the pyplot "current axes".
    ax.set_ylabel('true label')
    ax.set_xlabel('predicted label')

plot_confusion_matrix(y_test_ds, predictions)

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the multi-class predictions.
print(classification_report(y_test_ds, predictions))

### Example predictions

In [None]:
from random import randint

# Class probability estimates for every test sample.
predictions_proba = model.predict_proba(X_test_ds)

# Pick one test sample at random; re-run the cell to look at another one.
idx = randint(0, len(X_test_ds) - 1)

# Index the probabilities by `model.classes_` so each bar is labelled with the
# class it belongs to instead of its column position.
ax = pd.Series(predictions_proba[idx], index=model.classes_).plot.bar()
ax.set_xlabel("Class")
ax.set_ylabel("Predicted probability")
plt.show()

# Show the sample itself together with its true label, so the predicted
# probabilities above can be checked against it.
image = np.resize(X_test_ds.iloc[idx, :], (28, 28))
plt.imshow(image, "gray")
plt.title("True label: %s" % y_test_ds.iloc[idx])
plt.show()