# MNIST - Solving a Classification problem

First, let's make sure this notebook works in both Python 2 and Python 3, import a few common modules, make Matplotlib render figures inline, and define a helper function that saves figures:

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# Seed NumPy's global random generator so that this notebook's output is stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


from sklearn.model_selection import cross_val_score, cross_val_predict


# Font sizes applied to the axis labels and tick labels of every figure
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Figures are saved under <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    """Save the current Matplotlib figure as a 300-dpi PNG.

    The file is written to ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``.
    The target directory is created when it does not exist yet, so the first
    call in a fresh working directory does not fail inside ``plt.savefig``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file, without the ``.png`` extension.
    tight_layout : bool, default True
        Whether to call ``plt.tight_layout()`` before saving.
    """
    out_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # os.makedirs(..., exist_ok=True) is Python 3 only and this notebook also
    # targets Python 2, so test for the directory explicitly.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    path = os.path.join(out_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [None]:
## suppress warnings
import warnings; warnings.simplefilter('ignore')

# Fetch the MNIST dataset

In [None]:
#from sklearn.datasets import fetch_mldata
#mnist = fetch_mldata('MNIST original')
#mnist

In [None]:
# Alternate method of loading data: read the feature matrix and the labels
# from whitespace-delimited text files with np.loadtxt.
# NOTE(review): both paths are absolute ('/Data/...'); confirm this location
# exists on the machine running the notebook.

X=np.loadtxt('/Data/X.txt')
y=np.loadtxt('/Data/y.txt')

In [None]:
#X, y = mnist["data"], mnist["target"]
# Dimensions of the feature array; each row is reshaped to a 28x28 image further down
X.shape

In [None]:
# Dimensions of the label array
y.shape

In [None]:
# Display the label array itself
y

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap = matplotlib.cm.binary,
           interpolation="nearest")
plt.axis("off")

#save_fig("some_digit_plot")
plt.show()

In [None]:
def plot_digit(data):
    """Draw one digit on the current axes, with the axes hidden.

    ``data`` is reshaped to 28x28 before being shown with the binary colormap.
    """
    plt.imshow(
        data.reshape(28, 28),
        interpolation="nearest",
        cmap=matplotlib.cm.binary,
    )
    plt.axis("off")

In [None]:
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    """Draw a collection of 28x28 digit images as one tiled grid on the current axes.

    Parameters
    ----------
    instances : sequence of arrays
        Each element must be reshapeable to ``(28, 28)``.
    images_per_row : int, default 10
        Maximum number of digits per grid row; capped at ``len(instances)``.
    **options
        Extra keyword arguments forwarded to ``plt.imshow``. ``cmap`` defaults
        to ``matplotlib.cm.binary`` and may be overridden here.

    Notes
    -----
    An empty ``instances`` (for example a confusion-matrix cell that holds no
    samples) leaves the axes blank instead of raising ``ZeroDivisionError``.
    """
    size = 28
    n_instances = len(instances)
    if n_instances == 0:
        # Nothing to tile: images_per_row would become 0 and the row-count
        # computation below would divide by zero.
        plt.axis("off")
        return
    images_per_row = min(n_instances, images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (n_instances - 1) // images_per_row + 1
    # Pad the last row with a blank strip so that every row has the same width.
    n_empty = n_rows * images_per_row - n_instances
    images.append(np.zeros((size, size * n_empty)))
    row_images = [
        np.concatenate(images[row * images_per_row:(row + 1) * images_per_row], axis=1)
        for row in range(n_rows)
    ]
    image = np.concatenate(row_images, axis=0)
    # setdefault lets a caller pass its own cmap without a duplicate-keyword TypeError.
    options.setdefault("cmap", matplotlib.cm.binary)
    plt.imshow(image, **options)
    plt.axis("off")

## Get a Sample of the Digits

In [None]:
# Collect samples taken at regular strides from three index ranges of X
example_images = np.concatenate(
    [X[:12000:600], X[13000:30600:600], X[30600:60000:590]],
    axis=0,
)

plt.figure(figsize=(9, 9))
plot_digits(example_images, images_per_row=10)
plt.show()

## Split into Training and Test Sets

In [None]:
# The first 60000 rows form the training set; everything after is the test set
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

### Let's shuffle the training set

In [None]:
import numpy as np

# Apply one random permutation to both arrays so every image keeps its label.
# The permutation length comes from the data rather than a hard-coded 60000,
# so the cell keeps working if the training set has a different size.
shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Binary classifier - SGDClassifier

In [None]:
# Binary targets for the "5 detector": 1 where the label equals 5, 0 otherwise
y_train_5 = (y_train == 5).astype(int)
y_test_5 = (y_test == 5).astype(int)

In [None]:
import pandas as pd
# Put the original labels next to the binary targets and show 10 randomly
# sampled rows (fixed random_state, so the same rows are shown on every run)
cmp = pd.DataFrame({'y_train': y_train, 'y_train_5': y_train_5})
cmp.sample(n=10, random_state=32)

In [None]:
from sklearn.linear_model import SGDClassifier

# Fit an SGD classifier on the binary 5 / not-5 targets
sgd_clf = SGDClassifier(max_iter=15, random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [None]:
# Predict the binary label for every test sample
y_test_pred = sgd_clf.predict(X_test)
#y_train_pred = sgd_clf.predict(X_train)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted binary labels on the test set;
# the next cell reads its entries as [[TN, FP], [FN, TP]]
cm = confusion_matrix(y_test_5, y_test_pred)
print(cm)

In [None]:
# Calculating Recall, etc
# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = cm.ravel()
test_size = float(TP+TN+FN + FP)

print('TEST DATA RESULTS - SGD')
print('Test Set Size ', test_size)

# Share of all test samples that were classified correctly
accuracy = (TP+TN) / test_size
print('Accuracy ', round(accuracy, 2))

# Share of actual 5s that were detected
sensitivity = TP / float(FN + TP)
print('Sensitivity or recall(5 Predictions):',round(sensitivity, 2))

# Share of actual non-5s that were rejected
specificity = TN / (TN + FP)
print('specificity(not 5 predictions)',round(specificity,2))

# Share of predicted 5s that really are 5s
precision = TP / float(TP + FP)
print('precision', round(precision, 2))


# Getting classification report
from sklearn.metrics import classification_report

report = classification_report(y_test_5,y_test_pred,target_names=['Non 5','5s'])
print(report)

## Precision and Recall Score - Training vs Test

In [None]:
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score

# The training-set predictions are required for the train-vs-test comparison.
# They were never computed (the only earlier assignment is commented out), so
# on a fresh kernel this cell failed with a NameError on y_train_pred.
y_train_pred = sgd_clf.predict(X_train)

print('accuracy on training ', accuracy_score(y_train_5, y_train_pred))
print('accuracy on test ', accuracy_score(y_test_5, y_test_pred))

print('precision on training ', precision_score(y_train_5, y_train_pred))
print('precision on test ', precision_score(y_test_5, y_test_pred))

print('recall on training ', recall_score(y_train_5, y_train_pred))
print('recall on test ', recall_score(y_test_5, y_test_pred))


print('f1 on training ', f1_score(y_train_5, y_train_pred))
print('f1 on test ', f1_score(y_test_5, y_test_pred))

In [None]:
import seaborn as sns

# The matrix is transposed so that the x axis carries the true label and the
# y axis the predicted label
ax = sns.heatmap(
    cm.T,
    square=True, annot=True, fmt='d', cbar=False, cmap="Blues",
    xticklabels=['Not 5','5'],
    yticklabels=['Not 5', '5 '],
)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label');

### Display the Confusion Matrix - Visually

In [None]:
cl_a, cl_b = 1, 0

# Test images split by (actual, predicted) class
X_aa = X_test[(y_test_5 == cl_a) & (y_test_pred == cl_a)]  # TP
X_ab = X_test[(y_test_5 == cl_a) & (y_test_pred == cl_b)]  # FN
X_ba = X_test[(y_test_5 == cl_b) & (y_test_pred == cl_a)]  # FP
X_bb = X_test[(y_test_5 == cl_b) & (y_test_pred == cl_b)]  # TN

# 2x2 panel: TN and FP on the top row, FN and TP on the bottom row;
# at most 25 examples are shown per panel
plt.figure(figsize=(8, 8))
for position, digits in ((221, X_bb), (222, X_ba), (223, X_ab), (224, X_aa)):
    plt.subplot(position)
    plot_digits(digits[:25], images_per_row=5)

plt.show()

# Multiclass classification - using Random Forest


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the original digit labels (y_train), not on the binary y_train_5
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
# forest_clf.predict([some_digit])

In [None]:
# forest_clf.predict_proba([some_digit])

In [None]:
# y_train_pred_forest = forest_clf.predict(X_train)
# Digit predicted by the forest for every test sample
y_test_pred_forest = forest_clf.predict( X_test)

In [None]:
# Confusion matrix of true digit vs. digit predicted by the forest on the test set
matrix=confusion_matrix(y_test, y_test_pred_forest)
matrix

In [None]:
import seaborn as sns

# Transposed so that the x axis is the true label and the y axis the predicted label
ax = sns.heatmap(
    matrix.T,
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    cmap="Blues",
)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label');

In [None]:
# Overall accuracy = correctly classified samples (the diagonal) / all samples.
# Both counts are read from the matrix itself rather than assuming exactly
# 10 classes and 10000 test samples.
ok = np.trace(matrix)

print('Accuracy' , ok / float(matrix.sum()))

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the forest's test predictions
print(classification_report( y_test,y_test_pred_forest))

In [None]:
# Look at how the forest separates two specific digits
cl_a, cl_b = 4, 9

actual_a, actual_b = (y_test == cl_a), (y_test == cl_b)
predicted_a, predicted_b = (y_test_pred_forest == cl_a), (y_test_pred_forest == cl_b)

X_aa = X_test[actual_a & predicted_a]  # cl_a classified as cl_a
X_ab = X_test[actual_a & predicted_b]  # cl_a classified as cl_b
X_ba = X_test[actual_b & predicted_a]  # cl_b classified as cl_a
X_bb = X_test[actual_b & predicted_b]  # cl_b classified as cl_b

# One panel per (actual, predicted) combination, up to 25 examples each
plt.figure(figsize=(8, 8))
for position, digits in ((221, X_aa), (222, X_ab), (223, X_ba), (224, X_bb)):
    plt.subplot(position)
    plot_digits(digits[:25], images_per_row=5)

plt.show()

# Binary classifier - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a new forest on the binary 5 / not-5 targets.
# This rebinds forest_clf, so the model fitted on the digit labels is no longer reachable by that name.
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train,y_train_5)


In [None]:
# Binary predictions of the forest on the training and the test set.
# y_test_pred_forest is rebound here; from this point on it holds 5 / not-5 predictions.
y_train_pred_forest = forest_clf.predict(X_train)
y_test_pred_forest = forest_clf.predict( X_test)

#precision_score(y_train_5, y_train_pred_forest)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted binary labels for the forest (rebinds cm)
cm = confusion_matrix(y_test_5, y_test_pred_forest)
print(cm)

In [None]:
# Transposed so that the x axis is the true label and the y axis the predicted label
ax = sns.heatmap(
    cm.T,
    square=True, annot=True, fmt='d', cbar=False, cmap="Blues",
    xticklabels=['Not 5','5'],
    yticklabels=['Not 5', '5 '],
)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label');

In [None]:
# Calculating Recall, etc
# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = cm.ravel()
n_samples = float(TP+TN+FN + FP)

print('TEST DATA RESULTS - RF')

# Share of all test samples that were classified correctly
accuracy = (TP+TN) / n_samples
print('Accuracy ',accuracy)

# Share of actual 5s that were detected
sensitivity = TP / float(FN + TP)
print('Sensitivity or recall:',sensitivity)

# Share of predicted 5s that really are 5s
precision = TP / float(TP + FP)
print('precision',precision)

# Share of actual non-5s that were rejected
specificity = TN / (TN + FP)
print('specificity',specificity)

print('Size ', n_samples)

In [None]:
# Getting classification report for the forest's binary test predictions,
# with the two classes labelled 'Non 5' and '5s'
from sklearn.metrics import classification_report
print(classification_report(y_test_5,y_test_pred_forest,target_names=['Non 5','5s']))

## Precision and Recall Score - Training vs Test

In [None]:
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score

# Report each metric twice: on the training set first, then on the test set
for metric_name, metric in (('accuracy', accuracy_score),
                            ('precision', precision_score),
                            ('recall', recall_score),
                            ('f1', f1_score)):
    print(metric_name + ' on training ', metric(y_train_5, y_train_pred_forest))
    print(metric_name + ' on test ', metric(y_test_5, y_test_pred_forest))

In [None]:
cl_a, cl_b = 1, 0

# Test images split by (actual, predicted) class
X_aa = X_test[(y_test_5 == cl_a) & (y_test_pred_forest == cl_a)]  # TP
X_ab = X_test[(y_test_5 == cl_a) & (y_test_pred_forest == cl_b)]  # FN
X_ba = X_test[(y_test_5 == cl_b) & (y_test_pred_forest == cl_a)]  # FP
X_bb = X_test[(y_test_5 == cl_b) & (y_test_pred_forest == cl_b)]  # TN

# 2x2 panel: TN and FP on the top row, FN and TP on the bottom row;
# at most 25 examples are shown per panel
plt.figure(figsize=(8, 8))
for position, digits in ((221, X_bb), (222, X_ba), (223, X_ab), (224, X_aa)):
    plt.subplot(position)
    plot_digits(digits[:25], images_per_row=5)

plt.show()

In [None]:
# f1_score is used below, so it is imported here too instead of relying on an
# import made by an earlier cell.
from sklearn.metrics import precision_score, recall_score, f1_score

print('precision on training ', precision_score(y_train_5, y_train_pred_forest))
print('precision on test ', precision_score(y_test_5, y_test_pred_forest))

print('recall on training ', recall_score(y_train_5, y_train_pred_forest))
print('recall on test ', recall_score(y_test_5, y_test_pred_forest))


print('f1 on training ', f1_score(y_train_5, y_train_pred_forest))
print('f1 on test ', f1_score(y_test_5, y_test_pred_forest))

### Additional material

# Multiclass classification - using SGD Classifier

In [None]:
# Refit the existing SGD classifier on the original digit labels (it was previously
# fitted on the binary targets), then predict the class of the sample picked earlier
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])

### Now there is one score per class (10 scores in total)

In [None]:
# Decision-function scores of the refitted classifier for the single sample
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores

In [None]:
# Position of the highest score
np.argmax(some_digit_scores)

In [None]:
# Class labels known to the fitted classifier
sgd_clf.classes_

In [None]:
# Class label stored at index 5 of classes_
sgd_clf.classes_[5]

## One vs One Classifiers

In [None]:
from sklearn.multiclass import OneVsOneClassifier
# Wrap a fresh SGD classifier in a one-vs-one scheme, fit it on the digit
# labels and predict the class of the sample picked earlier
ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])

In [None]:
# Number of underlying estimators held by the one-vs-one wrapper
# (presumably one per pair of classes, i.e. 45 for 10 digits; TODO confirm)
len(ovo_clf.estimators_)

## Improve Scores Using Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the training features (cast to float64 first)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))

from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of the SGD classifier on the scaled features
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

## Confusion Matrix

In [None]:
# 3-fold cross-validated predictions for the training samples, then the
# confusion matrix of true vs. predicted digit
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

In [None]:
def plot_confusion_matrix(matrix):
    """Show ``matrix`` as a color image with a colorbar on a new 8x8-inch figure."""
    figure, axes = plt.subplots(figsize=(8, 8))
    figure.colorbar(axes.matshow(matrix))

In [None]:
# Show the confusion matrix as a grayscale image and save it under the given figure id
plt.matshow(conf_mx, cmap=plt.cm.gray)
save_fig("confusion_matrix_plot", tight_layout=False)
plt.show()

In [None]:
# Divide every row by its own total so the entries become per-row fractions
# instead of raw counts (keepdims=True keeps row_sums as a column for broadcasting)
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums

In [None]:
# Zero the diagonal in place so that only the off-diagonal entries remain visible,
# then show and save the result
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
save_fig("confusion_matrix_errors_plot", tight_layout=False)
plt.show()

## Printing Out the Errors

In [None]:
# Look at how the cross-validated SGD predictions separate two specific digits
cl_a, cl_b = 3, 5

actual_a, actual_b = (y_train == cl_a), (y_train == cl_b)
predicted_a, predicted_b = (y_train_pred == cl_a), (y_train_pred == cl_b)

X_aa = X_train[actual_a & predicted_a]  # cl_a classified as cl_a
X_ab = X_train[actual_a & predicted_b]  # cl_a classified as cl_b
X_ba = X_train[actual_b & predicted_a]  # cl_b classified as cl_a
X_bb = X_train[actual_b & predicted_b]  # cl_b classified as cl_b

# One panel per (actual, predicted) combination, up to 25 examples each
plt.figure(figsize=(8, 8))
for position, digits in ((221, X_aa), (222, X_ab), (223, X_ba), (224, X_bb)):
    plt.subplot(position)
    plot_digits(digits[:25], images_per_row=5)
save_fig("error_analysis_digits_plot")
plt.show()