# MNIST Hyper Parameter Test

First, import modules. This includes<br>numpy for computations<br>pandas for data handling<br>matplotlib for display<br>scikit learn for machine learning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib
from sklearn.svm import SVC
from tqdm import tqdm

%matplotlib inline

sklearn.datasets.fetch_mldata imports datasets found in scikitlearn's database.

In [2]:
#Import data

mnist = fetch_mldata('MNIST original')

Separate the image and labels. The image comes unrolled into a 784 points (28x28 unrolled).

In [3]:
#Separate data and labels

mnist_data = mnist['data']
mnist_target = mnist['target']

Scaling the data to have inputs between -1 and 1 helps improve the model's accuracy.

In [4]:
#Scale mnist data

scaler = StandardScaler()
scaler.fit(mnist_data)
mnist_data_scaled = scaler.transform(mnist_data)



The dataset is split in 80% training data and 20% test data. The dataset is stratified during splitting so the training data is evenly distributed among the target values.

In [5]:
#Stratified split of data set to train and test. Allot 20% for test sey

X_train, X_test, y_train, y_test = train_test_split(mnist_data_scaled, 
                                                    mnist_target, 
                                                    stratify=mnist_target, 
                                                    test_size=0.2,
                                                    random_state = 42)

Run a loop to identify which combination of parameters returns the highest accuracy measure.

In [6]:
#Initiate K Nearest Neighbors model

knn_clf_unif_p1 = KNeighborsClassifier(n_neighbors = 10, weights = "uniform", p=1)

In [7]:
#Train model

knn_clf_unif_p1.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=1,
           weights='uniform')

Predict based on models

In [8]:
#Predict 
y_pred_unif_p1=[]
for i in tqdm(range(0,X_test.shape[0])):
    y_pred_unif_p1.append(knn_clf_unif_p1.predict(X_test[i,:].reshape(1,-1)))

100%|████████████████████████████████████████████████████████████████████████████| 14000/14000 [16:46<00:00, 14.69it/s]


In [9]:
#Score

acc_unif_p1 = accuracy_score(y_pred_unif_p1, y_test)
acc_unif_p1

0.9555

In [10]:
svc_try = SVC(random_state = 42, max_iter = 1000, verbose = True,
              gamma = 0.001, C = 100)

In [11]:
svc_try.fit(X_train,y_train)

[LibSVM]



SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=1000, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=True)

In [12]:
y_pred_svc = []
for i in tqdm(range(0,X_test.shape[0])):
    y_pred_svc.append(svc_try.predict(X_test[i,:].reshape(1,-1)))

100%|████████████████████████████████████████████████████████████████████████████| 14000/14000 [03:14<00:00, 71.99it/s]


In [13]:
print("SVC: ", accuracy_score(y_pred_svc,y_test))

SVC:  0.9705
