#### Support Vector Machine classifier

Train a SVM model on MNIST dataset to identify the digits. As SVM classifier is a binary classifier, we have to use 
one-versus-all to classify all 10 digits

In [27]:
#Import required libraries and packages

import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

In [19]:
#load dataset
mnist = fetch_mldata("MNIST original")
X = mnist["data"]
y = mnist["target"]

In [25]:
X.shape

(70000, 784)

In [3]:
#Split in to train and test sets
X_train = X[:60000]
y_train = y[:60000]
X_test = X[60000:]
y_test = y[60000:]

In [4]:
#Shuffle data as some training algorithms are sensitive to training instances
np.random.seed(42)
rand_index = np.random.permutation(60000)
X_train = X_train[rand_index]
y_train = y_train[rand_index]

In [5]:
#Load classifier and fit model on the data

lin_clf = LinearSVC(random_state=42)
lin_clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [6]:
#Predict target on train set
y_pred = lin_clf.predict(X_train)

#Find accuracy score
accuracy_score(y_train, y_pred)

0.85375

An accuracy of 25 is obtained on raw data. We can improve this by scaling the data.

In [8]:
#Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test.astype(np.float32))

In [9]:
#Fit the model on scaled data
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(X_train_scaled, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [10]:
#Predict target on scaled train set
y_pred = lin_clf.predict(X_train_scaled)

#Find accuracy again
accuracy_score(y_train, y_pred)

0.9204

As we can see, the accuracy has improved. But we can make it better by using an SVM. A kernel is required to try SVM, here we will use RBF kernel

In [11]:
svm_clf = SVC(decision_function_shape="ovr")
svm_clf.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
y_pred = svm_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.9853

In [35]:
# classification report
print("Classification report:\n %s:n \n %sn" % (svm_clf, metrics.classification_report(y_train, y_pred)))

Classification report:
 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):n 
              precision    recall  f1-score   support

        0.0       0.99      0.99      0.99      5923
        1.0       0.99      0.99      0.99      6742
        2.0       0.99      0.99      0.99      5958
        3.0       0.98      0.98      0.98      6131
        4.0       0.98      0.99      0.98      5842
        5.0       0.99      0.98      0.99      5421
        6.0       0.99      0.99      0.99      5918
        7.0       0.98      0.98      0.98      6265
        8.0       0.98      0.98      0.98      5851
        9.0       0.98      0.97      0.98      5949

avg / total       0.99      0.99      0.99     60000
n


In [36]:
#Predict on test data
y_test_pred = svm_clf.predict(X_test_scaled)
accuracy_score(y_test, y_test_pred)

0.9665

The model gave an accuracy of 96 on test data

In [38]:
# confusion matrix
print("Confusion matrix: \n%s" % metrics.confusion_matrix(y_test, y_test_pred))

Confusion matrix: 
[[ 968    0    1    1    0    3    3    2    2    0]
 [   0 1126    3    0    0    1    3    0    2    0]
 [   6    1  994    3    2    0    1   15    9    1]
 [   0    0    4  979    1    7    0   12    7    0]
 [   0    0   11    0  945    2    4    7    3   10]
 [   2    0    1   10    2  855    6    7    7    2]
 [   6    2    1    0    4    8  931    2    4    0]
 [   1    6   14    2    3    0    0  989    0   13]
 [   3    0    4    6    6   11    3   11  927    3]
 [   4    6    4   11   12    1    0   17    3  951]]


### Randomised Search to determine the best parameters

In [13]:
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10, verbose=2)
rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=8.852316058423087, gamma=0.001766074650481071 .................
[CV] .. C=8.852316058423087, gamma=0.001766074650481071, total=   2.7s
[CV] C=8.852316058423087, gamma=0.001766074650481071 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s remaining:    0.0s


[CV] .. C=8.852316058423087, gamma=0.001766074650481071, total=   2.9s
[CV] C=8.852316058423087, gamma=0.001766074650481071 .................
[CV] .. C=8.852316058423087, gamma=0.001766074650481071, total=   2.7s
[CV] C=1.8271960104746645, gamma=0.006364737055453384 ................
[CV] . C=1.8271960104746645, gamma=0.006364737055453384, total=   3.1s
[CV] C=1.8271960104746645, gamma=0.006364737055453384 ................
[CV] . C=1.8271960104746645, gamma=0.006364737055453384, total=   3.2s
[CV] C=1.8271960104746645, gamma=0.006364737055453384 ................
[CV] . C=1.8271960104746645, gamma=0.006364737055453384, total=   3.6s
[CV] C=9.875199193765326, gamma=0.051349833451870636 .................
[CV] .. C=9.875199193765326, gamma=0.051349833451870636, total=   3.3s
[CV] C=9.875199193765326, gamma=0.051349833451870636 .................
[CV] .. C=9.875199193765326, gamma=0.051349833451870636, total=   3.4s
[CV] C=9.875199193765326, gamma=0.051349833451870636 .................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.4min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002428EBFE400>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002428EBFE9E8>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [14]:
rnd_search_cv.best_estimator_

SVC(C=8.852316058423087, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001766074650481071,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [15]:
rnd_search_cv.best_score_

0.856

The score is pretty low because the model was trained only on 1000 instances 