# HW7: kNN, SVM, and Random Forest for handwriting recognition
## IST 707 Applied Machine Learning
### Shivangi Mundhra | SUID - 842548148

In [2]:
# mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Read in the data

In [3]:
import pandas as p
import numpy as np
train = p.read_csv("/content/drive/MyDrive/Colab Notebooks/digit-train.csv")
y_train = train['label'].values
y_trainLabels = train.iloc[:,:1]
X_train = train.iloc[:,1:784]
test = p.read_csv("/content/drive/MyDrive/Colab Notebooks/digit-test.csv")
y_test = test['label'].values
X_test = test.iloc[:,1:784]
y_testLabels = test.iloc[:,:1]

In [4]:
y_train.size

4198

In [5]:
X_train.shape

(4198, 783)

In [6]:
# Check how many training examples in each category
# this is important to see whether the data set is balanced or skewed

unique, counts = np.unique(y_train, return_counts = True)
print(np.asarray((unique, counts)))

[[  0   1   2   3   4   5   6   7   8   9]
 [418 471 413 425 420 390 416 436 438 371]]


In [7]:
unique, counts = np.unique(y_test, return_counts = True)
print(np.asarray((unique, counts)))

[[  0   1   2   3   4   5   6   7   8   9]
 [414 478 420 446 404 388 404 465 393 386]]


## SVM

### Train a Linear SVC classifier

In [8]:
# import the LinearSVC module
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

# initialize the LinearSVC model
linear_svm_clf = LinearSVC(C = 1)

# use the training data to train the model
linear_svm_clf.fit(X_train, y_train)



LinearSVC(C=1)

In [9]:
# get the accuracy of this linear SVM classifier
linear_svm_clf_accuracy = linear_svm_clf.score(X_test, y_test)
print(linear_svm_clf_accuracy)

0.8544545021438781


The linear SVM model is only about 85.30% accurate

In [10]:
# print confusion matrix and classification report

from sklearn.metrics import confusion_matrix
y_pred_linear_svm = linear_svm_clf.predict(X_test)
cm_linear_svm = confusion_matrix(y_test, y_pred_linear_svm)
print(cm_linear_svm)
print()

from sklearn.metrics import classification_report
# target_names = ['0', '1', '2', '3']
print(classification_report(y_test, y_pred_linear_svm))

[[390   0   4   4   1   7   3   0   5   0]
 [  0 462   5   7   0   0   0   0   4   0]
 [  5   5 359  20   3   2   7   2  13   4]
 [  5   3  17 353   1  22  10   7  21   7]
 [  0   2   3   2 352   2   2   5   3  33]
 [  8   4   7  29  14 291  11   2  14   8]
 [  6   0   7   0   4  10 370   1   4   2]
 [  2   2   6   7   6   2   1 408   3  28]
 [  7   9  13  25   7  19   7   4 296   6]
 [  3   2   2  12  24   5   1  22   9 306]]

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       414
           1       0.94      0.97      0.96       478
           2       0.85      0.85      0.85       420
           3       0.77      0.79      0.78       446
           4       0.85      0.87      0.86       404
           5       0.81      0.75      0.78       388
           6       0.90      0.92      0.91       404
           7       0.90      0.88      0.89       465
           8       0.80      0.75      0.77       393
           9       0.78    

In [12]:
# y_pred_linear_svm
linearSvmPred = p.DataFrame({"y_test" : y_test, "y_pred_linear_svm" : y_pred_linear_svm})
linearSvmPred.to_csv("linearSvmPred.csv", index = False)

### Train a base SVC classifier

In [14]:
# initialize the SVC model
svm_clf = SVC()

# use training data to train the model
svm_clf.fit(X_train, y_train)

# get the accuracy of the base SVM classifier
svm_clf_accuracy = svm_clf.score(X_test, y_test)
print(svm_clf_accuracy)

0.9530728918532635


The base SVM model has 95.31% accuracy

In [15]:
# print confusion matrix and classification report

y_pred_svm = svm_clf.predict(X_test)
cm_svm = confusion_matrix(y_test, y_pred_svm)
print(cm_svm)
print()

print(classification_report(y_test, y_pred_svm))

[[408   0   1   0   1   0   2   0   2   0]
 [  0 469   5   2   0   1   0   0   1   0]
 [  1   2 403   1   5   1   3   3   0   1]
 [  2   3   6 415   0   7   1   5   5   2]
 [  1   1   2   0 385   0   0   0   1  14]
 [  0   2   0   7   1 372   4   0   1   1]
 [  3   0   1   0   2   5 392   0   1   0]
 [  0   4   2   0   9   0   0 438   0  12]
 [  1   5   3   7   2   9   1   0 365   0]
 [  3   2   0   6  10   2   0   8   1 354]]

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       414
           1       0.96      0.98      0.97       478
           2       0.95      0.96      0.96       420
           3       0.95      0.93      0.94       446
           4       0.93      0.95      0.94       404
           5       0.94      0.96      0.95       388
           6       0.97      0.97      0.97       404
           7       0.96      0.94      0.95       465
           8       0.97      0.93      0.95       393
           9       0.92    

In [16]:
svmPred = p.DataFrame({"y_test" : y_test, "y_pred_svm" : y_pred_svm})
svmPred.to_csv("svmPred.csv", index = False)

### Tuning the SVM model

In [12]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Create a dictionary of possible parameters
params_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.0001, 0.001, 0.01, 0.1], 'kernel':['linear','rbf', 'poly'] }
# Create the GridSearchCV object
grid_clf = GridSearchCV(SVC(), params_grid)
print(grid_clf)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.0001, 0.001, 0.01, 0.1],
                         'kernel': ['linear', 'rbf', 'poly']})


In [None]:
# fit the data with the best possible parameters
grid_clf = grid_clf.fit(X_train, y_train)
# print the best estimator with it's parameters
print(grid_clf.best_params_)

{'C': 0.001, 'gamma': 0.0001, 'kernel': 'poly'}


In [17]:
tuned_svm_clf = SVC(C = 0.001, gamma = 0.0001, kernel = 'poly')
tuned_svm_clf.fit(X_train, y_train)
tuned_clf_accuracy = tuned_svm_clf.score(X_test, y_test)
print(tuned_clf_accuracy)

0.9437827536922344


In [18]:
# print confusion matrix and classification report

y_pred_tuned_svm = tuned_svm_clf.predict(X_test)
cm_tuned_svm = confusion_matrix(y_test, y_pred_tuned_svm)
print(cm_tuned_svm)
print()

print(classification_report(y_test, y_pred_tuned_svm))

[[402   2   0   1   1   3   4   0   1   0]
 [  0 472   3   1   0   0   0   1   1   0]
 [  6   8 391   3   2   1   3   2   4   0]
 [  5  11   5 409   0   5   1   3   6   1]
 [  0   2   1   0 388   0   1   0   0  12]
 [  0   4   1   8   0 367   4   0   3   1]
 [  9   2   1   0   3   2 386   0   1   0]
 [  1   6   1   0   9   0   0 442   0   6]
 [  4   8   2   7   1   9   3   0 357   2]
 [  4   5   0   6   8   1   0  12   2 348]]

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       414
           1       0.91      0.99      0.95       478
           2       0.97      0.93      0.95       420
           3       0.94      0.92      0.93       446
           4       0.94      0.96      0.95       404
           5       0.95      0.95      0.95       388
           6       0.96      0.96      0.96       404
           7       0.96      0.95      0.96       465
           8       0.95      0.91      0.93       393
           9       0.94    

In [19]:
tunedSvmPred = p.DataFrame({"y_test" : y_test, "y_pred_tuned_svm" : y_pred_tuned_svm})
tunedSvmPred.to_csv("tunedSvmPred.csv", index = False)

## kNN

In [20]:
# import knn classifier
from sklearn.neighbors import KNeighborsRegressor

# initialize a base knn model
knn = KNeighborsRegressor()

In [21]:
# fit the base knn model with training data
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [22]:
# check the accuracy
knn_accuracy = knn.score(X_test, y_test)
print(knn_accuracy)

0.8858859613374672


In [29]:
# print confusion matrix and classification report

y_pred_knn = knn.predict(X_test)
cm_knn = confusion_matrix(y_test, np.round(y_pred_knn))
print(cm_knn)
print()

print(classification_report(y_test, np.round(y_pred_knn)))

[[386  17   4   1   2   3   1   0   0   0]
 [  0 472   3   1   1   0   0   1   0   0]
 [  2  15 345  31  13   6   5   3   0   0]
 [  0   5   9 360  39  19  12   1   1   0]
 [  0   3   4   6 246  77  40  16   9   3]
 [  0   0   2   7  24 324  26   2   3   0]
 [  1   2   2   3   6  19 370   1   0   0]
 [  0   3   6   2   1   9  27 402  11   4]
 [  1   3   8   2   9  24  31  63 249   3]
 [  0   1   3   2   6   4   8  18  69 275]]

              precision    recall  f1-score   support

           0       0.99      0.93      0.96       414
           1       0.91      0.99      0.94       478
           2       0.89      0.82      0.86       420
           3       0.87      0.81      0.84       446
           4       0.71      0.61      0.66       404
           5       0.67      0.84      0.74       388
           6       0.71      0.92      0.80       404
           7       0.79      0.86      0.83       465
           8       0.73      0.63      0.68       393
           9       0.96    

In [26]:
knnPred = p.DataFrame({"y_test" : y_test, "y_pred_knn" : y_pred_knn})
knnPred.to_csv("knnPred.csv", index = False)

### Tuning the kNN model

In [None]:
knn_parameters = {"n_neighbors": range(1, 50)}
gridsearch = GridSearchCV(KNeighborsRegressor(), knn_parameters)
gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': range(1, 25)})

In [None]:
gridsearch.best_params_

{'n_neighbors': 3}

In [30]:
tuned_knn = KNeighborsRegressor(n_neighbors = 3)

In [31]:
tuned_knn.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=3)

In [32]:
# check the accuracy
tuned_knn_accuracy = tuned_knn.score(X_test, y_test)
print(tuned_knn_accuracy)

0.8943603383219869


In [33]:
# print confusion matrix and classification report

y_pred_tuned_knn = tuned_knn.predict(X_test)
cm_tuned_knn = confusion_matrix(y_test, np.round(y_pred_tuned_knn))
print(cm_tuned_knn)
print()

print(classification_report(y_test, np.round(y_pred_tuned_knn)))

[[401   0   6   3   2   1   1   0   0   0]
 [  0 471   4   1   1   0   0   1   0   0]
 [  3  14 364   7  21   5   3   3   0   0]
 [  1   4   9 356  36  24   9   7   0   0]
 [  0   4   2   5 298   8  53  28   2   4]
 [  0   0   3   0  38 322  21   0   2   2]
 [  1   1   3   0  12   7 379   1   0   0]
 [  0   4   0   7   2   6  15 407  21   3]
 [  1   3   2   7  11  16  40  23 285   5]
 [  1   1   1   3   3   5   7  33  36 296]]

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       414
           1       0.94      0.99      0.96       478
           2       0.92      0.87      0.89       420
           3       0.92      0.80      0.85       446
           4       0.70      0.74      0.72       404
           5       0.82      0.83      0.82       388
           6       0.72      0.94      0.81       404
           7       0.81      0.88      0.84       465
           8       0.82      0.73      0.77       393
           9       0.95    

## Random Forest

In [34]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model
rf = RandomForestRegressor()

# Train the model on training data
rf.fit(X_train, y_train)

RandomForestRegressor()

In [35]:
# check the accuracy
rf_accuracy = rf.score(X_test, y_test)
print(rf_accuracy)

0.8056301474450053


In [36]:
# print confusion matrix and classification report

y_pred_rf = rf.predict(X_test)
cm_rf = confusion_matrix(y_test, np.round(y_pred_rf))
print(cm_rf)
print()

print(classification_report(y_test, np.round(y_pred_rf)))

[[240  86  50  20   6   9   2   1   0   0]
 [  0 355  99  15   7   1   1   0   0   0]
 [  0   0 181 145  64  23   7   0   0   0]
 [  1   2   7 224 147  41  17   6   0   1]
 [  0   0   2   6 235 111  30  14   6   0]
 [  0   1   5  12  57 265  43   5   0   0]
 [  1   0   5   6  15 121 256   0   0   0]
 [  0   0   3   8  20  43  70 311  10   0]
 [  0   1   0   6  16  60 105 134  71   0]
 [  1   0   0   0   7  20  31  52 131 144]]

              precision    recall  f1-score   support

           0       0.99      0.58      0.73       414
           1       0.80      0.74      0.77       478
           2       0.51      0.43      0.47       420
           3       0.51      0.50      0.50       446
           4       0.41      0.58      0.48       404
           5       0.38      0.68      0.49       388
           6       0.46      0.63      0.53       404
           7       0.59      0.67      0.63       465
           8       0.33      0.18      0.23       393
           9       0.99    

### tuning the random forest model

In [None]:
rf_parameters = {"n_estimators": range(0, 1000, 100)}
rf_gridsearch = GridSearchCV(RandomForestRegressor(), rf_parameters)
rf_gridsearch.fit(X_train, y_train)

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 392, in fit
    self._validate_estimator()
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_base.py", line 140, in _validate_estimator
    self.n_estimators
ValueError: n_estimators must be greater than zero, got 0.

 0.77514907 0.77621624 0.77678909 0.77649282]


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'n_estimators': range(0, 1000, 100)})

In [None]:
rf_gridsearch.best_params_

{'n_estimators': 800}

In [37]:
# initiate the tuned rf model
tuned_rf = RandomForestRegressor(n_estimators = 800, random_state = 2022)

# Train the model on training data
tuned_rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=800, random_state=2022)

In [38]:
# check the accuracy
tuned_rf_accuracy = tuned_rf.score(X_test, y_test)
print(tuned_rf_accuracy)

0.8081019134445595


In [39]:
# print confusion matrix and classification report

y_pred_tuned_rf = tuned_rf.predict(X_test)
cm_tuned_rf = confusion_matrix(y_test, np.round(y_pred_tuned_rf))
print(cm_tuned_rf)
print()

print(classification_report(y_test, np.round(y_pred_tuned_rf)))

[[229  98  52  17   8   6   3   1   0   0]
 [  0 356  94  19   7   0   2   0   0   0]
 [  0   1 184 140  66  20   9   0   0   0]
 [  0   2   7 230 143  41  17   4   1   1]
 [  0   0   2   4 230 117  30  16   5   0]
 [  0   1   4  10  58 276  34   5   0   0]
 [  1   0   3   8  16 116 260   0   0   0]
 [  0   0   3   9  13  48  69 316   7   0]
 [  0   0   1   5  16  65  96 139  71   0]
 [  1   0   0   0   5  24  31  49 133 143]]

              precision    recall  f1-score   support

           0       0.99      0.55      0.71       414
           1       0.78      0.74      0.76       478
           2       0.53      0.44      0.48       420
           3       0.52      0.52      0.52       446
           4       0.41      0.57      0.48       404
           5       0.39      0.71      0.50       388
           6       0.47      0.64      0.54       404
           7       0.60      0.68      0.64       465
           8       0.33      0.18      0.23       393
           9       0.99    