# SVM vs. kNN Classification

#### In this project, I have used classification methods to classify handwritten digits. Specifically, I have compared the performance and accuracy of using a Support Vector Machine (SVM) vs. k Nearest Neighbors (kNN).

In [None]:
# Imports and setup. 

import pandas as pd
import numpy as np

from sklearn import tree, svm, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, KFold
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size (width, height) applied to every plot unless a cell overrides it.
plt.rcParams['figure.figsize'] = (10, 6)
# Use matplotlib's 'ggplot' style sheet for all figures in this notebook.
plt.style.use('ggplot')

### The MNIST handwritten digit dataset consists of images of handwritten digits, together with labels indicating which digit is in each image. There are several versions of the MNIST dataset. I have used the one that is built into scikit-learn (`load_digits`, 8×8-pixel images). I scaled the data before running it through my algorithms, which altered the digits' appearance when I plotted them.

In [None]:
# Load the built-in digits dataset and scale the pixel features.
# `X` holds the scaled feature matrix, `y` the digit labels.

digits = load_digits()
y = digits.target
X = scale( digits.data )

# Dataset dimensions: number of distinct labels, rows, and columns.
n_digits = np.unique(y).size
n_samples = X.shape[0]
n_features = X.shape[1]

print( "n_digits: %d, n_samples %d, n_features %d" % (n_digits, n_samples, n_features) )

In [None]:
# Show the first sample (the "zero") three ways: raw image, scaled features, and label.

for heading, value in (
    ( "===\nThe raw data", digits.images[0] ),
    ( "===\nThe scaled data", X[0] ),
    ( "===\nThe digit", digits.target[0] ),
):
    print( heading )
    print( value )

In [None]:
# Draw the first 25 (scaled) samples on a 5x5 grid of 8x8 images.

fig, axes = plt.subplots( 5, 5, figsize=(10, 10) )

for ax, pixels in zip( axes.ravel(), X[:25] ):
    # Each row of X is a flattened 8x8 image.
    ax.imshow( pixels.reshape(8, 8), cmap='Greys', interpolation='nearest' )
    ax.axis( 'off' )

plt.show()

### Classification with Support Vector Machines (SVM)

In [None]:
# Partition the scaled data into training and test sets.
# test_size=0.8 holds out 80% of the samples for testing, so only 20% is used to train;
# random_state fixes the shuffle so the split is reproducible.

XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=1,
)

In [None]:
# Build an RBF-kernel SVM classifier (cost parameter C=5) and fit it on the training split.
model = svm.SVC(C=5, kernel='rbf').fit(XTrain, yTrain)
# Echo the fitted estimator as the cell output.
model

In [None]:
# Predict labels for the held-out test split and report the fraction classified correctly.
y_pred = model.predict(XTest)
svm_test_accuracy = metrics.accuracy_score(yTest, y_pred)
print('Accuracy = ', svm_test_accuracy)

In [None]:
# Confusion matrix of the SVM predictions on the test split
# (rows: actual digit, columns: predicted digit).
svm_confusion = metrics.confusion_matrix(yTest, y_pred)
print(svm_confusion)

In [None]:
# Display all of the misclassified digits as images (title with: Predicted #, Actual #).
# The subplot grid is sized from the actual number of errors so that every
# misclassified digit is drawn; a fixed 2x4 grid would stop after the first 8.
misclassified_idx = np.flatnonzero(y_pred != yTest)
misclassified_count = len(misclassified_idx)

if misclassified_count == 0:
    # Nothing to draw; avoid creating an empty figure.
    print("No misclassified digits.")
else:
    n_cols = 8
    n_rows = -(-misclassified_count // n_cols)  # ceiling division
    plt.figure( figsize=(1.5 * n_cols, 1.9 * n_rows) )

    for plot_pos, i in enumerate(misclassified_idx, start=1):
        plt.subplot(n_rows, n_cols, plot_pos)
        plt.imshow(np.reshape(XTest[i], (8, 8)), cmap='Greys', interpolation='nearest')
        plt.title(f"Predicted {y_pred[i]}\nActual {yTest[i]}", fontsize=10)
        plt.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Using the 'cross_val_score' function, evaluate the accuracy of the SVM for different values
# of the parameter C: .5 to 5 (by .1) and then 10-50 (by 20).

In [None]:
# Define the range of C values to test: 0.5 to 5.0 in steps of 0.1, then 10, 30, 50.
# linspace fixes the endpoint and the number of points exactly; arange with a
# floating-point step can gain or lose the last element through rounding.
# Rounding to one decimal keeps the reported values clean (e.g. 1.2, not 1.2000000000000002).
c_values = np.concatenate((np.round(np.linspace(0.5, 5.0, 46), 1), np.arange(10, 51, 20)))

# Mean 5-fold cross-validated accuracy for each C, in the same order as c_values.
# NOTE: X was scaled using all samples before cross-validation, so each fold's
# validation data influenced the scaling statistics.
mean_accuracies = []

for c in c_values:
    # Use a separate name for the cross-validation estimator so the fitted `model`
    # from the earlier cell is not replaced by an unfitted one.
    cv_model = svm.SVC(kernel='rbf', C=c)

    # Perform cross-validation with 5 folds and get the accuracy scores
    scores = cross_val_score(cv_model, X, y, cv=5)

    mean_accuracies.append(scores.mean())

In [None]:
# Plot mean cross-validated accuracy against C.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(c_values, mean_accuracies, marker='o', linestyle='-')
ax.set_xlabel('C')
ax.set_ylabel('Mean Accuracy')
ax.set_title('SVM Accuracy vs. C')
ax.grid(True)
plt.show()

# Locate the C with the highest mean accuracy (first one wins on ties).
best_idx = int(np.argmax(mean_accuracies))
best_c = c_values[best_idx]
best_accuracy = mean_accuracies[best_idx]

print(f"Best value of C: {best_c}")
print(f"Highest mean accuracy: {best_accuracy:.4f}")

In [None]:
# Repeat the SVM experiment on the raw (non-scaled) pixel data.
X_raw = digits.data
y_raw = digits.target

# Split the raw data into training and test sets. The labels get their own
# `_raw` names so this cell does not rebind the global yTrain / yTest that
# the scaled-data cells rely on.
XTrain_raw, XTest_raw, yTrain_raw, yTest_raw = train_test_split(X_raw, y_raw, random_state=1, test_size=0.8)

# Use SVM with an rbf kernel and the cost parameter C=5 to build a classifier using the training dataset.
model_raw = svm.SVC(kernel='rbf', C=5)
model_raw.fit(XTrain_raw, yTrain_raw)

# Using the test dataset, evaluate the accuracy of the model on raw data.
y_pred_raw = model_raw.predict(XTest_raw)
accuracy_raw = metrics.accuracy_score(y_true=yTest_raw, y_pred=y_pred_raw)

print('Accuracy on raw data =', accuracy_raw)

**Accuracy Score:** 0.975

### Prediction with K-nearest Neighbors

In [None]:
# Re-create the train/test partition of the scaled data for the kNN experiments.
# Same arguments as the SVM split: 80% held out for testing, seeded shuffle.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=1,
)

In [None]:
# Build a k-nearest-neighbours classifier with k=10 and fit it on the training split.
knn_model = KNeighborsClassifier(n_neighbors=10).fit(XTrain, yTrain)
# Echo the fitted estimator as the cell output.
knn_model

In [None]:
# Predict the test split with the fitted kNN model and report its accuracy.
y_pred_knn = knn_model.predict(XTest)
accuracy_knn = metrics.accuracy_score(yTest, y_pred_knn)

print('Accuracy of k-NN (k=10):', accuracy_knn)

In [None]:
# Confusion matrix of the kNN predictions on the test split
# (rows: actual digit, columns: predicted digit).
confusion_matrix_knn = metrics.confusion_matrix(yTest, y_pred_knn)
print('Confusion Matrix:', confusion_matrix_knn, sep='\n')

**Note:** The most common mistake the classifier makes is misclassifying a 1 as a 9 (10 times). Overall, 1 was also the digit misclassified most often (26 times).

In [None]:
# Display all of the misclassified digits as images (title with: Predicted #, Actual #).
# The subplot grid is sized from the actual number of errors so that every
# misclassified digit is drawn; a fixed 3x4 grid would stop after the first 12.
misclassified_idx_knn = np.flatnonzero(y_pred_knn != yTest)
misclassified_count_knn = len(misclassified_idx_knn)

if misclassified_count_knn == 0:
    # Nothing to draw; avoid creating an empty figure.
    print("No misclassified digits.")
else:
    n_cols_knn = 8
    n_rows_knn = -(-misclassified_count_knn // n_cols_knn)  # ceiling division
    plt.figure(figsize=(1.5 * n_cols_knn, 1.9 * n_rows_knn))

    for plot_pos, i in enumerate(misclassified_idx_knn, start=1):
        plt.subplot(n_rows_knn, n_cols_knn, plot_pos)
        plt.imshow(np.reshape(XTest[i], (8, 8)), cmap='Greys', interpolation='nearest')
        plt.title(f"Predicted {y_pred_knn[i]}\nActual {yTest[i]}", fontsize=10)
        plt.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Using the 'cross_val_score' function, evaluate the accuracy of kNN for different values
# of the parameter k: 1 to 10 (by 1) and then 15 to 50 (by 5).

# The second range starts at 15 so that k=10 is evaluated (and plotted) only once.
k_values = np.concatenate((np.arange(1, 11), np.arange(15, 51, 5)))

# Mean 5-fold cross-validated accuracy for each k, in the same order as k_values.
mean_accuracies_knn = []

for k in k_values:
    # Use a separate name for the cross-validation estimator so the fitted `knn_model`
    # from the earlier cell is not replaced by an unfitted one.
    cv_knn_model = KNeighborsClassifier(n_neighbors=k)

    # Perform cross-validation with 5 folds and get the accuracy scores
    scores = cross_val_score(cv_knn_model, X, y, cv=5)

    mean_accuracies_knn.append(scores.mean())

mean_accuracies_knn

In [None]:
# Plot mean cross-validated accuracy against k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, mean_accuracies_knn, marker='o', linestyle='-')
ax.set_xlabel('k')
ax.set_ylabel('Mean Accuracy')
ax.set_title('k-NN Accuracy vs. k')
ax.grid(True)
plt.show()

# Locate the k with the highest mean accuracy (first one wins on ties).
best_idx_knn = int(np.argmax(mean_accuracies_knn))
best_k = k_values[best_idx_knn]
best_accuracy_knn = mean_accuracies_knn[best_idx_knn]

print(f"Best value of k for k-NN: {best_k}")
print(f"Highest mean accuracy for k-NN: {best_accuracy_knn:.4f}")

**Note:** The best value for k is 3.

In [None]:
# Train and test the algorithm on the raw (non-scaled) data.
# Split the data into a training and test set
XTrain_raw, XTest_raw, yTrain_raw, yTest_raw = train_test_split(X_raw, y_raw, random_state=1, test_size=0.8)

# Use k-NN with k=10 to build a classifier using the training dataset.
# Fit and score against the labels produced by this cell's own split rather than
# the yTrain / yTest left over from the scaled-data cells.
model_raw = KNeighborsClassifier(n_neighbors=10)
model_raw.fit(XTrain_raw, yTrain_raw)

# Using the test dataset, evaluate the accuracy of the model on raw data.
y_pred_raw = model_raw.predict(XTest_raw)
accuracy_raw = metrics.accuracy_score(y_true=yTest_raw, y_pred=y_pred_raw)

print('Accuracy on raw data =', accuracy_raw)

**Observations:** For the SVM, increasing the value of C improved accuracy only up to a point on smaller datasets, whereas on large datasets a high value of C worked well. The SVM did extremely well, with 97% accuracy on the raw digit data.

For kNN, accuracy was higher when the number of dimensions was lower and the dataset was smaller. It did well on the digit dataset, with 94% accuracy on the raw data.