# **CS 5361/6361 Machine Learning**

**Classification using k-nn and the scikit-learn library**

**Author:** Ruben Martinez<br>
**Last modified:** 9/17/2024<br>


In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import time

Download data.

In [10]:
# Fetch MNIST, then flatten each image into a single row and scale the pixel values by 1/255.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = (X_train.reshape(len(X_train), -1) / 255).astype(np.float32)
X_test = (X_test.reshape(len(X_test), -1) / 255).astype(np.float32)

# Report the shape of every split, one per line: train features, train labels,
# test features, test labels.
print(*(split.shape for split in (X_train, y_train, X_test, y_test)), sep='\n')

(60000, 784)
(60000,)
(10000, 784)
(10000,)


Now we will classify the test set using the sklearn implementation of k-nearest neighbors with default parameters.

The documentation can be found here:
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html



In [9]:
# Baseline run: k-NN with scikit-learn's default settings, timing fit and predict separately.
classifier = KNeighborsClassifier()

fit_began = time.time()
classifier.fit(X_train, y_train)
fit_secs = time.time() - fit_began
print('Elapsed time training= {:.4f} secs'.format(fit_secs))

predict_began = time.time()
pred = classifier.predict(X_test)
predict_secs = time.time() - predict_began
print('Elapsed time testing= {:.4f} secs'.format(predict_secs))

# Score the predictions against the true test labels.
print(f'Accuracy: {accuracy_score(y_test,pred):6.4f}')
print(f'Confusion matrix:\n{confusion_matrix(y_test,pred)}')

Elapsed time training= 0.0436 secs
Elapsed time testing= 7.7491 secs
Accuracy: 0.9296
Confusion matrix:
[[ 963    0    4    1    1    7    2    1    0    1]
 [   0 1127    4    2    1    0    1    0    0    0]
 [  18    4  961   10    6    6    8   14    1    4]
 [   6    0   20  903    0   49   10    1   17    4]
 [   1   13    2    0  940    1    4    4    0   17]
 [  17    4    5   20    3  827   11    2    2    1]
 [  14    4    3   10    8   20  897    1    0    1]
 [   3   20   12    1   10    1    0  964    0   17]
 [   4    2   17   65   10   36   13   11  802   14]
 [   7    4    9    7   23   11    1   32    3  912]]


By default, the k-neighbors classifier uses 5 nearest neighbors (n_neighbors = 5). Would you expect the accuracy to increase or decrease using:
*   k = 1?
*   k = 15?



By default, the algorithm assigns the same weight to all the nearest neighbors of a test example (weights = 'uniform'), while distance weighting assigns a larger weight to neighbors that are closer to the test example. Would you expect the accuracy to increase or decrease using distance weighting and:
*   k = 1?
*   k = 2?
*   k = 5?
*   k = 15?

Suppose we select the n features with the largest variance in the training set. If we select half of the features (that is, n = 784/2 = 392), what do you expect to change in terms of:


*   Training time
*   Classification time
*   Accuracy





# Predicting with alternative parameters


In [6]:
# Each entry is (n_neighbors, weights). "uniform" means every neighbor counts equally
# (unweighted); "distance" means closer neighbors get a larger weight (weighted).
parameters = [
    (1, "uniform"),
    (15, "uniform"),
    (1, "distance"),
    (2, "distance"),
    (5, "distance"),
    (15, "distance"),
]

# Fit, predict and score one classifier per configuration, timing each phase separately.
for num_neighbors, weight_type in parameters:
    print(f"Number of neighbors: {num_neighbors} and {'Weighted' if weight_type == 'distance' else 'Unweighted'}")
    classifier = KNeighborsClassifier(weights=weight_type, n_neighbors=num_neighbors)

    fit_began = time.time()
    classifier.fit(X_train, y_train)
    fit_secs = time.time() - fit_began
    print('Elapsed time training= {:.4f} secs'.format(fit_secs))

    predict_began = time.time()
    pred = classifier.predict(X_test)
    predict_secs = time.time() - predict_began
    print('Elapsed time testing= {:.4f} secs'.format(predict_secs))

    print(f'Accuracy: {accuracy_score(y_test,pred):6.4f}')
    print(f'Confusion matrix:\n{confusion_matrix(y_test,pred)}')
    # Blank line between the reports of consecutive configurations.
    print()

Number of neighbors: 1 and Unweighted
Elapsed time training= 0.0421 secs
Elapsed time testing= 38.7040 secs
Accuracy: 0.9691
Confusion matrix:
[[ 973    1    1    0    0    1    3    1    0    0]
 [   0 1129    3    0    1    1    1    0    0    0]
 [   7    6  992    5    1    0    2   16    3    0]
 [   0    1    2  970    1   19    0    7    7    3]
 [   0    7    0    0  944    0    3    5    1   22]
 [   1    1    0   12    2  860    5    1    6    4]
 [   4    2    0    0    3    5  944    0    0    0]
 [   0   14    6    2    4    0    0  992    0   10]
 [   6    1    3   14    5   13    3    4  920    5]
 [   2    5    1    6   10    5    1   11    1  967]]

Number of neighbors: 15 and Unweighted
Elapsed time training= 0.0377 secs
Elapsed time testing= 37.9144 secs
Accuracy: 0.9633
Confusion matrix:
[[ 970    1    1    0    0    2    5    1    0    0]
 [   0 1131    2    1    0    0    1    0    0    0]
 [  15   15  968    3    1    0    3   20    7    0]
 [   0    3    2  975 

# Predicting with the half of the features that have the largest variance

In [8]:
# Variance of every feature, measured on the training set only.
variances = np.var(X_train, axis=0)

# argsort orders features by ascending variance, so slicing from the midpoint onward
# keeps the half of the features with the largest variance.
largest_half_indices = np.argsort(variances)[X_train.shape[1] // 2:]

# Keep the reduced matrices under their own names. Overwriting X_train / X_test would
# make this cell drop half of the remaining features again on every re-run, and would
# silently hand the reduced data to any cell executed afterwards.
X_train_top_var = X_train[:, largest_half_indices]
X_test_top_var = X_test[:, largest_half_indices]

# Default k-NN on the reduced feature set, timing fit and predict separately.
classifier = KNeighborsClassifier()
start = time.time()
classifier.fit(X_train_top_var, y_train)
end = time.time()
print('Elapsed time training= {:.4f} secs'.format(end-start))

start = time.time()
pred = classifier.predict(X_test_top_var)
end = time.time()
print('Elapsed time testing= {:.4f} secs'.format(end-start))

print(f'Accuracy: {accuracy_score(y_test,pred):6.4f}')
print(f'Confusion matrix:\n{confusion_matrix(y_test,pred)}')

Elapsed time training= 0.0752 secs
Elapsed time testing= 7.6698 secs
Accuracy: 0.9296
Confusion matrix:
[[ 963    0    4    1    1    7    2    1    0    1]
 [   0 1127    4    2    1    0    1    0    0    0]
 [  18    4  961   10    6    6    8   14    1    4]
 [   6    0   20  903    0   49   10    1   17    4]
 [   1   13    2    0  940    1    4    4    0   17]
 [  17    4    5   20    3  827   11    2    2    1]
 [  14    4    3   10    8   20  897    1    0    1]
 [   3   20   12    1   10    1    0  964    0   17]
 [   4    2   17   65   10   36   13   11  802   14]
 [   7    4    9    7   23   11    1   32    3  912]]


# Predicting with standardized training data

In [35]:
# Start again from the raw MNIST arrays and rebuild the flattened, 1/255-scaled float32 matrices.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = (X_train.reshape(len(X_train), -1) / 255).astype(np.float32)
X_test = (X_test.reshape(len(X_test), -1) / 255).astype(np.float32)

# Per-feature statistics are computed on the training set and reused for the test set.
means = X_train.mean(axis=0)
standard_deviations = X_train.std(axis=0)

# Features with zero spread would cause a division by zero; divide those by 1 instead.
constant_features = standard_deviations == 0
standard_deviations[constant_features] = 1

X_train_standardized = (X_train - means) / standard_deviations
X_test_standardized = (X_test - means) / standard_deviations

# Default k-NN on the standardized features, timing fit and predict separately.
classifier = KNeighborsClassifier()

fit_began = time.time()
classifier.fit(X_train_standardized, y_train)
fit_secs = time.time() - fit_began
print('Elapsed time training= {:.4f} secs'.format(fit_secs))

predict_began = time.time()
pred = classifier.predict(X_test_standardized)
predict_secs = time.time() - predict_began
print('Elapsed time testing= {:.4f} secs'.format(predict_secs))

print(f'Accuracy: {accuracy_score(y_test,pred):6.4f}')
print(f'Confusion matrix:\n{confusion_matrix(y_test,pred)}')

Elapsed time training= 0.0255 secs
Elapsed time testing= 37.4009 secs
Accuracy: 0.9443
Confusion matrix:
[[ 963    0    1    3    1    5    6    1    0    0]
 [   0 1129    3    0    0    0    3    0    0    0]
 [  14    6  960   20    5    0    7    9   10    1]
 [   0    3    5  962    3   13    0   10   10    4]
 [   1   10    5    3  922    3    6    4    2   26]
 [   5    1    3   23    8  824   13    2    6    7]
 [  10    4    2    1    3    6  929    0    3    0]
 [   0   21   12    4    8    2    0  949    1   31]
 [  13    3    6   18    8   30    3    6  880    7]
 [   6    5    5   10   18    6    0   31    3  925]]
