In [1]:
import numpy as np
from scipy.stats import mode
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.datasets import mnist
from tensorflow.keras.datasets import cifar10

# let NumPy print up to 180 characters per line before wrapping array output
# (cosmetic only: changes how arrays are displayed, not any computed values)
np.set_printoptions(linewidth=180)

In [2]:
# Create a class for the k-nearest neighbor classifier
class kNearestNeighborClassifier:
    '''
    Brute-force k-nearest neighbor classifier using the Euclidean (L2) distance.

    fit() only memorizes the training set; predict() compares every query point
    against every stored training point and returns the majority label among
    the k closest ones.
    '''

    # constructor to save the hyperparameter k
    def __init__(self, k = 5):
        '''
        Store the number of neighbors that vote on each prediction

        Inputs
        ------

        k: number of nearest neighbors to use (must be at least 1)

        Raises
        ------

        ValueError: if k is smaller than 1

        '''
        # fail early with a clear message instead of an obscure error in predict()
        if k < 1:
            raise ValueError('k must be at least 1, got {}'.format(k))

        # initialize the number of neighbors to use
        self.neighbors = k

        # NOTE: tie votes are possible (always with more than two classes, and
        # with two classes when k is even); predict() resolves a tie by picking
        # the smallest of the tied labels.

    # fit the model to the training data (for kNN, there's no actual fitting involved)
    def fit(self, X, y):
        '''
        Record the training data and its class labels

        Inputs
        ------

        X: a matrix of datapoints from the training data, each row is a point
        y: labels for each datapoint, either a vector (n,) or a column (n, 1)

        Raises
        ------

        ValueError: if X and y do not contain the same number of samples

        '''
        # accept lists as well as arrays (existing arrays are not copied)
        X = np.asarray(X)
        y = np.asarray(y)

        # every training point needs exactly one label
        if X.shape[0] != y.shape[0]:
            raise ValueError('X has {} samples but y has {} labels'.format(X.shape[0], y.shape[0]))

        # record the unique class labels
        self.classes = np.unique(y)

        # print a warning if we only input one class
        if self.classes.shape[0] < 2:
            print('[WARNING] There should be at least two classes in the input data.')

        # record the data and labels
        self.data = X
        self.labels = y

    # use the classifier to predict the classifications of the testing data
    def predict(self, X):
        '''
        Predict the class labels for the input data

        Inputs
        ------

        X: a matrix of datapoints from the testing data, each row is a point
           (a single 1-D point is treated as a matrix with one row)

        Outputs
        -------

        classes: float array of shape (number of datapoints, 1) holding the class
                 predicted by the k-nearest neighbor classifier for each testing
                 datapoint (labels are converted with int(), so they must be numeric)

        '''
        # treat a single 1-D datapoint as a one-row matrix
        X = np.atleast_2d(np.asarray(X))

        # initialize the predicted classes
        yPredicted = np.empty([X.shape[0], 1])

        # loop over the datapoints in X
        for row in range(X.shape[0]):
            # cast to float so the subtraction below is promoted to floating point;
            # unsigned integer data (e.g. uint8 pixels) would otherwise wrap around
            datapoint = X[row].astype(np.float64)

            # find the distances from the datapoint to each training point using the L2 norm
            distances = np.sqrt(np.sum((self.data - datapoint) ** 2, axis = 1))

            # find the indices of the smallest k distances
            # (stable sort: equally distant points keep their training order)
            indices = np.argsort(distances, kind = 'stable')[:self.neighbors]

            # find the class labels of the nearest neighbors, flattened so that
            # both (n,) and (n, 1) label arrays are handled the same way
            nearestClasses = np.ravel(self.labels[indices])

            # determine the predicted class by majority vote; np.unique returns the
            # labels sorted, so argmax picks the smallest label when counts are tied
            values, counts = np.unique(nearestClasses, return_counts = True)
            yPredicted[row] = int(values[np.argmax(counts)])

        return yPredicted

In [11]:
# load CIFAR-10 here so the cell also works after Restart Kernel -> Run All
# (Keras keeps a local copy of the dataset, so only the first call should download it)
cifarData = cifar10.load_data()
from sklearn.preprocessing import normalize

### CLASSIFY CIFAR PICTURES

# number of CIFAR images to use (a subset keeps the brute-force kNN search fast)
numPictures = 5000

# create a dataset of the first numPictures CIFAR images, reshaped as single vectors, and labels

# The datapoints are in cifarData[0][0]; each 32x32x3 image becomes one row of 3072 values
X = cifarData[0][0][:numPictures].reshape([numPictures,32*32*3])

# The labels are in cifarData[0][1]
Y = cifarData[0][1][:numPictures]

# randomly choose 80% of the data to be the training set and 20% for the testing set
# (random_state is fixed so the split is reproducible)
(trainX, testX, trainY, testY) = train_test_split(X, Y, test_size = 0.2, random_state = 3)

# scale every image vector (row) to unit L2 norm
# NOTE: sklearn's normalize works per sample; it does NOT subtract the
# per-dimension mean or divide by the standard deviation
trainX = normalize(trainX)
testX = normalize(testX)

# fit the model to the training data
model = kNearestNeighborClassifier(k = 4)
model.fit(trainX,trainY)

# predict the labels of the held-out images
predictedY = model.predict(testX)

# print the classification performance
print(classification_report(testY, predictedY))

              precision    recall  f1-score   support

           0       0.21      0.55      0.30        96
           1       0.42      0.12      0.19        84
           2       0.19      0.34      0.24       107
           3       0.26      0.22      0.24       103
           4       0.22      0.36      0.27        94
           5       0.25      0.13      0.17       100
           6       0.45      0.16      0.24       116
           7       0.46      0.12      0.19       103
           8       0.35      0.44      0.39       108
           9       0.43      0.11      0.18        89

    accuracy                           0.26      1000
   macro avg       0.32      0.26      0.24      1000
weighted avg       0.32      0.26      0.24      1000

