# Imported Libraries

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import tqdm
import tarfile
import zipfile
import requests
import time
import pickle
import platform
import matplotlib.pyplot as plt

# URL of the gzipped tar archive holding the python-pickled CIFAR-10 batches.
data_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
# Human-readable class names; visualizSampleData() uses the list position as
# the integer label it looks up in y_train.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

In [None]:
class Downloder():
    """Download an archive over HTTP and unpack it into the download directory."""

    def __init__(self):
        pass

    def __download_file(self, url, directory):
        """
        Stream ``url`` into ``directory`` while showing a tqdm progress bar.

        The file is named after the last path component of ``url``. Data is
        first written to ``<name>.part`` and only renamed to its final name
        once the transfer completed, so an interrupted download is never
        mistaken for a finished one on the next run.

        :param url: Address of the file to download.
        :param directory: Existing directory the file is written to.
        :raises requests.HTTPError: If the server answers with an error status.
        """
        file_name = url.split("/")[-1]
        path = os.path.join(directory, file_name)
        partial_path = path + ".part"
        block_size = 1024

        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail loudly instead of saving an HTML error page as the archive.
            response.raise_for_status()
            # Servers may omit content-length; tqdm then shows a plain counter.
            total_size = int(response.headers.get('content-length', 0))

            with open(partial_path, "wb") as f:
                with tqdm.tqdm(total=total_size, unit='B', unit_scale=True) as progress:
                    for data in response.iter_content(block_size):
                        f.write(data)
                        progress.update(len(data))

        # Publish the finished file under its final name.
        os.replace(partial_path, path)

    def download_and_extract(self, url, download_dir):
        """
        Download and extract the data if it doesn't already exist.

        The presence of the archive file inside ``download_dir`` is the only
        check that is made; when it exists nothing is downloaded or unpacked.
        Archives ending in ``.zip``, ``.tar.gz`` or ``.tgz`` are unpacked into
        ``download_dir``; any other file is downloaded but left as-is.

        :param url: Address of the archive to download.
        :param download_dir: Directory where the downloaded file is saved.
            Example: "data/CIFAR-10/"
        """
        filename = url.split('/')[-1]
        file_path = os.path.join(download_dir, filename)

        if os.path.exists(file_path):
            print("Data has apparently already been downloaded and unpacked.")
            return

        # An empty string means "current directory", which always exists.
        if download_dir:
            os.makedirs(download_dir, exist_ok=True)

        self.__download_file(url, download_dir)
        print()
        print("Download finished. Extracting files.")

        # NOTE(security): extractall() trusts the member paths stored in the
        # archive; only use this with archives from a trusted source.
        if file_path.endswith(".zip"):
            # Unpack the zip-file.
            with zipfile.ZipFile(file=file_path, mode="r") as archive:
                archive.extractall(download_dir)
        elif file_path.endswith((".tar.gz", ".tgz")):
            # Unpack the tar-ball.
            with tarfile.open(name=file_path, mode="r:gz") as archive:
                archive.extractall(download_dir)

        print("Extraction is Done.")

In [None]:
class Cifar10Loader():
    """Load the python-pickled CIFAR-10 batch files into numpy arrays."""

    def __init__(self):
        pass

    def __LoadBinary(self, f):
        """
        Unpickle one batch file that was opened in binary mode.

        The file is decoded with the latin-1 encoding, which is only done
        when running under Python 3.

        'https://stackoverflow.com/questions/4299802/python-convert-string-from-utf-8-to-latin-1'

        :param f: Binary file object positioned at the start of the pickle.
        :return: The unpickled object.
        :raises ValueError: If the interpreter's major version is not 3.
        """
        ver = platform.python_version_tuple()
        if ver[0] != '3':
            raise ValueError(
                "invalid python version: {}".format(platform.python_version()))
        # NOTE(security): unpickling can execute arbitrary code; only load
        # batch files that come from a trusted source.
        return pickle.load(f, encoding='latin1')

    def __loadCifarBatchs(self, filename):
        """
        Load a single batch file of the CIFAR-10 dataset.

        :param filename: Path of the batch file.
        :return: Tuple ``(x, y)`` where ``x`` is a float array of shape
            (N, 32, 32, 3) and ``y`` is an array built from the batch labels.
        """
        with open(filename, 'rb') as f:
            data = self.__LoadBinary(f)

        # Each row is split into 3 planes of 32x32 values; the planes are then
        # moved to the last axis. -1 lets batches of any size load, not only
        # batches of exactly 10000 rows.
        x = np.asarray(data['data'])
        x = x.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
        y = np.array(data['labels'])
        return x, y

    def load_cifar10(self, rootdir):
        """
        Load the full dataset with the help of the two helper methods above.

        Reads ``data_batch_1`` .. ``data_batch_5`` and ``test_batch`` from
        ``rootdir``.

        :param rootdir: Directory containing the batch files.
        :return: The four-tuple ``(X_train, y_train, X_test, y_test)``.
        """
        train_batches = [
            self.__loadCifarBatchs(os.path.join(rootdir, 'data_batch_%d' % (b,)))
            for b in range(1, 6)
        ]
        xtrain = np.concatenate([x for x, _ in train_batches])
        ytrain = np.concatenate([y for _, y in train_batches])
        # Drop the per-batch arrays before the test batch is read.
        del train_batches

        xtest, ytest = self.__loadCifarBatchs(os.path.join(rootdir, 'test_batch'))

        return xtrain, ytrain, xtest, ytest

In [None]:
class KNNCifar10_Classifier():
    """
    k-nearest-neighbour classifier using Euclidean distance.

    The training data is only stored; all work happens at prediction time.
    """

    def __init__(self, X, Y):
        """
        :param X: 2-D array of training samples, one flattened sample per row.
        :param Y: 1-D array of non-negative integer labels, one per row of X.
        """
        self.X_train = X
        self.y_train = Y

    def predict(self, x_test, k=1):
        """
        Predict a label for every row of ``x_test``.

        The distance between each test row and each training row is computed
        and the most frequent label among the ``k`` closest training rows is
        returned.

        :param x_test: 2-D array with the same number of columns as X_train.
        :param k: Number of neighbours that vote; must be at least 1.
        :return: 1-D array of predicted labels, one per row of ``x_test``.
        :raises ValueError: If ``k`` is smaller than 1.
        """
        d = self.__distance(x_test)
        return self.__predict_labels(dist=d, k=k)

    def __distance(self, X):
        """
        Return the (test_size, train_size) matrix of Euclidean distances.

        Uses the expansion |a - b|^2 = |a|^2 + |b|^2 - 2 a.b so the whole
        matrix is produced by one matrix product instead of Python loops.
        """
        train_sq = np.sum(np.square(self.X_train), axis=1)
        test_sq = np.sum(np.square(X), axis=1)[:, np.newaxis]
        # The transpose makes the product (test_size, features) x
        # (features, train_size).
        cross = np.dot(X, self.X_train.T)
        squared = test_sq + train_sq - 2 * cross
        # Rounding can leave tiny negative values for (near-)identical rows,
        # which would turn into NaN under the square root.
        return np.sqrt(np.maximum(squared, 0))

    def __predict_labels(self, dist, k=1):
        """
        Turn a distance matrix into labels by majority vote.

        :param dist: (test_size, train_size) distance matrix.
        :param k: Number of nearest training rows that vote. Values larger
            than the training set simply use every training row.
        :return: 1-D array of predicted labels.
        :raises ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1, got {}".format(k))

        labels = np.asarray(self.y_train)
        test_size = dist.shape[0]
        y_pred = np.zeros(test_size, dtype=labels.dtype)

        for i in range(test_size):
            # A stable sort keeps equally distant neighbours in training
            # order, so repeated runs give the same prediction.
            nearest_idx = np.argsort(dist[i], kind="stable")[:k]
            # bincount counts the votes per label; argmax picks the most
            # frequent one and, on a tie, the smallest label.
            # https://numpy.org/doc/stable/reference/generated/numpy.bincount.html
            y_pred[i] = np.argmax(np.bincount(labels[nearest_idx]))
        return y_pred
      

In [None]:
# Fetch and unpack the CIFAR-10 archive; nothing happens when the archive
# file is already present in download_dir.
downloader = Downloder()
# NOTE(review): hard-coded absolute path on drive E: — adjust for the local machine.
download_dir = "E:/Classes/4.1/Computer-Vision/cvpr-problem/Mid/Assignment-1/"
downloader.download_and_extract(data_url,download_dir)

In [None]:
!ls $download_dir

In [None]:
# Read the training batches and the test batch from the extracted
# 'cifar-10-batches-py' folder inside download_dir.
loader = Cifar10Loader()

X_train, y_train, X_test, y_test = loader.load_cifar10(os.path.join(download_dir,'cifar-10-batches-py'))

In [None]:
# Report the shape of every loaded array, one line per array.
print(
    *(
        "{} {}".format(caption, data.shape)
        for caption, data in (
            ('Training data shape: ', X_train),
            ('Training labels shape: ', y_train),
            ('Test data shape: ', X_test),
            ('Test labels shape: ', y_test),
        )
    ),
    sep="\n",
)

## Visualizing Data Samples

In [None]:
def visualizSampleData(classess, number):
    """
    Plot a grid of randomly chosen training images, one column per class.

    Reads the notebook globals ``X_train`` and ``y_train``. For each class,
    ``number`` distinct samples whose label equals the class position are
    drawn without replacement and shown top to bottom in that class's column;
    the class name is the title of the column's first image.

    :param classess: Sequence of class names; the position is the label.
    :param number: Number of images (rows) to show per class.
    """
    for class_id, class_name in enumerate(classess):
        # Indices of all training samples that carry this label.
        candidates = np.flatnonzero(y_train == class_id)
        picked = np.random.choice(candidates, number, replace = False)

        for row, sample_idx in enumerate(picked):
            n_columns = len(classess)
            # Subplot cells are numbered row by row, starting at 1.
            cell = row * n_columns + class_id + 1
            plt.subplot(number, n_columns, cell)
            plt.imshow(X_train[sample_idx].astype('uint8'))
            plt.axis('off')

            # Only the top image of a column gets the class name.
            if row == 0:
                plt.title(class_name)
    plt.show()

In [None]:
# Show 5 randomly chosen training images for every class name in `classes`.
visualizSampleData(classess= classes, number=5)

## Data Subsampling

In [None]:
def dataSubsampler(X_train, y_train, X_test, y_test, trainSize = 10000, testSize = 1000):
    """
    Return the leading ``trainSize`` training and ``testSize`` test samples.

    The returned arrays are copies, so modifying them leaves the inputs
    untouched.

    :param X_train: Training samples, one per entry along the first axis.
    :param y_train: Training labels aligned with ``X_train``.
    :param X_test: Test samples, one per entry along the first axis.
    :param y_test: Test labels aligned with ``X_test``.
    :param trainSize: Number of training samples to keep.
    :param testSize: Number of test samples to keep.
    :return: Tuple ``(X_train, y_train, X_test, y_test)`` of the subsets.
    :raises ValueError: If a size is negative or larger than the number of
        available samples.
    """
    # A negative size would be read as "count from the end" by slicing.
    if trainSize < 0 or testSize < 0:
        raise ValueError('Invalid value of Training size or Test size')
    if trainSize > X_train.shape[0] or testSize > X_test.shape[0]:
        raise ValueError('Invalid value of Training size or Test size')

    # Slicing takes the prefix directly instead of materialising a Python
    # list with one index per sample.
    return (
        X_train[:trainSize].copy(),
        y_train[:trainSize].copy(),
        X_test[:testSize].copy(),
        y_test[:testSize].copy(),
    )

In [None]:
# Shrink the dataset to dataSubsampler's default sizes (10000 train / 1000 test).
X_train, y_train, X_test, y_test = dataSubsampler(X_train, y_train, X_test, y_test)

In [None]:
# Changing the shape of the data
# Every sample is flattened into a single row so that the data becomes a
# 2-D (samples, features) matrix.

X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Last expression of the cell: the notebook displays both shapes.
X_train.shape, X_test.shape

# K-Nearest Neighbour

In [None]:
# Store the training set in the classifier and predict the test set with k=5.
classifier = KNNCifar10_Classifier(X_train,y_train)
y_pred = classifier.predict(X_test, k= 5)

# Number of test samples whose predicted label equals the true label.
correctly_Pred = np.sum(y_pred == y_test)

# Fraction of correctly predicted test samples.
accuracy = float(correctly_Pred)/ X_test.shape[0]

print('Got %d / %d correct => accuracy: %f' % (correctly_Pred, X_test.shape[0], accuracy))

In [None]:
# k-fold cross-validation on the training set to compare candidate values of k.
num_folds = 5
k_choices = [1, 3, 5, 7, 9, 10, 12, 15, 20, 25, 50, 75, 100]

X_train_fold = np.array_split(X_train, num_folds)
y_train_fold = np.array_split(y_train, num_folds)

# accuracies[k] collects one validation accuracy per fold, in fold order.
accuracies = {k: [] for k in k_choices}

for fold in range(num_folds):
    # The held-out fold is the validation set. Dedicated names are used so the
    # notebook's real X_train / y_train / X_test / y_test stay intact.
    X_val = X_train_fold[fold]
    y_val = y_train_fold[fold]

    # All remaining folds together form the training set for this round.
    X_fit = np.concatenate(X_train_fold[:fold] + X_train_fold[fold + 1:], axis=0)
    y_fit = np.concatenate(y_train_fold[:fold] + y_train_fold[fold + 1:], axis=0)

    # The training split does not depend on k, so it is built once per fold
    # and the same classifier is queried for every k.
    fold_classifier = KNNCifar10_Classifier(X_fit, y_fit)

    for k in k_choices:
        y_val_pred = fold_classifier.predict(X_val, k=k)
        accuracies[k].append(float(np.mean(y_val_pred == y_val)))

print(f"{num_folds}-fold accuracy: ")
print("+" * 20)

for k in sorted(accuracies):
    for accu in accuracies[k]:
        print(f"Value of k is {k} and accuracy {accu}")

print("+" * 20)

In [None]:
def KNN(num_folds, k_choices, X, y, traning_size=10000, test_size=1000):
    """
    Run k-fold cross-validation for every k and plot the accuracies.

    For each requested fold count the data is split into that many folds.
    Every fold is held out once while a ``NearestNeighbourClassifier(k)`` is
    fitted on the remaining folds, and the accuracy reported by its
    ``ClassificationResult`` is recorded. After all values of k have been
    evaluated for a fold count, ``plot_Accuracies(k_choices, accuracies)`` is
    called once for that fold count.

    ``NearestNeighbourClassifier`` and ``plot_Accuracies`` are not defined in
    this function; they must be available in the enclosing scope.

    :param num_folds: A single fold count, a list/tuple of fold counts, or
        None (in which case nothing is done).
    :param k_choices: Iterable of neighbour counts to evaluate.
    :param X: Samples, one per entry along the first axis.
    :param y: Labels aligned with ``X``.
    :param traning_size: Unused; kept so existing calls keep working.
    :param test_size: Unused; kept so existing calls keep working.
    :return: None.
    """
    if num_folds is None:
        return

    # Treat a single fold count as a one-element list so both call styles
    # share one code path.
    if isinstance(num_folds, (list, tuple)):
        fold_counts = num_folds
    else:
        fold_counts = [num_folds]

    for fold_count in fold_counts:
        X_train_folds = np.array_split(X, fold_count)
        y_train_folds = np.array_split(y, fold_count)

        # Fresh results per fold count: accuracies[k] holds one accuracy per
        # held-out fold.
        accuracies = {}

        for k in k_choices:
            accuracies[k] = []
            for num_knn in range(fold_count):
                X_test = X_train_folds[num_knn]
                y_test = y_train_folds[num_knn]

                # Join every fold except the held-out one. List slicing also
                # works when array_split produced folds of unequal length.
                X_train = np.concatenate(
                    X_train_folds[:num_knn] + X_train_folds[num_knn + 1:], axis=0)
                y_train = np.concatenate(
                    y_train_folds[:num_knn] + y_train_folds[num_knn + 1:], axis=0)

                classifier = NearestNeighbourClassifier(k)
                classifier.fit(X_train, y_train)

                y_test_pred = classifier.predict(X_test)

                # The first returned value (the count of correct
                # predictions, judging by its original name) is not needed.
                _, accuracy = classifier.ClassificationResult(
                    y_test_pred, y_test)

                accuracies[k].append(accuracy)

        # Plotting the accuracy of the current fold count.
        plot_Accuracies(k_choices, accuracies)


In [None]:
# def KNN(num_folds, k_choices, X_train, y_train, traning_size=10000, test_size=1000):
#     accuracies = {}

#     if (num_folds is not None) and type(num_folds is not list):
#         X_train_folds = []
#         y_train_folds = []

#         X_train_folds = np.array_split(X_train, num_folds)
#         y_train_folds = np.array_split(y_train, num_folds)

#         for k in k_choices:
#             accuracies[k] = []
#             for num_knn in range(0, num_folds):
#                 X_test = X_train_folds[num_knn]
#                 y_test = y_train_folds[num_knn]
#                 X_train = X_train_folds
#                 y_train = y_train_folds

#                 temp = np.delete(X_train, num_knn, 0)
#                 X_train = np.concatenate((temp), axis=0)
#                 y_train = np.delete(y_train, num_knn, 0)
#                 y_train = np.concatenate((y_train), axis=0)

#                 classifier = NearestNeighbourClassifier(k)
#                 classifier.fit(X_train, y_train)

#                 y_test_pred = classifier.predict(X_test)

#                 correct, accuracy = classifier.ClassificationResult(
#                     y_test_pred, y_test)

#                 accuracies[k].append(accuracy)
#         plot_Accuracies(k_choices, accuracies)

#     elif (num_folds is not None) and (type(num_folds) == list):
#         for fold in num_folds:
#             X_train_folds = []
#             y_train_folds = []
#             X_train_folds = np.array_split(X_train, fold)
#             y_train_folds = np.array_split(y_train, fold)

#             for k in k_choices:
#                 accuracies[k] = []
#                 for num_knn in range(0, fold):
#                     X_test = X_train_folds[num_knn]
#                     y_test = y_train_folds[num_knn]
#                     X_train = X_train_folds
#                     y_train = y_train_folds

#                     temp = np.delete(X_train, num_knn, 0)
#                     X_train = np.concatenate((temp), axis=0)
#                     y_train = np.delete(y_train, num_knn, 0)
#                     y_train = np.concatenate((y_train), axis=0)

#                     classifier = NearestNeighbourClassifier(k)
#                     classifier.fit(X_train, y_train)

#                     y_test_pred = classifier.predict(X_test)

#                     correct, accuracy = classifier.ClassificationResult(
#                         y_test_pred, y_test)

#                     accuracies[k].append(accuracy)
#             # Ploting the Acccuracy of the current fold.
#             plot_Accuracies(k_choices, accuracies)
