In [1]:
# Importing the needed libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# In order to to read the Ciffar data set, we must define the unpickle function: https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz). 
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Reading the data
data_1 = unpickle("/content/drive/My Drive/Colab Notebooks/cifar-10-batches-py/data_batch_1")
data_2 = unpickle("/content/drive/My Drive/Colab Notebooks/cifar-10-batches-py/data_batch_2")
data_3 = unpickle("/content/drive/My Drive/Colab Notebooks/cifar-10-batches-py/data_batch_3")
data_4 = unpickle("/content/drive/My Drive/Colab Notebooks/cifar-10-batches-py/data_batch_4")
data_5 = unpickle("/content/drive/My Drive/Colab Notebooks/cifar-10-batches-py/data_batch_5")
test_batch = unpickle("/content/drive/My Drive/Colab Notebooks/cifar-10-batches-py/test_batch")

In [5]:
# For the sake of computational cost, we will work on gray scale images - without feature normalization
data_1[b"data"] = data_1[b"data"].astype(float)[:,0:1024]
data_2[b"data"] = data_2[b"data"].astype(float)[:,0:1024]
data_3[b"data"] = data_3[b"data"].astype(float)[:,0:1024]
data_4[b"data"] = data_4[b"data"].astype(float)[:,0:1024]
data_5[b"data"] = data_5[b"data"].astype(float)[:,0:1024]
test_batch[b"data"] = test_batch[b"data"].astype(float)[:,0:1024]

In [6]:
#Preparing our data set for 5 fold validation. a list of 4 fold training set (training_data_n , training_labels_n) 
#and a fold for validation (validation_data_n, validation_labels_n) 
training_data_1 = []
training_data_1.append(data_1[b"data"])
training_data_1.append(data_2[b"data"])
training_data_1.append(data_3[b"data"])
training_data_1.append(data_4[b"data"])
training_labels_1 = data_1[b"labels"] + data_2[b"labels"] + data_3[b"labels"] + data_4[b"labels"] 
validation_data_1 = data_5[b"data"]
validation_labels_1 = data_5[b"labels"]

training_data_2 = []
training_data_2.append(data_1[b"data"])
training_data_2.append(data_2[b"data"])
training_data_2.append(data_3[b"data"])
training_data_2.append(data_5[b"data"])
training_labels_2 = data_1[b"labels"] + data_2[b"labels"] + data_3[b"labels"] + data_5[b"labels"] 
validation_data_2 = data_4[b"data"]
validation_labels_2 = data_4[b"labels"]

training_data_3 = []
training_data_3.append(data_1[b"data"])
training_data_3.append(data_2[b"data"])
training_data_3.append(data_4[b"data"])
training_data_3.append(data_5[b"data"])
training_labels_3 = data_1[b"labels"] + data_2[b"labels"] + data_4[b"labels"] + data_5[b"labels"] 
validation_data_3 = data_3[b"data"]
validation_labels_3 = data_3[b"labels"]

training_data_4 = []
training_data_4.append(data_1[b"data"])
training_data_4.append(data_3[b"data"])
training_data_4.append(data_4[b"data"])
training_data_4.append(data_5[b"data"])
training_labels_4 = data_1[b"labels"] + data_3[b"labels"] + data_4[b"labels"] + data_5[b"labels"] 
validation_data_4 = data_2[b"data"]
validation_labels_4 = data_2[b"labels"]

training_data_5 = []
training_data_5.append(data_2[b"data"])
training_data_5.append(data_3[b"data"])
training_data_5.append(data_4[b"data"])
training_data_5.append(data_5[b"data"])
training_labels_5 = data_2[b"labels"] + data_3[b"labels"] + data_4[b"labels"] + data_5[b"labels"] 
validation_data_5 = data_1[b"data"]
validation_labels_5 = data_1[b"labels"]

In [7]:
#preparing our data set with feature normalization
normalized_data_1 = (data_1[b"data"].astype(float)[:,0:1024] - np.mean(data_1[b"data"].astype(float)[:,0:1024],axis=1)[:,np.newaxis])/(np.std(data_1[b"data"].astype(float)[:,0:1024], axis=1)[:, np.newaxis])
normalized_data_2 = (data_2[b"data"].astype(float)[:,0:1024] - np.mean(data_2[b"data"].astype(float)[:,0:1024],axis=1)[:,np.newaxis])/(np.std(data_2[b"data"].astype(float)[:,0:1024], axis=1)[:, np.newaxis])
normalized_data_3 = (data_3[b"data"].astype(float)[:,0:1024] - np.mean(data_3[b"data"].astype(float)[:,0:1024],axis=1)[:,np.newaxis])/(np.std(data_3[b"data"].astype(float)[:,0:1024], axis=1)[:, np.newaxis])
normalized_data_4 = (data_4[b"data"].astype(float)[:,0:1024] - np.mean(data_4[b"data"].astype(float)[:,0:1024],axis=1)[:,np.newaxis])/(np.std(data_4[b"data"].astype(float)[:,0:1024], axis=1)[:, np.newaxis])
normalized_data_5 = (data_5[b"data"].astype(float)[:,0:1024] - np.mean(data_5[b"data"].astype(float)[:,0:1024],axis=1)[:,np.newaxis])/(np.std(data_5[b"data"].astype(float)[:,0:1024], axis=1)[:, np.newaxis])
normalized_test_batch = (test_batch[b"data"].astype(float)[:,0:1024] - np.mean(test_batch[b"data"].astype(float)[:,0:1024],axis=1)[:,np.newaxis])/(np.std(test_batch[b"data"].astype(float)[:,0:1024], axis=1)[:, np.newaxis])

In [8]:
#Preparing our normalized data set for 5 fold validation. a list of 4 fold training set (normalized_training_data_n , normalized_training_labels_n) 
#and a fold for validation (normalized_validation_data_n, normalized_validation_labels_n) 
normalized_training_data_1 = []
normalized_training_data_1.append(normalized_data_1)
normalized_training_data_1.append(normalized_data_2)
normalized_training_data_1.append(normalized_data_3)
normalized_training_data_1.append(normalized_data_4)
training_labels_1 = data_1[b"labels"] + data_2[b"labels"] + data_3[b"labels"] + data_4[b"labels"] 
normalized_validation_data_1 = normalized_data_5
validation_labels_1 = data_5[b"labels"]

normalized_training_data_2 = []
normalized_training_data_2.append(normalized_data_1)
normalized_training_data_2.append(normalized_data_2)
normalized_training_data_2.append(normalized_data_3)
normalized_training_data_2.append(normalized_data_5)
training_labels_2 = data_1[b"labels"] + data_2[b"labels"] + data_3[b"labels"] + data_5[b"labels"] 
normalized_validation_data_2 = normalized_data_4
validation_labels_2 = data_4[b"labels"]

normalized_training_data_3 = []
normalized_training_data_3.append(normalized_data_1)
normalized_training_data_3.append(normalized_data_2)
normalized_training_data_3.append(normalized_data_4)
normalized_training_data_3.append(normalized_data_5)
training_labels_3 = data_1[b"labels"] + data_2[b"labels"] + data_4[b"labels"] + data_5[b"labels"] 
normalized_validation_data_3 = normalized_data_3
validation_labels_3 = data_3[b"labels"]

normalized_training_data_4 = []
normalized_training_data_4.append(normalized_data_1)
normalized_training_data_4.append(normalized_data_3)
normalized_training_data_4.append(normalized_data_4)
normalized_training_data_4.append(normalized_data_5)
training_labels_4 = data_1[b"labels"] + data_3[b"labels"] + data_4[b"labels"] + data_5[b"labels"] 
normalized_validation_data_4 = normalized_data_2
validation_labels_4 = data_2[b"labels"]

normalized_training_data_5 = []
normalized_training_data_5.append(normalized_data_2)
normalized_training_data_5.append(normalized_data_3)
normalized_training_data_5.append(normalized_data_4)
normalized_training_data_5.append(normalized_data_5)
training_labels_5 = data_2[b"labels"] + data_3[b"labels"] + data_4[b"labels"] + data_5[b"labels"] 
normalized_validation_data_5 = normalized_data_1
validation_labels_5 = data_1[b"labels"]

In [9]:
#DistanceCalculation function --> inputs: training data as a list of 4 training_data or 4 normalized_training data, and validation_data or normalized_validation_data 
#Output:  a matrix that includes the distance of every validation image from each of the training image 
# as a distance formula, i have used the formula which is described in the "No-loop implementation" part of the https://ljvmiranda921.github.io/notebook/2017/02/09/k-nearest-neighbors/
def DistanceCalculation(training_data, validation_data):
  matrices = []
  sumsquared_validation = np.diagonal(np.dot(validation_data, validation_data.T)) #b^2
  
  for i in training_data:
      sumsquared_training = np.diagonal(np.dot(i, i.T))[:, np.newaxis] #a^2
      matrices.append(np.sqrt(sumsquared_training + sumsquared_validation - 2 * i.dot(validation_data.T))) # a^2 + b^2 -2*a*b
  
  distances = np.concatenate((matrices[0],matrices[1],matrices[2],matrices[3]),axis=0) 

  return distances

In [10]:
#KNN function --> input: a distances matrix (a number of images in the training data * a number of images in the validation or test data set)
#output: the accuracy
def KNN(distances,training_lables, validation_labels,k):
  

    #finding the indices of k smallest distances
    sorted_distances_indices = np.argsort(distances, axis=0)
    Ksmallest_distances_indices = sorted_distances_indices[0:k, ]
 
    #Finding the labels of K smallest distances using their indices
    Ksmallest_labels = np.zeros((k,10000))
    for i in range(0,10000):
        for j in range(0,k):
            Ksmallest_labels[j,i] = Ksmallest_labels[j,i] + training_lables[Ksmallest_distances_indices[j,i]]
  
  
    #predicted label: most frequent label in k smallest distances'label
    #argmax(array, axis = None, out = None) : Returns indices of the max element of the array in a particular axis
    #bincount: Count number of occurrences of each value in array of non-negative ints.
    predicted_labels = []
    for i in range(0,10000):
        a = Ksmallest_labels[:,i].astype(int)
        predicted_labels.append(np.bincount(a).argmax())
  

  #comparing our result and calculating accuracy
    a = 0
    for i in range(0,10000):
        if predicted_labels[i] == validation_labels[i]:
            a = a + 1
  
    accuracy = a/10000
  
    return (accuracy)

In [17]:
distance_1 = DistanceCalculation(training_data_1, validation_data_1)

In [18]:
#finding the accuracy for k between 1 and 20 - data set: first set of folds wihtout feature normalization
Acc_1 = []
for i in range(1,21):

    print(i)

    Acc_1.append(KNN(distances = distance_1,training_lables = training_labels_1, validation_labels = validation_labels_1,k=i))



print(Acc_1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.2733, 0.2345, 0.2446, 0.2512, 0.2558, 0.2548, 0.2571, 0.2583, 0.2631, 0.2613, 0.2622, 0.2603, 0.2572, 0.2575, 0.2597, 0.2588, 0.2585, 0.259, 0.2581, 0.2577]


In [13]:
#finding the accuracy for k between 1 and 20 - data set: first set of folds wiht feature normalization
normdistances_1 = DistanceCalculation(normalized_training_data_1, normalized_validation_data_1)
normalized_Acc_1 = []

for i in range(1,21):
    print(i)
    normalized_Acc_1.append(KNN(distances = normdistances_1,training_lables = training_labels_1, validation_labels = validation_labels_1,k=i))

    

print(normalized_Acc_1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.3467, 0.3148, 0.3347, 0.3461, 0.3533, 0.3575, 0.3577, 0.3612, 0.3609, 0.3594, 0.3613, 0.3577, 0.3546, 0.3567, 0.3574, 0.3565, 0.3561, 0.3583, 0.3592, 0.3591]


In [15]:
#finding the accuracy for k between 1 and 20 - data set: second set of folds wihtout feature normalization
distances_2 = DistanceCalculation(training_data_2,validation_data_2)
Acc_2 = []

for i in range(1,21):
    print(i)
    Acc_2.append(KNN(distances_2, training_labels_2, validation_labels_2, i))

print(Acc_2)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.272, 0.2357, 0.2532, 0.2563, 0.2584, 0.2614, 0.2617, 0.2641, 0.2633, 0.268, 0.2671, 0.264, 0.2655, 0.263, 0.2634, 0.2614, 0.2624, 0.2584, 0.2591, 0.2581]


In [12]:
#finding the accuracy for k between 1 and 20 - data set: second set of folds wiht feature normalization
normdistances_2 = DistanceCalculation(normalized_training_data_2,normalized_validation_data_2)
normalized_Acc_2 = []

for i in range(1,21):
    print(i)
    normalized_Acc_2.append(KNN(normdistances_2, training_labels_2, validation_labels_2, i))

print(normalized_Acc_2)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.3454, 0.3126, 0.3373, 0.3473, 0.3509, 0.3539, 0.3544, 0.3606, 0.3612, 0.3636, 0.3649, 0.3656, 0.3633, 0.3617, 0.3632, 0.3605, 0.3592, 0.3608, 0.361, 0.3596]


In [13]:
#finding the accuracy for k between 1 and 20 - data set: third set of folds wihtout feature normalization
distances_3 = DistanceCalculation(training_data_3,validation_data_3)
Acc_3 = []

for i in range(1,21):
    print(i)
    Acc_3.append(KNN(distances_3, training_labels_3, validation_labels_3, i))

print(Acc_3)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.2797, 0.2412, 0.2583, 0.2638, 0.2657, 0.2679, 0.2676, 0.2659, 0.2674, 0.2683, 0.2661, 0.2636, 0.263, 0.2623, 0.2617, 0.2629, 0.2612, 0.259, 0.26, 0.2587]


In [14]:
#finding the accuracy for k between 1 and 20 - data set: third set of folds with feature normalization
normdistances_3 = DistanceCalculation(normalized_training_data_3,normalized_validation_data_3)
normalized_Acc_3 = []
for i in range(1,21):
    print(i)
    normalized_Acc_3.append(KNN(normdistances_3, training_labels_3, validation_labels_3, i))

print(normalized_Acc_3)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.351, 0.3199, 0.3433, 0.3512, 0.3592, 0.3632, 0.3647, 0.3627, 0.3656, 0.3672, 0.3656, 0.3674, 0.3677, 0.3678, 0.3682, 0.3674, 0.3654, 0.3644, 0.3659, 0.3669]


In [13]:
#finding the accuracy for k between 1 and 20 - data set: fourth set of folds without feature normalization
distances_4 = DistanceCalculation(training_data_4,validation_data_4)
Acc_4 = []
for i in range(1,21):
    print(i)
    Acc_4.append(KNN(distances_4, training_labels_4, validation_labels_4, i))

print(Acc_4)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.2692, 0.2371, 0.2518, 0.2601, 0.2613, 0.2577, 0.2606, 0.2588, 0.2589, 0.2588, 0.2572, 0.2553, 0.257, 0.255, 0.2545, 0.2559, 0.2531, 0.2523, 0.2518, 0.2525]


In [11]:
#finding the accuracy for k between 1 and 20 - data set: fourth set of folds with feature normalization
normdistances_4 = DistanceCalculation(normalized_training_data_4,normalized_validation_data_4)
normalized_Acc_4 = []
for i in range(1,21):
    print(i)
    normalized_Acc_4.append(KNN(normdistances_4, training_labels_4, validation_labels_4, i))


print(normalized_Acc_4)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.3414, 0.3017, 0.325, 0.337, 0.3421, 0.3404, 0.3424, 0.3476, 0.3456, 0.3482, 0.3493, 0.3471, 0.3429, 0.3442, 0.3465, 0.3466, 0.3465, 0.3467, 0.3458, 0.3455]


In [None]:
#finding the accuracy for k between 1 and 20 - data set: fifth set of folds without feature normalization
distances_5 = DistanceCalculation(training_data_5,validation_data_5)
Acc_5 = []
for i in range(1,21):
    print(i)
    Acc_5.append(KNN(distances_5, training_labels_5, validation_labels_5, i))

In [12]:
print(Acc_5)

[0.2802, 0.2374, 0.2526, 0.2579, 0.2634, 0.2656, 0.2638, 0.2634, 0.2659, 0.2688, 0.2665, 0.2638, 0.2643, 0.261, 0.2625, 0.259, 0.2576, 0.2587, 0.2586, 0.2583]


In [11]:
#finding the accuracy for k between 1 and 20 - data set: fifth set of folds with feature normalization
normdistances_5 = DistanceCalculation(normalized_training_data_5,normalized_validation_data_5)
normalized_Acc_5 = []
for i in range(1,21):
    print(i)
    normalized_Acc_5.append(KNN(normdistances_5, training_labels_5, validation_labels_5, i))

print(normalized_Acc_5)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
[0.3504, 0.3199, 0.3342, 0.3383, 0.3439, 0.3481, 0.3486, 0.3522, 0.3541, 0.3521, 0.3533, 0.3542, 0.3551, 0.3531, 0.3559, 0.3575, 0.355, 0.3544, 0.3534, 0.3544]


In [11]:
#now the proper k has been selected: proper K without feature normalization = 1, proper K with feature normalization = 14
#preparing our data for the test phase without feature normalization
training_data_final = []
training_data_final.append(data_1[b"data"])
training_data_final.append(data_2[b"data"])
training_data_final.append(data_3[b"data"])
training_data_final.append(data_4[b"data"])
training_data_final.append(data_5[b"data"])

training_labels_final = data_1[b"labels"] + data_2[b"labels"] + data_3[b"labels"] + data_4[b"labels"] + data_5[b"labels"]  
test_data = test_batch[b"data"]
test_labels = test_batch[b"data"]

In [12]:
#preparing our data for the test phase with feature normalization
normalized_training_data_final = []
normalized_training_data_final.append(normalized_data_1)
normalized_training_data_final.append(normalized_data_2)
normalized_training_data_final.append(normalized_data_3)
normalized_training_data_final.append(normalized_data_4)
normalized_training_data_final.append(normalized_data_5)

training_labels_final = data_1[b"labels"] + data_2[b"labels"] + data_3[b"labels"] + data_4[b"labels"] + data_5[b"labels"]  
normalized_test_data = normalized_test_batch
test_labels = test_batch[b"labels"]

In [13]:
distances_test = DistanceCalculation(training_data_final,test_data)

In [14]:
#Calculating the accuracy, the training and test data set have not been normalized
Acc_test = KNN(distances_test, training_labels_final, test_labels, 1)
print(Acc_test)

0.2831


In [13]:
normalized_distance_test = DistanceCalculation(normalized_training_data_final,normalized_test_data)

In [14]:
#Calculating the accuracy, both of the training and test data set have been normalized
Acc_test_normalized = KNN(normalized_distance_test, training_labels_final, test_labels, 14)
print(Acc_test_normalized)

0.3607


In [13]:
normalized_distance_test_noramal_test_data = DistanceCalculation(normalized_training_data_final,test_data)

In [14]:
#Calculating the accuracy, just the training set has been normalized
Acc_test_normalized_regular_test_data = KNN(normalized_distance_test_noramal_test_data, training_labels_final, test_labels, 14)
print(Acc_test_normalized_regular_test_data)

0.3607


In [None]:
##we can see that we have reached the same accuracy for Acc_test_normalized_regular_test_data and Acc_test_normalized which make sense, 