# **Importing the libraries**

In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# **Importing the datasets**

In [2]:
# Folder holding the MNIST CSV exports. Set the MNIST_DATA_DIR environment
# variable to point somewhere else; the default is the location this notebook
# was originally run from.
DATA_DIR = Path(os.environ.get("MNIST_DATA_DIR", "/home/somya/Downloads"))

# NOTE(review): read_csv treats the first line of each file as a header. The
# column names shown by head() ('6', '0', '0.1', ...) look like a data row that
# was consumed as the header -- confirm, and consider header=None (the cells
# that select the '6' / '7' label columns would need updating accordingly).
train_df = pd.read_csv(DATA_DIR / "mnist_train_small.csv")
test_df = pd.read_csv(DATA_DIR / "mnist_test.csv")

In [3]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,7,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,2,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **Arranging the training dataset**

In [5]:
# Feature matrix: every column except the label column, which is named '6'
# in this frame.
train_features = train_df.drop(columns=['6'])
x_train = train_features.to_numpy()

In [6]:
# Show the training feature matrix followed by its (rows, columns) shape.
print(x_train, x_train.shape, sep="\n")

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(19999, 784)


In [7]:
# Training labels live in the column named '6'; store them as an (n, 1)
# column vector.
y1 = train_df['6']
y_train = np.array(y1).reshape(-1, 1)

In [8]:
# Show the training label column followed by its shape.
print(y_train, y_train.shape, sep="\n")

[[5]
 [7]
 [9]
 ...
 [2]
 [9]
 [5]]
(19999, 1)


# **Arranging the test dataset**

In [9]:
# Test feature matrix: drop the label column ('7') and the row with index 1942.
# NOTE(review): the reason for removing row 1942 is not stated; presumably that
# row is incomplete (several test columns were parsed as float, which hints at
# missing values) -- confirm.
test_features = test_df.drop(index=1942, columns='7')
x_test = test_features.to_numpy()

In [10]:
# Show the test feature matrix followed by its (rows, columns) shape.
print(x_test, x_test.shape, sep="\n")

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(1942, 784)


In [11]:
# Test labels live in the column named '7'. Row 1942 is removed here as well so
# the labels stay aligned with x_test; the result is an (n, 1) column vector.
y2 = test_df['7']
y_test = np.array(y2.drop(index=1942)).reshape(-1, 1)

In [12]:
# Show the test label column followed by its shape.
print(y_test, y_test.shape, sep="\n")

[[2]
 [1]
 [0]
 ...
 [5]
 [7]
 [8]]
(1942, 1)


# **Implementation of model**

In [13]:
def distance(x1, x2):
    """Return the Euclidean distance between two vectors.

    Parameters
    ----------
    x1, x2 : numpy.ndarray
        Arrays of matching shape (1-D feature vectors in this notebook).

    Returns
    -------
    The square root of the dot product of ``x1 - x2`` with itself.
    """
    delta = x1 - x2
    squared_length = np.dot(delta.T, delta)
    return np.sqrt(squared_length)

In [14]:
class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        """Store the training data; no computation happens here.

        Parameters
        ----------
        X : array-like, 2-D
            Training feature matrix, one sample per row.
        y : array-like
            Non-negative integer class label for each row of ``X``; either
            1-D or a single-column 2-D array.
        """
        self.x_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Returns
        -------
        numpy.ndarray
            Integer column vector with ``len(X) + 1`` rows. Row 0 is a
            placeholder ``0`` that this method has always emitted and that
            callers are expected to discard; rows ``1:`` hold the predictions
            in the order of ``X``.
        """
        labels = [self._predict(x) for x in X]
        # Build the output in one step instead of re-stacking the whole array
        # after every sample. The leading 0 is kept for backward compatibility.
        return np.array([0] + labels).reshape(-1, 1)

    def _predict(self, x):
        """Return the majority label among the ``k`` training rows closest to ``x``.

        Ties between labels are resolved in favour of the smallest label,
        because ``np.argmax`` returns the first maximum of the vote counts.
        """
        train_x = np.asarray(self.x_train)

        # Euclidean distance from x to every training row, computed in a single
        # vectorised call rather than appending one distance at a time.
        distances = np.linalg.norm(train_x - x, axis=1)

        # Indices of the k closest training samples.
        nearest = np.argsort(distances)[: self.k]

        # Use the labels stored by fit() -- not a notebook-level global -- so
        # the model depends only on the data it was fitted with. ravel()
        # accepts both 1-D labels and a single-column 2-D array.
        neighbour_labels = np.asarray(self.y_train).ravel()[nearest].astype(int)

        # Most common class label among the neighbours.
        return np.argmax(np.bincount(neighbour_labels))

In [15]:
# Number of neighbours that vote on each prediction.
N_NEIGHBORS = 3
data = KNN(k=N_NEIGHBORS)

In [16]:
# Hand the training features and labels to the classifier.
data.fit(X=x_train, y=y_train)

In [17]:
# Classify every test sample. The returned array carries one extra leading
# placeholder row in front of the real predictions.
y_pred = data.predict(X=x_test)

In [18]:
# predict() returns one extra leading placeholder row. Strip it only while it
# is still present, so re-running this cell cannot delete real predictions.
if y_pred.shape[0] == x_test.shape[0] + 1:
    y_pred = np.delete(y_pred, 0, axis=0)

In [19]:
# Put the true labels and the predictions side by side, one row per test sample.
l = np.concatenate((y_test, y_pred), axis=1)
result = pd.DataFrame(data=l, columns=["Actual", "Predicted"])
print(result)

      Actual  Predicted
0          2          2
1          1          1
2          0          0
3          4          4
4          1          1
...      ...        ...
1937       4          4
1938       1          1
1939       5          5
1940       7          7
1941       8          8

[1942 rows x 2 columns]


In [20]:
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred) in that order and expects 1-D label
# arrays, so the (n, 1) column vectors are flattened before the comparison.
accuracy = accuracy_score(y_test.ravel(), y_pred.ravel())
print("Accuracy of the model is:", accuracy * 100, "%")

Accuracy of the model is: 93.92378990731206 %
