In [2]:
# Import the needed packages
import numpy as np  # numerical processing
import pandas as pd # dataframes

In [3]:
# Read the four tab-separated input files. header=None makes pandas treat the
# first line as data and label the columns with integers (0, 1, ...).
train_data = pd.read_csv("trainMatrixModified.txt", header=None, sep="\t")
test_data = pd.read_csv("testMatrixModified.txt", header=None, sep="\t")
train_class = pd.read_csv("trainClasses.txt", header=None, sep="\t")
test_class = pd.read_csv("testClasses.txt", header=None, sep="\t")

In [4]:
# Flip rows and columns of both matrices so that the row count of X_train
# lines up with the number of training labels (800 in the recorded output).
X_train = train_data.transpose()
X_test = test_data.transpose()
X_train.shape

(800, 5500)

In [5]:
# Column 1 of each label table is used as the target vector
# (0/1 integer values in the recorded output).
y_train = train_class.loc[:, 1]
y_test = test_class.loc[:, 1]
y_train.shape

(800,)

In [7]:
# Inspect the training labels; pandas abbreviates the long Series on display.
y_train

0      0
1      1
2      0
3      1
4      0
      ..
795    0
796    1
797    1
798    1
799    1
Name: 1, Length: 800, dtype: int64

In [8]:
# Inspect the test labels; pandas abbreviates the long Series on display.
y_test

0      1
1      0
2      0
3      1
4      1
      ..
195    1
196    1
197    0
198    1
199    1
Name: 1, Length: 200, dtype: int64

In [9]:
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Inputs may be pandas DataFrames/Series, NumPy arrays or nested lists; they
    are converted to NumPy arrays internally, so rows and labels are always
    matched by position rather than by pandas index label.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer.
    """

    def __init__(self, k=3):
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples and their labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training feature matrix.
        y : array-like of length n_samples
            Class labels. ``predict`` votes with ``np.bincount``, so labels
            must be non-negative integers.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` disagree on the sample count.
        """
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y).ravel()
        if features.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {features.shape}")
        if labels.shape[0] != features.shape[0]:
            raise ValueError(
                f"X has {features.shape[0]} samples but y has {labels.shape[0]} labels"
            )
        # Keep the caller's objects available under the original attribute names.
        self.X_train = X
        self.y_train = y
        # Positional NumPy copies used for the actual computation.
        self._X = features
        self._y = labels

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_test : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray
            One predicted label per query row.

        Raises
        ------
        ValueError
            If ``X_test`` is not 2-D or its feature count differs from the
            training data.
        """
        queries = np.asarray(X_test, dtype=float)
        if queries.ndim != 2:
            raise ValueError(f"X_test must be 2-dimensional, got shape {queries.shape}")
        if queries.shape[1] != self._X.shape[1]:
            raise ValueError(
                f"X_test has {queries.shape[1]} features, expected {self._X.shape[1]}"
            )
        y_pred = [self._get_single_prediction(row) for row in queries]
        return np.array(y_pred)

    def _get_single_prediction(self, x_test_row):
        """Return the majority label among the k training rows nearest to one query row."""
        row = np.asarray(x_test_row, dtype=float)
        # Distance from the query to every training row in one vectorised step.
        distances = np.sqrt(np.sum((self._X - row) ** 2, axis=1))
        # Positions of the k smallest distances.
        k_idx = np.argsort(distances)[:self.k]
        # Positional lookup: k_idx holds row positions, not index labels.
        k_labels = self._y[k_idx]
        # Most frequent label; np.argmax picks the smallest label on a tied vote.
        return np.argmax(np.bincount(k_labels))

    def _get_euclidean_distance(self, x1, x2):
        """Euclidean distance between two equal-length vectors.

        Retained for callers that use it directly; prediction itself uses the
        vectorised computation in ``_get_single_prediction``.
        """
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        return np.sqrt(np.sum(diff ** 2))

In [10]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted to NumPy arrays first, so lists, arrays and
    pandas Series are compared element by element and by position (pandas
    index labels are ignored).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        Share of matching entries in [0, 1]; NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"shape mismatch: y_true {true_arr.shape} vs y_pred {pred_arr.shape}"
        )
    if true_arr.size == 0:
        # Nothing to score; avoid a division by zero.
        return np.float64(np.nan)
    return np.mean(true_arr == pred_arr)

In [11]:
# Fit the from-scratch KNN on the training split and predict the test split.
k = 3
clf = KNN(k=k)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

score = accuracy(y_test, predictions)

# Report accuracy and the confusion matrix (rows: true class, columns: predicted class).
print(f"Accuracy: {score}")
# The matrix is computed from y_test, so it is labelled as test-set output.
print("Confusion Matrix on test data:\n", confusion_matrix(y_test, predictions))

Accuracy: 0.81
Confusion Matrix on training data:
 [[79 20]
 [18 83]]


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Reference run: scikit-learn's KNeighborsClassifier with 3 neighbours,
# scored with the same accuracy helper and confusion matrix.
# NOTE: X_test is rebound to a C-contiguous NumPy array here, so any cell
# executed after this one sees the array rather than the earlier object.
X_test = np.ascontiguousarray(X_test)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
pred = knn.predict(X_test)

score = accuracy(y_test, pred)
print(f"Accuracy: {score}")
print(confusion_matrix(y_test, pred))

Accuracy: 0.8
[[85 14]
 [26 75]]
