In [1]:
import math 
import numpy as np 
import pandas as pd 
from statistics import mode

In [None]:
# Note: predicting a single vector is not supported yet; pass at least 2 rows to predict().

In [2]:
class kNN_Scratch():
    """
        k-nearest-neighbours classifier written from scratch.

        Every prediction is the most frequent label among the k training
        rows that are closest (Euclidean distance) to the query row.
    """

    def __init__(self, k):
        """
            Initializes the kNN model.

            Stores the self.k variable, which
            will be the number of nearest neighbours.

            Also initializes self.rows, the list that fit() fills with
            one (feature_vector, label) tuple per training sample.
        """
        self.k = k
        self.rows = list()

    def distance(self, vec1, vec2):
        """
            Euclidean distance function:
            Calculates the distance between the two given feature vectors.

            Both vectors are converted to float arrays first, so narrow
            integer dtypes cannot overflow when the differences are squared.

            Returns the distance as a numpy float.
            Raises ValueError when the vectors do not have the same shape
            (previously the extra features of vec2 were silently ignored).
        """
        first = np.asarray(vec1, dtype=float)
        second = np.asarray(vec2, dtype=float)
        if first.shape != second.shape:
            raise ValueError(
                "vectors must have the same shape, got {} and {}".format(
                    first.shape, second.shape
                )
            )
        return np.sqrt(np.sum(np.power(first - second, 2)))

    def fit(self, X, y):
        """
           Stores the whole dataset in self.rows as (feature_vector, label) tuples.

           X is the 2-D collection of feature rows and y holds one label per row.
           Any rows stored by a previous fit() call are replaced, not extended.

           Returns self, so calls can be chained.
           Raises ValueError when X and y do not have the same number of rows.
        """
        samples = np.asarray(X, dtype=float)
        # Iterate y positionally: indexing a pandas Series with y[i] is
        # label-based and picks the wrong row (or fails) after a shuffle/split.
        labels = list(y)
        if len(samples) != len(labels):
            raise ValueError(
                "X and y must have the same number of rows, got {} and {}".format(
                    len(samples), len(labels)
                )
            )
        self.rows = list(zip(samples, labels))
        return self

    def predict(self, X):
        """
           Predicts a label for every row of X (a 2-D collection of feature rows).

           Returns a 1-D numpy array with one prediction per row of X.
           Raises ValueError when there are rows to predict but k is not
           between 1 and the number of fitted samples (this also covers
           calling predict() before fit()).
        """
        queries = np.asarray(X, dtype=float)
        fitted_count = len(self.rows)
        if len(queries) and not 1 <= self.k <= fitted_count:
            raise ValueError(
                "k must be between 1 and the number of fitted samples "
                "(k={!r}, fitted samples={}); call fit() first".format(
                    self.k, fitted_count
                )
            )

        predictions = []
        for row in queries:
            # Distance from the query row to every stored training row.
            dists = np.array(
                [self.distance(row, train_row) for train_row, _ in self.rows]
            )

            # A stable sort keeps training order among equally distant rows.
            nearest = np.argsort(dists, kind="stable")[:self.k]

            # Labels of the k nearest neighbours, closest first.
            votes = [self.rows[idx][1] for idx in nearest]

            # Most frequent label wins.
            predictions.append(mode(votes))

        # A single append onto an empty float array avoids re-copying the
        # result on every iteration while keeping the dtype the per-row
        # np.append produced (float for numeric labels).
        return np.append(np.array([]), predictions)
        

# Testing on a dataset

In [3]:
# Load the ads data: every column except the last is a feature, the last one is the label.
dataset = pd.read_csv('social_ads.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [4]:
# Hold out 25% of the samples as a test set; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [5]:
# Display the loaded frame to inspect its columns and a few rows.
dataset

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
...,...,...,...
395,46,41000,1
396,51,23000,1
397,50,20000,1
398,36,33000,0


## The from-scratch model

In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [7]:
# Build a 5-neighbour from-scratch classifier and train it in one step
# (fit() returns the model itself, so the calls can be chained).
kNN_scr = kNN_Scratch(5).fit(X_train, y_train)

# Predict a label for every row of the test set.
y_pred = kNN_scr.predict(X_test)

In [8]:
# Compute the confusion matrix of the true test labels vs. the predictions in y_pred.
cm = confusion_matrix(y_test, y_pred) 

print(cm) 
# Accuracy of the predictions in y_pred on the test set (displayed as the cell output).
accuracy_score(y_test, y_pred)

[[59  9]
 [ 8 24]]


0.83

## Imported model

In [9]:
from sklearn.neighbors import KNeighborsClassifier
# Reference model: 5 neighbours, Minkowski metric with p=2 (i.e. Euclidean distance).
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)

# Train on the same split; the fitted estimator is the cell's displayed output.
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [12]:
# Predict the test set with the scikit-learn classifier (this overwrites the earlier y_pred).
y_pred = classifier.predict(X_test) 

In [13]:
# Compute the confusion matrix of the true test labels vs. the predictions in y_pred.
cm = confusion_matrix(y_test, y_pred) 

print(cm) 
# Accuracy of the predictions in y_pred on the test set (displayed as the cell output).
accuracy_score(y_test, y_pred)

[[59  9]
 [ 8 24]]


0.83

### Conclusion:
##### 1. According to the confusion matrix, 17 of the 100 test predictions are incorrect (9 + 8), so 83 are correct.
##### 2. Both models reach the same accuracy (0.83), but the from-scratch model is much slower :)