# Building KNN algorithm

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split

### Methods built for computing KNN classification

In [3]:
def execute_euclidean_formula(a,b):
    '''
    Method Name : execute_euclidean_formula
    Description : Computes the Euclidean distance between two points.
    Input:    a : vector coordinates of a point, type : numpy array
              b : vector coordinates of a point, type : numpy array
    Return      : Euclidean distance, type : numerical
    '''
    # Per-coordinate difference, squared and summed, then square-rooted.
    offsets = a - b
    squared_total = np.sum(np.square(offsets))
    return np.sqrt(squared_total)

def compute_euclidian_dist(x, y):
    '''
    Method Name  : compute_euclidian_dist
    Description  : Computes the Euclidean distance between every row of x (train data)
                   and every row of y (test data).
    Input      x : Vector coordinates of train data, type : DataFrame
               y : Vector coordinates of test data, type  : DataFrame
    Return       : DataFrame of float distances with x's index labels as the row index
                   and y's index labels as the column names, type : DataFrame

    Features are matched by column name, so the column order of x and y may differ;
    only columns present in both frames contribute. NaN coordinates are skipped in
    the sum rather than propagated.
    '''
    # Align the two frames on their shared feature columns before going positional.
    shared_columns = x.columns.intersection(y.columns)
    x_values = x[shared_columns].to_numpy(dtype=float)
    y_values = y[shared_columns].to_numpy(dtype=float)

    # One vectorised pass over all train rows per test row; indexing is positional,
    # so repeated index labels in x or y do not break the computation.
    distances = np.empty((x_values.shape[0], y_values.shape[0]), dtype=float)
    for position, y_row in enumerate(y_values):
        squared_diff = np.square(x_values - y_row)
        distances[:, position] = np.sqrt(np.nansum(squared_diff, axis=1))

    return pd.DataFrame(distances, index=x.index.values, columns=y.index.values)

def knn_prediction(X_train, y_train, X_test, k=1):
    '''
    Method        : knn_prediction
    Description   : Makes predictions using the KNN algorithm (Euclidean distance).
                    String targets are treated as classes and predicted by majority
                    vote (ties resolved to the smallest label); any other target is
                    predicted as the mean of the k nearest training targets.
    Input X_train : Training data with features, type : DataFrame
          y_train : Training data with only the target variable, indexed like X_train,
                    type : Series
          X_test  : Test data with features for which the prediction has to be done,
                    type : DataFrame
          k       : Number of neighbours used for each prediction, type : int
    returns       : predicted values indexed by X_test's index, type : Series (object dtype)
    raises        : ValueError if k < 1 or y_train is empty
    '''
    if k < 1:
        raise ValueError("k must be a positive integer")
    if len(y_train) == 0:
        raise ValueError("y_train must contain at least one sample")

    eculidian_dist = compute_euclidian_dist(X_train, X_test)

    # Decide classification vs. regression from the training target that was
    # passed in, never from a notebook-level variable such as the test labels.
    target_category = isinstance(y_train.iloc[0], str)

    distance_metrics = eculidian_dist.columns
    y_pred = pd.Series(index=distance_metrics, dtype=object)
    for col in distance_metrics:
        # Stable sort keeps the choice among equally distant neighbours deterministic.
        nearest = eculidian_dist[col].sort_values(ascending=True, kind='mergesort').head(k)
        pred_values = y_train.loc[nearest.index.values]
        if target_category:
            # Series.mode() returns the modes sorted, so .iloc[0] is the smallest
            # most-frequent label; unlike scipy.stats.mode it accepts strings.
            y_pred.loc[col] = pred_values.mode().iloc[0]
        else:
            y_pred.loc[col] = np.mean(pred_values)
    return y_pred

### Loading Data

In [5]:
# The iris dataset is used for testing.
# Read the CSV and drop its 'Unnamed: 0' column.
iris_data = pd.read_csv('iris.csv').drop(columns=['Unnamed: 0'])
iris_data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Testing compute_euclidian_dist function

In [10]:
# Two small, non-overlapping row slices with the label column removed.
x = iris_data.iloc[1:10].drop(columns='Species')
y = iris_data.iloc[11:20].drop(columns='Species')

In [11]:
# Distance table: rows are labelled by x's index, columns by y's index.
compute_euclidian_dist(x, y)

Unnamed: 0,11,12,13,14,15,16,17,18,19
1,0.458258,0.141421,0.678233,1.360147,1.627882,1.053565,0.547723,1.174734,0.83666
2,0.374166,0.264575,0.5,1.363818,1.587451,1.00995,0.519615,1.236932,0.754983
3,0.374166,0.264575,0.519615,1.529706,1.714643,1.16619,0.655744,1.322876,0.866025
4,0.34641,0.640312,0.974679,0.916515,1.086278,0.547723,0.173205,0.793725,0.264575
5,0.812404,1.161895,1.571623,0.678233,0.616441,0.4,0.591608,0.331662,0.387298
6,0.3,0.489898,0.616441,1.360147,1.493318,0.953939,0.509902,1.208305,0.648074
7,0.223607,0.469042,0.905539,1.044031,1.236932,0.7,0.2,0.83666,0.424264
8,0.67082,0.424264,0.34641,1.791647,1.997498,1.431782,0.927362,1.612452,1.148913
9,0.34641,0.173205,0.728011,1.311488,1.555635,1.00995,0.5,1.1,0.754983


##### The function calculates the distance between every row of x and every row of y and returns the result as a DataFrame.

### Testing knn_prediction() 

In [5]:
# Splitting train and test using sklearn's train_test_split.
# The split is stratified on the species label, and random_state is fixed so that
# re-running the notebook reproduces the same split and the same comparison result.

X = iris_data.drop('Species', axis=1)
y = iris_data['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20, random_state=42)

In [6]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train_t, X_test_t = scaler.transform(X_train), scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames that keep the original row labels
# and column names.
X_train = pd.DataFrame(X_train_t, index=X_train.index.values, columns=X_train.columns)
X_test = pd.DataFrame(X_test_t, index=X_test.index.values, columns=X_test.columns)

In [33]:
# Reference predictions from sklearn.neighbors.KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
# Re-label the predicted array with the test-set index.
yhat_sklearn = pd.Series(classifier.predict(X_test), index=y_test.index.values)

In [23]:
# Prediction done using the knn_prediction method built above, with 5 neighbours.

y_hat_knn_prediction = knn_prediction(X_train, y_train, X_test, 5)

In [36]:
# Check whether the predictions made by sklearn's KNeighborsClassifier match the
# predictions made by knn_prediction: count the rows where the two disagree.
(yhat_sklearn != y_hat_knn_prediction).sum()

0

### Conclusion 
##### The predictions of the method developed here are the same as those of KNeighborsClassifier from the sklearn.neighbors package.