# Importing Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# K Nearest Neighbor Classification using Euclidean Distance

### Code to calculate Euclidean Distance between two points

In [None]:
def Euclidean_dist(pt1,pt2):
    """Return the Euclidean distance between two points.

    Coordinates are paired by position, not by label, so lists, tuples,
    NumPy arrays and pandas Series (whatever their index) are all accepted.

    Parameters
    ----------
    pt1, pt2 : sequence of numbers
        Coordinates of the two points. Only the first ``len(pt1)``
        coordinates of ``pt2`` are used; any extra ones are ignored.

    Returns
    -------
    float
        Square root of the sum of squared coordinate differences
        (0.0 for empty points).

    Raises
    ------
    IndexError
        If ``pt2`` has fewer coordinates than ``pt1``.
    """
    if len(pt2) < len(pt1):
        raise IndexError("pt2 has fewer coordinates than pt1")
    # zip() walks the values positionally. Integer-key lookups such as pt1[i]
    # on a label-indexed Series depend on a deprecated positional fallback.
    squared_diffs = ((a - b) ** 2 for a, b in zip(pt1, pt2))
    return math.sqrt(sum(squared_diffs, 0.0))

### Code to predict the class of an observation by majority vote among its 'n' Nearest Neighbors

In [None]:
def Nearest_neighbors(train,test_obs,n):
    """Predict the class of ``test_obs`` by majority vote of its nearest training rows.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data. Every column except the last is a feature; the last
        column holds the class label.
    test_obs : sequence of numbers
        Feature values of the observation to classify, in the same order as
        the feature columns of ``train``.
    n : int
        Number of nearest neighbours that vote. Values larger than
        ``len(train)`` simply use every training row.

    Returns
    -------
    int
        The most frequent label among the neighbours. On a tie, the tied
        label whose first voter is closest to ``test_obs`` wins.

    Raises
    ------
    ValueError
        If ``n`` is less than 1 or ``train`` has no rows.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    if len(train) == 0:
        raise ValueError("train must contain at least one row")

    # Pull the data out of pandas once; per-row .iloc access is much slower.
    features = train.iloc[:, :-1].to_numpy()
    labels = train.iloc[:, -1].tolist()
    point = list(test_obs)

    distances = [Euclidean_dist(row, point) for row in features]

    # sorted() is stable, so equidistant rows keep their order from train.
    closest = sorted(range(len(distances)), key=distances.__getitem__)[:n]
    votes = [labels[i] for i in closest]
    return int(max(votes, key=votes.count))

### Code to return the prediction (0 or 1) for a test observation

In [None]:
def Prediction(train,test_obs,n):
    """Return the predicted class label for a single test observation.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data with the class label in the last column.
    test_obs : sequence of numbers
        Feature values of the observation to classify.
    n : int
        Number of nearest neighbours that vote.

    Returns
    -------
    int
        The label chosen by ``Nearest_neighbors``.
    """
    # Nearest_neighbors already returns the voted label as a single int, so
    # there is nothing to iterate over; n is forwarded rather than fixed at 3.
    return Nearest_neighbors(train, test_obs, n)

### Normalizing the predictors

In [None]:
def Normalize(data):
    """Standardize every column of ``data`` to zero mean and unit standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric columns to standardize. The statistics are computed from
        ``data`` itself, not from any other frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``data``, holding
        ``(value - column mean) / column standard deviation``. A constant
        column (standard deviation 0) becomes NaN.
    """
    # ddof=0 gives the population standard deviation, matching np.std;
    # pandas' own default is ddof=1.
    return (data - data.mean()) / data.std(ddof=0)

### F-Score

In [None]:
def F_score(Act,Pred):
    """Return the F1 score of the positive class (label 1).

    Computed as ``2*TP / (2*TP + FN + FP)`` from the binary confusion matrix.

    Parameters
    ----------
    Act : array-like
        Actual labels (0 or 1).
    Pred : array-like
        Predicted labels (0 or 1).

    Returns
    -------
    float
        The F1 score, or 0.0 when there are no true positives, false
        negatives or false positives (the ratio would be 0/0).
    """
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the
    # inputs; otherwise it collapses to 1x1 and the [1, 1] lookup fails.
    matrix = confusion_matrix(Act, Pred, labels=[0, 1])
    true_pos = matrix[1, 1]
    false_neg = matrix[1, 0]
    false_pos = matrix[0, 1]

    denominator = 2 * true_pos + false_neg + false_pos
    if denominator == 0:
        return 0.0
    return (2 * true_pos) / denominator

### Code to calculate the accuracy of the Model

In [None]:
def Accuracy(Act,Pred):
    """Return the fraction of observations whose predicted label matches the actual one.

    Parameters
    ----------
    Act : array-like
        Actual labels (0 or 1).
    Pred : array-like
        Predicted labels (0 or 1).

    Returns
    -------
    float
        ``(TN + TP) / len(Act)``.
    """
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the
    # inputs; otherwise it collapses to 1x1 and the [1, 1] lookup fails.
    matrix = confusion_matrix(Act, Pred, labels=[0, 1])
    correct = matrix[0, 0] + matrix[1, 1]
    return correct / len(Act)

# Importing the data.

### Data can be downloaded from <a href="https://www.kaggle.com/uciml/pima-indians-diabetes-database" target="_blank">here</a>.

In [None]:
# Load the diabetes CSV into a DataFrame (relative path; adjust it if the file lives elsewhere).
df= pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Summary statistics of the raw data, before any imputation.
df.describe()

# Data Exploration

#### Glucose, Blood Pressure, Skin Thickness, Insulin and BMI should all be greater than 0. A value of 0 in any of these columns indicates bad (missing) data.

#### Imputing 0 values with means

In [None]:
# A zero in these measurement columns stands for a missing reading, so mark it
# as NaN and let the mean imputation below fill it in.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
cols_with_invalid_zeros = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
df[cols_with_invalid_zeros] = df[cols_with_invalid_zeros].replace(0, np.nan)

In [None]:
# Replace every NaN with the mean of its own column; df is modified in place.
df.fillna(df.mean(),inplace=True)

In [None]:
# Summary statistics again, now that the missing values have been imputed.
df.describe()

In [None]:
# Histograms (10 bins each) of every column in df, to inspect their distributions

df.hist(bins=10,figsize=(15,10))

#### Looking at the Correlation matrix, we can hypothesize that Glucose levels, BMI and Age are vital in determining whether or not a patient has diabetes 

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
plt.figure(figsize=(15,10))
p=sns.heatmap(df.corr(),annot=True)

In [None]:
# Pairwise plots of the columns of df, coloured by the Outcome class.
sns.pairplot(data=df,hue='Outcome')

In [None]:
# Separate the predictor columns (X) from the target column (Y).
X=df.drop(columns='Outcome')
Y=df['Outcome']

In [None]:
# Standardize each predictor column (z-score) with the Normalize helper defined above.
X= Normalize(X)

In [None]:
# A sneak peek at the normalized data
X.head()

### Splitting the available data into 70% Train and 30% Test data

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.3,random_state=5)

In [None]:
# Append the Outcome labels to the training features: Nearest_neighbors reads the
# class label from the last column of the training frame.
X_train=X_train.join(Y_train)

In [None]:
# Print one shape per line. X_train already contains the joined Outcome column,
# so it has one more column than X_test.
print(X_train.shape,Y_train.shape,X_test.shape,Y_test.shape,sep='\n')

### Finding the best possible value for 'k', i.e. the optimal number of neighbors

* The model is developed on train dataset and the accuracy is calculated on the test data. 
* The value of k that gives maximum accuracy is considered the best

In [None]:
# Test-set accuracy for every k from 1 to 19; Acc[k-1] holds the score for k.
Acc=[]
for j in range(1,20):
    # Flat list with one predicted label per test row (same shape as Y_test).
    pred=[Nearest_neighbors(X_train,row,j) for _,row in X_test.iterrows()]
    Acc.append(Accuracy(Y_test,pred))

### The accuracy is maximum at k=17. 

In [None]:
# Display the accuracies collected above; position 0 corresponds to k=1.
Acc

In [None]:
# Acc[0] is the accuracy for k=1, so the best k is the position of the maximum plus one.
# It is computed once here instead of on every loop iteration.
best_k=Acc.index(max(Acc))+1
pred=[Nearest_neighbors(X_train,row,best_k) for _,row in X_test.iterrows()]

# assign() returns a new frame, so the new columns are not written into the
# frame handed back by train_test_split (which may raise SettingWithCopyWarning).
X_test=X_test.assign(Pred=pred,Outcome=Y_test)

### The model has an accuracy of 76%

In [None]:
from sklearn.metrics import classification_report

# Text report of the predictions in X_test['Pred'] against the true labels in X_test['Outcome'].
print(classification_report(X_test['Outcome'], X_test['Pred']))

### Summary from Confusion Matrix:
* True Negatives: 125
* False Positives: 35
* False Negatives: 21
* True Positives: 50

In [None]:
# Confusion matrix as a cross-tabulation (rows: true class, columns: predicted class),
# with row and column totals added by margins=True.
pd.crosstab(X_test['Outcome'], X_test['Pred'], rownames=['True'], colnames=['Predicted'], margins=True)