<a href="https://colab.research.google.com/github/sangchoi93/data_minor/blob/knn/knnProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [340]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import KFold, cross_val_score

**Reading dataset**

In [341]:
df=pd.read_csv('heart_failure_clinical_records_dataset.csv')
df['DEATH_EVENT'].value_counts()

0    203
1     96
Name: DEATH_EVENT, dtype: int64

In [342]:
df.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

**Selecting features and class**

In [343]:
X = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time']].values
X[0:5]

array([[7.5000000e+01, 0.0000000e+00, 5.8200000e+02, 0.0000000e+00,
        2.0000000e+01, 1.0000000e+00, 2.6500000e+05, 1.9000000e+00,
        1.3000000e+02, 1.0000000e+00, 0.0000000e+00, 4.0000000e+00],
       [5.5000000e+01, 0.0000000e+00, 7.8610000e+03, 0.0000000e+00,
        3.8000000e+01, 0.0000000e+00, 2.6335803e+05, 1.1000000e+00,
        1.3600000e+02, 1.0000000e+00, 0.0000000e+00, 6.0000000e+00],
       [6.5000000e+01, 0.0000000e+00, 1.4600000e+02, 0.0000000e+00,
        2.0000000e+01, 0.0000000e+00, 1.6200000e+05, 1.3000000e+00,
        1.2900000e+02, 1.0000000e+00, 1.0000000e+00, 7.0000000e+00],
       [5.0000000e+01, 1.0000000e+00, 1.1100000e+02, 0.0000000e+00,
        2.0000000e+01, 0.0000000e+00, 2.1000000e+05, 1.9000000e+00,
        1.3700000e+02, 1.0000000e+00, 0.0000000e+00, 7.0000000e+00],
       [6.5000000e+01, 1.0000000e+00, 1.6000000e+02, 1.0000000e+00,
        2.0000000e+01, 0.0000000e+00, 3.2700000e+05, 2.7000000e+00,
        1.1600000e+02, 0.0000000e+00, 0.0000

In [344]:
y = df['DEATH_EVENT'].values
y[0:5]

array([1, 1, 1, 1, 1])

**Splitting and scaling data for training and testing**

In [345]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


**Distance functions**

In [346]:
# Distances
def euclidian(p1, p2): 
    dist = 0
    for i in range(len(p1)):
        dist = dist + np.square(p1[i]-p2[i])
    dist = np.sqrt(dist)
    return dist;

def manhattan(p1, p2): 
    dist = 0
    for i in range(len(p1)):
        dist = dist + abs(p1[i]-p2[i])
    return dist;

def minkowski(p1, p2, q): 
    dist = 0
    for i in range(len(p1)):
        dist = dist + abs(p1[i]-p2[i])**q
    dist = np.sqrt(dist)**(1/q)
    return dist;

**Knn algorithm**

In [347]:
# kNN Function
def knn_predict(X_train,y_train, X_test, k, dist='euclidian',q=2):
    pred = []
    # Adjusting the data type
    if isinstance(X_test, np.ndarray):
        X_test=pd.DataFrame(X_test)
    if isinstance(X_train, np.ndarray):
        X_train=pd.DataFrame(X_train)
        
    for i in range(len(X_test)):    
        # Calculating distances for our test point
        newdist = np.zeros(len(y_train))

        if dist=='euclidian':
            for j in range(len(y_train)):
                newdist[j] = euclidian(X_train.iloc[j,:], X_test.iloc[i,:])
    
        if dist=='manhattan':
            for j in range(len(y_train)):
                newdist[j] = manhattan(X_train.iloc[j,:], X_test.iloc[i,:])
    
        if dist=='minkowski':
            for j in range(len(y_train)):
                newdist[j] = minkowski(X_train.iloc[j,:], X_test.iloc[i,:],q)

        # Merging actual labels with calculated distances
        newdist = np.array([newdist, y_train])

        ## Finding the closest k neighbors
        # Sorting index
        idx = np.argsort(newdist[0,:])

        # Sorting the all newdist
        newdist = newdist[:,idx]
        #print(newdist)

        # We should count neighbor labels and take the label which has max count
        # Define a dictionary for the counts
        c = {'0':0,'1':0,'2':0 }
        # Update counts in the dictionary 
        for j in range(k):
            c[str(int(newdist[1,j]))] = c[str(int(newdist[1,j]))] + 1

        key_max = max(c.keys(), key=(lambda k: c[k]))
        pred.append(int(key_max))
        
    return pred


**Finding best k value for Knn**

In [348]:
knn_model_bestk_time=[]
myknn_model_bestk_time=[]
def find_Ks2(X_train, X_test, y_train, y_test, Ks):
    mean_acc2 = np.zeros((Ks-1))
    std_acc = np.zeros((Ks-1))
    for n in range(1,Ks):
        
        start = int(round(time.time() * 1000))
        #Train Model and Predict  
        neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
        yhat=neigh.predict(X_test)
        mean_acc[n-1] = accuracy_score(y_test, yhat)
        std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])
        time_knn_scikit = int(round(time.time() * 1000)) - start
        knn_model_bestk_time.append(time_knn_scikit)

        start = int(round(time.time() * 1000))
        #Train Model and Predict  
        yhat = knn_predict(X_train, y_train, X_test, k=n)
        mean_acc2[n-1] = accuracy_score(y_test, yhat)
        std_acc2 = np.std(mean_acc2)
        time_knn_myown = int(round(time.time() * 1000)) - start
        myknn_model_bestk_time.append(time_knn_myown)

    return(mean_acc2, std_acc2,mean_acc, std_acc)
Ks = 20

mean_acc2, std_acc2,mean_acc, std_acc = find_Ks2(X_train, X_test, y_train, y_test, Ks)


print( "The best accuracy for scikit learn knn was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 
print( "The best accuracy for my own knn was with", mean_acc2.max(), "with k=", mean_acc2.argmax()+1) 

The best accuracy for scikit learn knn was with 0.8 with k= 11
The best accuracy for my own knn was with 0.8 with k= 11


In [349]:
best_k_knn = mean_acc.argmax()+1
best_k_myknn = mean_acc2.argmax()+1
kfold = KFold(n_splits=10)
kfold.split(X_train)
knn_model_time=[]
knn_accuracy_model = []
myknn_model_time=[]
myknn_accuracy_model=[]

for train_index, test_index in kfold.split(X):
    # Split train-test
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    start = int(round(time.time() * 1000))
    neigh = KNeighborsClassifier(n_neighbors = best_k_knn)
    knn_model = neigh.fit(X_train,y_train)
    knn_accuracy_model.append(accuracy_score(y_test,neigh.predict(X_test),normalize=True)*100)
    time_knn = int(round(time.time() * 1000)) - start
    knn_model_time.append(time_knn)

    start = int(round(time.time() * 1000))
    yhat = knn_predict(X_train, y_train, X_test, k=best_k_myknn)
    myknn_accuracy_model.append(accuracy_score(y_test, yhat)*100)
    time_myknn = int(round(time.time() * 1000)) - start
    myknn_model_time.append(time_myknn)

    diff = [(i / j)*100 for i, j in zip(myknn_model_time,knn_model_time)] 
results_df = pd.DataFrame()
results_df['knn_accuracy_model'] = knn_accuracy_model
results_df['knn_model_time'] = knn_model_time
results_df['myknn_accuracy_model'] = myknn_accuracy_model
results_df['myknn_model_time'] = myknn_model_time
results_df['%diff'] = diff
results_df

Unnamed: 0,knn_accuracy_model,knn_model_time,myknn_accuracy_model,myknn_model_time,%diff
0,10.0,7,10.0,3546,50657.142857
1,16.666667,4,16.666667,3451,86275.0
2,53.333333,4,53.333333,3528,88200.0
3,76.666667,4,76.666667,3524,88100.0
4,76.666667,4,76.666667,3486,87150.0
5,90.0,4,90.0,3516,87900.0
6,60.0,4,60.0,3545,88625.0
7,80.0,4,80.0,3511,87775.0
8,86.666667,4,86.666667,3527,88175.0
9,86.206897,4,86.206897,3416,85400.0
