In [1]:
import pandas as pd
import numpy as np
import random
import math
import matplotlib as mt
import matplotlib.pyplot as plt

In [2]:
# Data loading
def readdata():
    """Load ``heart.csv`` (path relative to the working directory) into a DataFrame."""
    return pd.read_csv("heart.csv")

In [3]:
# Standardization helper
def normalize(X, mean, std):
    """Return ``X`` centered on ``mean`` and divided by ``std``.

    Works on anything that supports ``-`` and ``/`` (scalars, arrays, Series).
    """
    centered = X - mean
    return centered / std

In [4]:
# Euclidean distance between two rows
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between ``row1`` and ``row2``.

    Only the first ``len(row1) - 1`` positions are compared; the last element
    of ``row1`` is left out of the sum.
    """
    compared = range(len(row1) - 1)
    # Start from 0.0 so the result is a float even when nothing is compared.
    squared_sum = sum(((row1[pos] - row2[pos]) ** 2 for pos in compared), 0.0)
    return math.sqrt(squared_sum)

In [5]:
# Categorize the attributes by type: numeric columns vs. categorical columns.
# The continuous list is what the scaling loop and getDistance iterate over.
continuous_variables  = ['Age', 'Cholesterol', 'Oldpeak', 'RestingBP', 'MaxHR']
categorical_variables = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'FastingBS', 'ExerciseAngina']

In [6]:
# Scaling of continuous variables
def scaleFeature(df, col):
    """Standardize column ``col`` of ``df`` in place.

    The column's own mean and standard deviation (pandas ``Series.std``) are
    used, and the column is overwritten with the normalized values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    col : str
        Name of the column to standardize.
    """
    column = df[col]
    # normalize() is plain arithmetic, so it can be applied to the whole
    # Series at once instead of calling it element by element through apply().
    df[col] = normalize(column, column.mean(), column.std())

In [7]:
# One-hot encode the string-valued categorical columns of the raw dataset.
# `data` and `oneh_encoded_data` are two names for the same DataFrame object.
data = pd.get_dummies(
    readdata(),
    columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
)
oneh_encoded_data = data

In [8]:
# Display the one-hot encoded frame to check the generated dummy columns.
data

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,1,0,1,1,...,0,0,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,0,1,1,...,0,0,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0


In [None]:
# Standardize every continuous column of `data` in place (scaleFeature overwrites the column).
# NOTE: the mean/std are computed on the full dataset, i.e. before the
# train/validation/test split, so held-out rows contribute to the scaling statistics.
for col in continuous_variables:
    scaleFeature(data, col)

In [None]:
# Separate the original dataset into training and testing sets:
# a seeded random 80% of the rows goes to `train`, the remaining 20% to `test`.
train=data.sample(frac=0.8,random_state=100)
test=data.drop(train.index)
# Split `train` once more: 80% of it becomes `newtrain` (the set neighbors are
# drawn from), the remaining 20% becomes `validation`.
# All of these frames keep the row labels of `data`; their indexes are not reset.
newtrain = train.sample(frac=0.8,random_state=0) 
validation = train.drop(newtrain.index)

# Prediction column, initialised to 0 and overwritten per row by knnClassify.
validation['y'] = 0
test['y'] = 0

In [None]:
# Report the (rows, columns) shape of each split.
for label, frame in (
    ('orginal dataset ', data),
    ('training dataset ', newtrain),
    ('validation dataset ', validation),
    ('test dataset ', test),
):
    print(label + str(frame.shape))

In [None]:
# Euclidean (L2) distance between the current row and a candidate neighbor
def getDistance(row1, row2):
    """Return the L2 distance between two rows over ``continuous_variables``.

    Both rows are indexed by column name; only the columns listed in the
    module-level ``continuous_variables`` take part in the distance.
    """
    squared_sum = sum(
        (row1[name] - row2[name]) ** 2 for name in continuous_variables
    )
    return math.sqrt(squared_sum)


# K closest neighbors
def getNeighbors(X, y, curRow, k):
    """Return the ``k`` rows of ``X`` closest to row ``curRow`` of ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate neighbors. It is not modified.
    y : pandas.DataFrame
        Frame holding the query row.
    curRow : int
        Position (``iloc``) of the query row inside ``y``.
    k : int
        Number of neighbors to return.

    Returns
    -------
    pandas.DataFrame
        Copy of up to ``k`` rows of ``X`` with an extra ``dist`` column,
        sorted by ascending distance.
    """
    query = y.iloc[curRow]

    # Distances are gathered in X's row order and attached as a whole column.
    # Writing them one by one with X.at[position, 'dist'] treats the position
    # as an index *label*: on a sampled frame that hits the wrong row, or
    # appends a brand-new row when the label does not exist.
    distances = [getDistance(query, row) for _, row in X.iterrows()]
    candidates = X.assign(dist=distances)

    if X is y:
        # Leave-one-out: a row may not be its own neighbor. When X and y are
        # different frames, position curRow of X is unrelated to the query
        # and must stay a candidate.
        keep = np.ones(len(candidates.index), dtype=bool)
        keep[curRow] = False
        candidates = candidates[keep]

    # `>= 0` also filters out rows whose distance is NaN.
    candidates = candidates[candidates['dist'] >= 0]
    return candidates.sort_values('dist', ascending=True).head(k)

def knnClassify(df, curRow, neighbors):
    """Majority-vote the ``HeartDisease`` label of ``neighbors`` into ``df``.

    The predicted class is written to the ``y`` column of ``df`` at position
    ``curRow``. Class 0 is predicted only when it has strictly more votes than
    class 1, so ties resolve to class 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame being classified; must contain a ``y`` column. Mutated in place.
    curRow : int
        Position (``iloc``) of the row to label.
    neighbors : pandas.DataFrame
        Neighbor rows; must contain a ``HeartDisease`` column.

    Returns
    -------
    int
        The predicted class (0 or 1) that was stored.
    """
    votes = neighbors['HeartDisease']
    class0 = int((votes == 0).sum())
    class1 = int((votes == 1).sum())

    # Look the column up on the frame being written, not on a global frame
    # whose column order may differ.
    yidx = df.columns.get_loc("y")

    prediction = 0 if class0 > class1 else 1
    df.iat[curRow, yidx] = prediction
    return prediction

In [None]:
def printConfusionMatrix(tp, fp, tn, fn):
    """Print a 2x2 confusion matrix as an ASCII table.

    Columns are the actual class (1, 0) and rows the predicted class (1, 0);
    each cell is labelled TP/FP/FN/TN with its count.
    """
    rows = [
        "\n%15sActual" % "",
        "%6s %7s %7s" % ("", "1", "0"),
        "P%6s +--------+--------+" % "",
        "r%6s | %-6s | %-6s |" % ("1", 'TP='+str(tp), 'FP='+str(fp)),
        "e%6s +--------+--------+" % "",
        "d%6s | %-6s | %-6s |" % ("0", 'FN='+str(fn), 'TN='+str(tn)),
        ".%6s +--------+--------+\n" % "",
    ]
    print("\n".join(rows))


def getConfusionMatrix(df):
    """Count confusion-matrix cells from ``df``.

    ``HeartDisease`` is the actual class and ``y`` the predicted class.
    Returns the tuple ``(tp, fp, tn, fn)``.
    """
    actual, predicted = df['HeartDisease'], df['y']
    # Keyed by (actual class, predicted class).
    counts = {
        (a, p): len(df[(actual == a) & (predicted == p)])
        for a in (1, 0)
        for p in (1, 0)
    }
    return counts[(1, 1)], counts[(0, 1)], counts[(0, 0)], counts[(1, 0)]


def getAccuracy(tp, fp, tn, fn):
    """Return the fraction of correct predictions: (tp + tn) / all cases."""
    correct = tp + tn
    total = correct + fp + fn
    return correct / total

def getPrecision(tp, fp, tn, fn):
    """Return precision, tp / (tp + fp).

    ``tn`` and ``fn`` are accepted for a uniform metric signature but unused.
    When nothing was predicted positive (tp + fp == 0) precision is undefined;
    0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

def getRecall(tp, fp, tn, fn):
    """Return recall, tp / (tp + fn).

    ``fp`` and ``tn`` are accepted for a uniform metric signature but unused.
    When there are no actual positives (tp + fn == 0) recall is undefined;
    0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

def getFmeasure(tp, fp, tn, fn):
    """Return the F1 score, the harmonic mean of precision and recall.

    Computed in the algebraically equivalent closed form
    ``2*tp / (2*tp + fp + fn)``, which needs no intermediate precision/recall
    and therefore cannot divide by zero when both of them are 0.
    ``tn`` is accepted for a uniform metric signature but unused.
    When tp, fp and fn are all 0 the score is undefined and 0.0 is returned.
    """
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return 2 * tp / denominator

In [None]:
# Model selection: measure validation accuracy for a range of k values.
accuracy = []   # validation accuracy, one entry per k tried
x = []          # the k values, aligned with `accuracy`
# Step by 2 in range() itself: reassigning k inside a `for` body has no effect
# on the next iteration. Odd k also avoids tied votes between the two classes.
for k in range(1, 20, 2):
    # Get neighbors and predict the class for each validation row
    for rowNum in range(len(validation.index)):
        neighbors = getNeighbors(newtrain, validation, rowNum, k)
        knnClassify(validation, rowNum, neighbors)
    tp, fp, tn, fn = getConfusionMatrix(validation)
    accuracy.append(getAccuracy(tp, fp, tn, fn))
    x.append(k)

In [None]:
# Validation accuracy as a function of k
fig, ax = plt.subplots()
ax.plot(x, accuracy, color="skyblue")
ax.set_xlabel('k')
ax.set_ylabel('scores')
plt.show()

In [None]:
# Final evaluation on the held-out test set with the chosen k.
k = 19
for rowNum in range(len(test.index)):
    neighbors = getNeighbors(newtrain, test, rowNum, k)
    knnClassify(test, rowNum, neighbors)
tp, fp, tn, fn = getConfusionMatrix(test)
printConfusionMatrix(tp, fp, tn, fn)

# Report the k that was actually used rather than a hard-coded value.
print("For K = %d" % k)
print('Accuracy:  %8.5f' % getAccuracy(tp, fp, tn, fn))
print('Precison:  %8.5f' % getPrecision(tp, fp, tn, fn))
print('Recall:    %8.5f' % getRecall(tp, fp, tn, fn))
print('F-Measure: %8.5f' % getFmeasure(tp, fp, tn, fn))