In [1057]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [1058]:
from datetime import datetime as dt

# Wall-clock timestamp taken before any data is loaded; the last executed cell
# subtracts it from a fresh dt.now() to report how long the notebook took.
start = dt.now()

In [1059]:
# Load the raw data set; the relative path is resolved against the notebook's
# working directory.
df_raw = pd.read_csv("cardio_train_data.csv")

# Preview the first rows to confirm the columns parsed as expected.
df_raw.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [1060]:
# Dimensions of the raw frame as (rows, columns).
print(df_raw.shape)

(70000, 13)


In [1061]:
# Per-column summary statistics of the raw data; the min/max rows are the
# quickest way to spot implausible values before modelling.
df_raw.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [1062]:
# select the modelling columns
#
# Keep only age, gender, smoke, alco, active and the target `cardio`.
# `id` and the measurement columns from `height` through `gluc` are left out
# of this model.  The columns are named explicitly (rather than passing a
# label-sliced DataFrame to `drop`) so the result does not depend on the
# column order of the CSV, and a missing column fails loudly with a KeyError.
df = df_raw.drop(
    columns=["id", "height", "weight", "ap_hi", "ap_lo", "cholesterol", "gluc"]
)

df.head()

Unnamed: 0,age,gender,smoke,alco,active,cardio
0,18393,2,0,0,1,0
1,20228,1,0,0,1,1
2,18857,1,0,0,0,1
3,17623,2,0,0,1,1
4,17474,1,0,0,0,0


In [1063]:
# Class balance of the target column.
df["cardio"].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [1064]:
# convert age from days to whole years
#
# The day counts are read from df_raw (which shares df's row index) instead of
# from df["age"] itself, so re-running this cell cannot divide values that
# were already converted by 365 a second time.
df["age"] = (df_raw["age"] / 365).round()
df["age"].describe()

count    70000.000000
mean        53.338686
std          6.765294
min         30.000000
25%         48.000000
50%         54.000000
75%         58.000000
max         65.000000
Name: age, dtype: float64

In [1065]:
# Spot-check the first few converted ages.
df["age"].head()

0    50.0
1    55.0
2    52.0
3    48.0
4    48.0
Name: age, dtype: float64

In [1066]:
# recategorise age into ordinal bands
#
# 2015 WHO's age classification (band -> code):
#   25 - 44 : Young       -> 0
#   45 - 59 : Middle      -> 1
#   60 - 74 : Elderly     -> 2
#   75 - 89 : Senile      -> 3
#   90+     : Long-livers -> 4
#
# data set => min age: 30, max age: 65

age_years = df["age"]

# One boolean mask per band, in the same order as the codes below.
age_bands = [
    (age_years >= 25) & (age_years < 45),
    (age_years >= 45) & (age_years < 60),
    (age_years >= 60) & (age_years < 75),
    (age_years >= 75) & (age_years < 90),
    age_years >= 90,
]

# Values that fall in no band (anything below 25) are passed through unchanged
# via `default`, exactly as a row untouched by every mask would be.
df["age"] = np.select(age_bands, [0, 1, 2, 3, 4], default=age_years)

df.tail()

Unnamed: 0,age,gender,smoke,alco,active,cardio
69995,1.0,2,1,0,1,0
69996,2.0,1,0,0,1,1
69997,1.0,2,0,1,0,1
69998,2.0,1,0,0,0,1
69999,1.0,1,0,0,1,0


In [1067]:
# Feature matrix.  The columns are selected by name so the target `cardio`
# can never leak into the features (and no feature can be dropped) if the
# column order of `df` changes; a positional slice gives no such guarantee.
x = df[["age", "gender", "smoke", "alco", "active"]]
x.head()

Unnamed: 0,age,gender,smoke,alco,active
0,1.0,2,0,0,1
1,1.0,1,0,0,1
2,1.0,1,0,0,0
3,1.0,2,0,0,1
4,1.0,1,0,0,0


In [1068]:
# Target vector: the binary `cardio` label.
y = df["cardio"]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: cardio, dtype: int64

In [1069]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation (0.25 is scikit-learn's
# default split, written out here for the reader); the fixed seed makes the
# shuffle repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [1070]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes over the 25 closest training rows.
# NOTE(review): all five features are small categorical codes, so many training
# rows presumably sit at exactly the same distance from any query point --
# confirm how ties among neighbours influence the predictions.
knn_model = KNeighborsClassifier(n_neighbors=25)

In [1071]:
# Fit on the training split only; the test split stays unseen until evaluation.
knn_model.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=25)

In [1072]:
# Predicted labels for the held-out rows.
y_pred = knn_model.predict(x_test)

In [1073]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label equals the true label.
accScore = accuracy_score(y_test, y_pred)

# K is read from the model instead of being hard-coded in the message, so the
# report stays correct if n_neighbors is changed where the model is built.
print(f"Accuracy score for K = {knn_model.n_neighbors} is", accScore)

Accuracy score for K = 25 is 0.5795428571428571


In [1074]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true labels (first argument) against the predicted labels
# to see which class the errors come from, not just how many there are.
confusion_matrix(y_test, y_pred)

array([[6935, 1753],
       [5605, 3207]], dtype=int64)

In [1075]:
# Hand-written example in the training column order
# (age band, gender, smoke, alco, active).  It is wrapped in a DataFrame that
# carries the training column names because the model was fitted on a
# DataFrame; passing a bare tuple makes scikit-learn warn that X has no valid
# feature names.  The prediction itself is unchanged.
knn_model.predict(pd.DataFrame([(1, 1, 0, 0, 0)], columns=x_train.columns))



array([0], dtype=int64)

In [1076]:
# Hand-written example in the training column order
# (age band, gender, smoke, alco, active).  It is wrapped in a DataFrame that
# carries the training column names because the model was fitted on a
# DataFrame; passing a bare tuple makes scikit-learn warn that X has no valid
# feature names.  The prediction itself is unchanged.
knn_model.predict(pd.DataFrame([(2, 1, 1, 1, 1)], columns=x_train.columns))



array([1], dtype=int64)

In [1077]:
# `timedelta.seconds` is only the seconds *component* of the interval (it wraps
# back to 0 after one day), so the full elapsed time is taken from
# `total_seconds()` and truncated to whole seconds to keep the same output.
running_secs = int((dt.now() - start).total_seconds())

# NOTE: `start` is set before the CSV is read, so this figure also includes
# data loading and preparation, not only fitting and prediction.
print("Duration of training and prediction:", running_secs, "seconds")

Duration of training and prediction: 3 seconds


In [1078]:
# from sklearn.metrics import f1_score

# f1_score(y_test, y_pred, average="micro")

In [1079]:
# df_zero = df.loc[df['age'] == 0]
# df_one = df.loc[df['age'] == 1]
# df_two = df.loc[df['age'] == 2]
# df_three = df.loc[df['age'] == 3]
# df_four = df.loc[df['age'] == 4]

# df_zero.head()

In [1080]:
# df_one.head()


In [1081]:
# df_two.head(0)


In [1082]:
# k_array = np.arange(11,31,2)

# k_array

In [1083]:
# for k in k_array:
#     knn_ex = KNeighborsClassifier(n_neighbors=k)
#     knn_ex.fit(x_train, y_train)
#     ac = accuracy_score(y_test, knn_ex.predict(x_test))
#     print(k)
#     print(ac)

# seems to have plateaued at 98