In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
from datetime import datetime as dt

start = dt.now()

In [3]:
df_raw = pd.read_csv("cardio_train_data.csv")

df_raw.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
print(df_raw.shape)

(70000, 13)


In [5]:
df_raw.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [6]:
# add mean blood pressure, view age by year

df_raw.rename(columns={"ap_hi":"systolic"}, inplace=True)
df_raw.rename(columns={"ap_lo":"diastolic"}, inplace=True)

mbp = round(df_raw["diastolic"] + (df_raw["systolic"] / 3))
df_raw.insert(12,"mbp", mbp)

df_raw["age"] = round(df_raw["age"] / 365)

df_raw.head()

Unnamed: 0,id,age,gender,height,weight,systolic,diastolic,cholesterol,gluc,smoke,alco,active,mbp,cardio
0,0,50.0,2,168,62.0,110,80,1,1,0,0,1,117.0,0
1,1,55.0,1,156,85.0,140,90,3,1,0,0,1,137.0,1
2,2,52.0,1,165,64.0,130,70,3,1,0,0,0,113.0,1
3,3,48.0,2,169,82.0,150,100,1,1,0,0,1,150.0,1
4,4,48.0,1,156,56.0,100,60,1,1,0,0,0,93.0,0


In [7]:
# clean up data

# gender:
# 1 = male
# 2 = female

# cholestrol:
# 1 = normal
# 2 = above normal
# 3 = well above normal

# cardio:
# 0 = Non-diseased
# 1 = Diseased

df = df_raw.drop(["id"], axis=1)
df = df.drop(df.loc[:, "height":"diastolic"], axis=1)
df = df.drop(df.loc[:, "gluc":"active"], axis=1)

df.head()

Unnamed: 0,age,gender,cholesterol,mbp,cardio
0,50.0,2,1,117.0,0
1,55.0,1,3,137.0,1
2,52.0,1,3,113.0,1
3,48.0,2,1,150.0,1
4,48.0,1,1,93.0,0


In [8]:
df.tail()

Unnamed: 0,age,gender,cholesterol,mbp,cardio
69995,53.0,2,1,120.0,0
69996,62.0,1,2,137.0,1
69997,52.0,2,3,150.0,1
69998,61.0,1,1,125.0,1
69999,56.0,1,2,120.0,0


In [9]:
df["age"].describe()

count    70000.000000
mean        53.338686
std          6.765294
min         30.000000
25%         48.000000
50%         54.000000
75%         58.000000
max         65.000000
Name: age, dtype: float64

In [10]:
df["cardio"].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [11]:
# recategorise age

# 2015 WHO's age classification
# 25 - 44 : Young : 0
# 45 - 59 : Middle : 1
# 60 - 74 : Elderly : 2
# 75 - 89 : Senile : 3
# 90++ : Long-livers : 4
# https://journals.scholarpublishing.org/index.php/ASSRJ/article/view/2924#:~:text=According%20to%20the%20new%20age,long%2Dlivers%20are%20after%2090.

# data set => min age: 30, max age: 65

# my data classification
# < 45 : "young"
# >= 45 && <60 : "middle"
# >= 60 : "old"

# df.loc[(df["age"] < 45), "age"] = "young"
# df.loc[(df["age"] >= 45) & (df["age"] < 60), "age"] = "middle"
# df.loc[(df["age"] >= 60), "age"] = "old"

# df.tail()

In [12]:
# define x

x = df.iloc[:, 0:4]
x.head()

Unnamed: 0,age,gender,cholesterol,mbp
0,50.0,2,1,117.0
1,55.0,1,3,137.0
2,52.0,1,3,113.0
3,48.0,2,1,150.0
4,48.0,1,1,93.0


In [13]:
# define y

y = df["cardio"]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: cardio, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=9)

In [16]:
knn_model.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=9)

In [17]:
y_pred = knn_model.predict(x_test)

In [18]:
from sklearn.metrics import accuracy_score

accScore = accuracy_score(y_test, y_pred)
print("Accuracy score for K = 9 is",accScore)

Accuracy score for K = 9 is 0.7121714285714286


In [19]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[6421, 2267],
       [2770, 6042]], dtype=int64)

In [20]:
knn_model.predict([(65.0,1,0,148)])



array([1], dtype=int64)

In [21]:
knn_model.predict([(40.0,2,2,123)])



array([0], dtype=int64)

In [22]:
running_secs = (dt.now() - start).seconds

print("Duration of training and prediction:", running_secs, "seconds")

Duration of training and prediction: 2 seconds


In [23]:
# from sklearn.metrics import f1_score

# f1_score(y_test, y_pred, average="micro")

In [24]:
# df_zero = df.loc[df['age'] == 0]
# df_one = df.loc[df['age'] == 1]
# df_two = df.loc[df['age'] == 2]
# df_three = df.loc[df['age'] == 3]
# df_four = df.loc[df['age'] == 4]

# df_zero.head()

In [25]:
# df_one.head()


In [26]:
# df_two.head(0)


In [27]:
# k_array = np.arange(11,31,2)

# k_array

In [28]:
# for k in k_array:
#     knn_ex = KNeighborsClassifier(n_neighbors=k)
#     knn_ex.fit(x_train, y_train)
#     ac = accuracy_score(y_test, knn_ex.predict(x_test))
#     print(k)
#     print(ac)

# seems to plateaued out at 98