<a href="https://www.kaggle.com/code/shmohseni/heartrate?scriptVersionId=139185983" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 10-year risk of coronary heart disease

In this notebook we want to predict whether a patient is at risk of developing coronary heart disease (CHD) within the next ten years.

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import time
import warnings
# Notebook-wide configuration.
# Silence every Python warning for the rest of the session (this also hides
# pandas/sklearn deprecation notices, so re-enable when debugging).
warnings.filterwarnings("ignore")
# Show all columns/rows when a DataFrame is displayed. Canonical option names
# are used; the dotted spelling "display.max.columns" only resolved because
# pandas interprets the option key as a regular expression.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [4]:
# Load the cardio CSV from the Kaggle input directory; the first row of the
# file is used as the header.
CARDIO_CSV_PATH = "/kaggle/input/cardio/cardio.csv"
df = pd.read_csv(CARDIO_CSV_PATH, sep=",", header=0)
# Preview the first rows.
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive statistics, transposed so that each feature is one row.
df.describe().transpose()

Now we look inside the data and count how many null (NA) values each column contains.

In [5]:
# Count the missing entries in every column and print the per-column totals.
na_counts = df.isna().sum()
print("Na Values:\n", na_counts)

Na Values:
 male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64


It turns out that a few columns contain missing ("NA"/"null") values, so we need to handle these before we dive into the algorithm.

In [6]:
# Impute the missing values found above, one column at a time.
#   * education, BPMeds -> most frequent value (mode)
#   * cigsPerDay, totChol, BMI, heartRate, glucose -> column mean
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df[col]`: in-place fills on a column
# selection are chained assignment, which is deprecated in pandas 2.x and
# does not modify `df` under Copy-on-Write.
# NOTE(review): the fill statistics are computed on the full dataset, i.e.
# before the train/test split, so test rows influence the imputed values.
MODE_IMPUTED_COLS = ["education", "BPMeds"]
MEAN_IMPUTED_COLS = ["cigsPerDay", "totChol", "BMI", "heartRate", "glucose"]

for col in MODE_IMPUTED_COLS:
    # mode() can return several values on a tie; take the first, as before.
    df[col] = df[col].fillna(df[col].mode().iloc[0])

for col in MEAN_IMPUTED_COLS:
    df[col] = df[col].fillna(df[col].mean())

# Each message names the column that was actually checked.
for col in MODE_IMPUTED_COLS + MEAN_IMPUTED_COLS:
    assert df[col].isna().sum() == 0, f"na values in {col} column"

Now that the data is clean, we split it into a training set (70% of the rows) and a test set (30% of the rows).

In [9]:
# Separate the target (TenYearCHD) from the features, then hold out 30% of
# the rows for testing with a fixed seed so the split is reproducible.
y = df["TenYearCHD"]
X = df.drop(columns="TenYearCHD")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

Next we define a pipeline that standardizes the data and trains a logistic regression model on it. To get a better view of how a pipeline works, we first standardize the full dataset and display the result.

In [10]:
# Demonstration only: standardize the numeric columns of the *whole* frame to
# see what the preprocessing step produces.
# NOTE(review): the columns are selected from `df` itself, so the target
# TenYearCHD is scaled too if its dtype is int64/float64 — confirm this is
# acceptable for a demo.
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns

# One-step pipeline that wraps the scaler.
numerical_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Apply the scaler pipeline to the numeric columns; any other column is
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[("num", numerical_pipeline, numerical_cols)],
    remainder="passthrough",
)

# Fit on the full frame and keep the transformed array for display.
standardized_data = preprocessor.fit_transform(df)

standardized_data


array([[ 1.1531919 , -1.23495068,  2.00837168, ...,  0.34274444,
        -0.21751656, -0.42330549],
       [-0.86715836, -0.41825733,  0.04448631, ...,  1.59027451,
        -0.26131108, -0.42330549],
       [ 1.1531919 , -0.18491638, -0.93745637, ..., -0.07309892,
        -0.52407818, -0.42330549],
       ...,
       [-0.86715836, -0.18491638,  0.04448631, ...,  0.67541912,
         0.17663409, -0.42330549],
       [-0.86715836, -0.65159829, -0.93745637, ...,  0.84175647,
         0.        , -0.42330549],
       [-0.86715836,  0.28176554,  0.04448631, ...,  0.34274444,
         1.09631895, -0.42330549]])

In [11]:
# Baseline model: scale the numeric features, then fit a logistic regression.
# Only int64/float64 columns of the training features are scaled.
numerical_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
numerical_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[("num", numerical_pipeline, numerical_cols)],
    remainder="passthrough",
)

# Preprocessing and classifier chained so the scaler is fitted on the
# training rows only.
logreg_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

# Train on the training split, then predict the held-out rows.
logreg_pipeline.fit(X_train, y_train)
y_pred = logreg_pipeline.predict(X_test)

# Report accuracy and the confusion matrix on the test split.
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(cm)

Accuracy: 0.87
[[1093   13]
 [ 155   11]]


As you can see, the accuracy of the model is 87%. But wait a minute! It seems we made a slight mistake: the "education" column is ordinal, with values that stand for levels of education, yet the model treated these values as plain numbers. So we replace the column with a one-hot encoding and check whether this correction improves the performance of the model or not.

In [12]:
# Replace the "education" column with one indicator column per level.
encoded_df = pd.get_dummies(data=df, columns=["education"])

In [13]:
# Preview the encoded frame to check the new education_* indicator columns.
encoded_df.head()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD,education_1.0,education_2.0,education_3.0,education_4.0
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0,0,0,0,1
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0,0,1,0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0,1,0,0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1,0,0,1,0
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0,0,0,1,0


In [14]:
# Same experiment as the baseline, but on the one-hot-encoded frame.
# Split features/target with the same test size and seed as the baseline so
# that both models are evaluated on the same rows.
ye = encoded_df["TenYearCHD"]
Xe = encoded_df.drop(columns=["TenYearCHD"])
Xe_train, Xe_test, ye_train, ye_test = train_test_split(
    Xe, ye, test_size=0.3, random_state=1
)

# Scale only the int64/float64 columns; every other column is passed through
# unchanged.
numerical_e_cols = Xe_train.select_dtypes(include=["int64", "float64"]).columns
numerical_e_pipeline = Pipeline([
    ("scaler-e", StandardScaler())
])

preprocessor_e = ColumnTransformer(
    transformers=[
        ("num_e", numerical_e_pipeline, numerical_e_cols)
    ],
    remainder="passthrough"
)

logreg_e_pipeline = Pipeline([
    ("preprocessor", preprocessor_e),
    ("classifier", LogisticRegression())
])

# Fit the pipeline to the training data
logreg_e_pipeline.fit(Xe_train, ye_train)

# Predict on the test data
ye_pred = logreg_e_pipeline.predict(Xe_test)

# Calculate accuracy
accuracy_e = accuracy_score(ye_test, ye_pred)
print(f"Accuracy: {accuracy_e:.2f}")
# The confusion matrix must come from THIS model's labels and predictions
# (ye_test / ye_pred), not from the baseline's y_test / y_pred.
cm = confusion_matrix(ye_test, ye_pred)
print(cm)

Accuracy: 0.87
[[1093   13]
 [ 155   11]]


In [33]:
# Time how long the fitted encoded pipeline takes to predict every row.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump with system clock adjustments.
# NOTE: Xe contains the rows the pipeline was trained on, so the accuracy
# below is not a held-out estimate.
tic_reg = time.perf_counter()
ytotal_pred = logreg_e_pipeline.predict(Xe)
toc_reg = time.perf_counter()

accuracy_total = accuracy_score(ye, ytotal_pred)
print(f"Accuracy: {accuracy_total:.2f}")
cm = confusion_matrix(ye, ytotal_pred)
print(cm)

elapsed_time_reg = toc_reg - tic_reg
print(f"Time taken: {elapsed_time_reg:.4f}")

Accuracy: 0.85
[[3559   35]
 [ 588   56]]
Time taken: 0.0077


In [15]:
# Keep only the columns needed for the hand-written rule. The explicit copy
# makes df_new an independent frame, so adding columns to it later is not a
# chained assignment on a slice of df.
df_new = df[["male", "age", "prevalentHyp", "TenYearCHD"]].copy()

In [34]:
# Hand-written rule as a baseline: flag a patient as at-risk purely from sex
# and age, and time how long the rule takes to evaluate.
# perf_counter() is used because it is monotonic and high-resolution.
tic = time.perf_counter()
# NOTE(review): for integer ages the male window (age > 52 and age < 54)
# matches age 53 only — confirm this is the intended range.
male_rule = (df_new["male"] == 1) & (df_new["age"] > 52) & (df_new["age"] < 54)
female_rule = (df_new["male"] == 0) & (df_new["age"] > 51) & (df_new["age"] < 58)
# The rule yields booleans; cast them to 0/1 explicitly. The previous
# replace({"False": 0, "True": 1}) looked for *strings* and therefore left
# the boolean column unchanged. assign() returns a new frame instead of
# writing into what may be a slice of df.
df_new = df_new.assign(predict=(male_rule | female_rule).astype(int))
toc = time.perf_counter()

accuracy_total_ = accuracy_score(df_new["TenYearCHD"], df_new["predict"])
print(f"Accuracy: {accuracy_total_:.2f}")
cm_ = confusion_matrix(df_new["TenYearCHD"], df_new["predict"])
print(cm_)

elapsed_time = toc - tic
print(f"Time taken:{elapsed_time:.4f}")

Accuracy: 0.76
[[3148  446]
 [ 563   81]]
Time taken:0.0031
