# Predicting the risk for heart disease with a classification model


This notebook is part of my **MLzoomcamp Midterm Project**. It includes simple data preprocessing steps to make the dataset machine learnable:
* some columns have their values converted to integers
* categorical nominal variables containing yes/no are replaced by 1/0 values
* categorical ordinal variables are replaced by a sequence of integers starting with zero (see tables below)


|gen_health|gen_health (ordinal)|
|:--------:|:------------:|
|poor      |0             |
|fair      |1             |
|good      |2             |
|very good |3             |
|excellent |4             |


|age_category|age_category (ordinal)|
|:---------:|:-:|
|18-24      |0|
|25-29      |1|
|30-34      |2|
|35-39      |3|
|40-44      |4|
|45-49      |5|
|50-54      |6|
|55-59      |7|
|60-64      |8|
|65-69      |9|
|70-74      |10|
|75-79      |11|
|80 or older|12|

> NOTE: One-hot encoding of nominal variables such as `sex`, `race` and `diabetic` will be applied after the train/test split.

In [1]:
# importing libraries
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import sweetviz as sv
import sys

from sklearn.metrics import mutual_info_score

# importing user-defined functions
sys.path.append("../src")
from utility import (
    dataset_dimensions, 
    rename_columns,
    column_missing_values, 
    column_unique_values, 
)

# Data loading

In [2]:
# load the de-duplicated heart disease dataset into the working frame
df = pd.read_csv(filepath_or_buffer='../data/heart_2020_cleaned_no-duplicates.csv')

In [3]:
# preview the first three rows of the raw data
df.head(n=3)

Unnamed: 0,heart_disease,bmi,smoking,alcohol_drinking,stroke,physical_health,mental_health,diff_walking,sex,age_category,race,diabetic,physical_activity,gen_health,sleep_time,asthma,kidney_disease,skin_cancer
0,no,16.6,yes,no,no,3.0,30.0,no,female,55-59,white,yes,yes,very good,5.0,yes,no,yes
1,no,20.34,no,no,yes,0.0,0.0,no,female,80 or older,white,no,yes,very good,7.0,no,no,no
2,no,26.58,yes,no,no,20.0,30.0,no,male,65-69,white,yes,yes,fair,8.0,yes,no,no


# Data preprocessing

## Converting column values to integer

In [4]:
# cast the float-valued count columns to int, one column at a time
for int_column in ('physical_health', 'mental_health', 'sleep_time'):
    df[int_column] = df[int_column].astype(int)

## Replacing yes/no with 1/0 values

In [5]:
# Encode the binary yes/no columns as integers (yes -> 1, no -> 0).
columns_yes_no = [
    'heart_disease',
    'smoking',
    'alcohol_drinking',
    'stroke',
    'diff_walking',
    'physical_activity',
    'asthma',
    'kidney_disease',
    'skin_cancer'
]

# An explicit mapping applied with Series.map (instead of Series.replace with
# string -> int lists) does not rely on pandas' deprecated silent downcasting
# of object columns; the final astype(int) guarantees an integer dtype.
yes_no_mapping = {'yes': 1, 'no': 0}

for column in columns_yes_no:
    # Already encoded (e.g. the cell was re-run): nothing to do.
    if pd.api.types.is_integer_dtype(df[column]):
        continue

    encoded = df[column].map(yes_no_mapping)

    # map() turns every unmapped value into NaN; fail loudly instead of
    # silently carrying unexpected labels into the analysis below.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"Column '{column}' contains values other than yes/no: {unexpected}"
        )

    df[column] = encoded.astype(int)

In [6]:
# check the result of the yes/no encoding on the first three rows
df.head(n=3)

Unnamed: 0,heart_disease,bmi,smoking,alcohol_drinking,stroke,physical_health,mental_health,diff_walking,sex,age_category,race,diabetic,physical_activity,gen_health,sleep_time,asthma,kidney_disease,skin_cancer
0,0,16.6,1,0,0,3,30,0,female,55-59,white,yes,1,very good,5,1,0,1
1,0,20.34,0,0,1,0,0,0,female,80 or older,white,no,1,very good,7,0,0,0
2,0,26.58,1,0,0,20,30,0,male,65-69,white,yes,1,fair,8,1,0,0


## Replacing ordinal variables by a sequence of integers

In [7]:
# Manual ordinal encoding: every category is replaced by its position
# (starting at zero) in the ordered list of levels declared below.
ordinal_levels = {
    'gen_health': ['poor', 'fair', 'good', 'very good', 'excellent'],
    'age_category': [
        '18-24',
        '25-29',
        '30-34',
        '35-39',
        '40-44',
        '45-49',
        '50-54',
        '55-59',
        '60-64',
        '65-69',
        '70-74',
        '75-79',
        '80 or older',
    ],
}

for column, levels in ordinal_levels.items():
    # Already encoded (e.g. the cell was re-run): nothing to do.
    if pd.api.types.is_integer_dtype(df[column]):
        continue

    # Series.map with an explicit dict avoids the deprecated silent downcasting
    # that Series.replace performs when swapping strings for integers.
    level_to_code = {level: code for code, level in enumerate(levels)}
    codes = df[column].map(level_to_code)

    # map() yields NaN for any label missing from the level list; surface it
    # rather than letting an unknown category slip through.
    unmapped = codes.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"Column '{column}' contains unexpected categories: {unexpected}"
        )

    df[column] = codes.astype(int)

In [8]:
# check the result of the ordinal encoding on the first five rows
df.head(n=5)

Unnamed: 0,heart_disease,bmi,smoking,alcohol_drinking,stroke,physical_health,mental_health,diff_walking,sex,age_category,race,diabetic,physical_activity,gen_health,sleep_time,asthma,kidney_disease,skin_cancer
0,0,16.6,1,0,0,3,30,0,female,7,white,yes,1,3,5,1,0,1
1,0,20.34,0,0,1,0,0,0,female,12,white,no,1,3,7,0,0,0
2,0,26.58,1,0,0,20,30,0,male,9,white,yes,1,1,8,1,0,0
3,0,24.21,0,0,0,0,0,0,female,11,white,no,0,2,6,0,0,1
4,0,23.71,0,0,0,28,0,1,female,4,white,no,1,3,8,0,0,0


# Feature importance analysis

In [9]:
# overall mean of the heart_disease column; used as the baseline rate below
global_heart_disease = df['heart_disease'].mean()
global_heart_disease

0.0903528803481408

## Heart disease ratio and risk ratio

In [10]:
# Features compared against heart_disease in the risk-ratio and
# mutual-information analysis (order is kept as in the dataset preview).
categorical = [
    # lifestyle and medical history
    'smoking', 'alcohol_drinking', 'stroke',
    # self-reported health counts and mobility
    'physical_health', 'mental_health', 'diff_walking',
    # demographics
    'sex', 'age_category', 'race',
    # conditions and habits
    'diabetic', 'physical_activity', 'gen_health', 'sleep_time',
    # other diseases
    'asthma', 'kidney_disease', 'skin_cancer',
]

In [11]:
# For each feature, show the per-group heart-disease rate and group size next
# to the absolute ("diff") and relative ("risk") deviation from the global rate.
for feature in categorical:
    df_group = df.groupby(feature)['heart_disease'].agg(['mean', 'count'])
    df_group = df_group.assign(
        diff=df_group['mean'] - global_heart_disease,
        risk=df_group['mean'] / global_heart_disease,
    )
    display(df_group)
    # two blank lines between consecutive tables
    print('\n')

Unnamed: 0_level_0,mean,count,diff,risk
smoking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.064734,174312,-0.025618,0.716463
1,0.125403,127405,0.03505,1.387927






Unnamed: 0_level_0,mean,count,diff,risk
alcohol_drinking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.09324,280136,0.002888,1.031959
1,0.052871,21581,-0.037482,0.585157






Unnamed: 0_level_0,mean,count,diff,risk
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.078963,289653,-0.011389,0.873945
1,0.36381,12064,0.273457,4.026542






Unnamed: 0_level_0,mean,count,diff,risk
physical_health,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.068597,208611,-0.021756,0.759207
1,0.057845,10459,-0.032508,0.640211
2,0.078742,14846,-0.011611,0.871491
3,0.097932,8608,0.007579,1.083885
4,0.110589,4467,0.020236,1.223965
5,0.11805,7590,0.027697,1.306545
6,0.13622,1270,0.045868,1.507649
7,0.100454,4629,0.010101,1.111793
8,0.12987,924,0.039517,1.437366
9,0.205556,180,0.115203,2.27503






Unnamed: 0_level_0,mean,count,diff,risk
mental_health,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.096345,187898,0.005992,1.066317
1,0.066378,9235,-0.023975,0.734652
2,0.06383,16309,-0.026523,0.70645
3,0.063196,10412,-0.027157,0.699439
4,0.060216,5364,-0.030137,0.666456
5,0.062982,14004,-0.027371,0.697067
6,0.074172,1510,-0.016181,0.820917
7,0.05765,5516,-0.032702,0.638059
8,0.064899,1094,-0.025453,0.718289
9,0.093596,203,0.003243,1.035895






Unnamed: 0_level_0,mean,count,diff,risk
diff_walking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.066976,257362,-0.023377,0.741268
1,0.225995,44355,0.135642,2.501246






Unnamed: 0_level_0,mean,count,diff,risk
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.070226,159671,-0.020127,0.777238
male,0.112977,142046,0.022625,1.250403






Unnamed: 0_level_0,mean,count,diff,risk
age_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.006501,19998,-0.083852,0.071947
1,0.008154,16312,-0.082199,0.090241
2,0.012588,17953,-0.077764,0.139325
3,0.015159,19526,-0.075194,0.167779
4,0.0245,19837,-0.065853,0.271155
5,0.036212,20518,-0.054141,0.400785
6,0.058266,23736,-0.032087,0.644871
7,0.079645,27610,-0.010708,0.881489
8,0.106378,31219,0.016025,1.177356
9,0.128639,31670,0.038286,1.423741






Unnamed: 0_level_0,mean,count,diff,risk
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
american indian/alaskan native,0.104391,5192,0.014038,1.155374
asian,0.033279,7993,-0.057074,0.368324
black,0.0758,22810,-0.014553,0.838934
hispanic,0.053233,27107,-0.037119,0.589173
other,0.081352,10891,-0.009001,0.900376
white,0.098343,227724,0.00799,1.088429






Unnamed: 0_level_0,mean,count,diff,risk
diabetic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.069167,251796,-0.021186,0.765522
"no, borderline diabetes",0.11644,6776,0.026087,1.288729
yes,0.220454,40589,0.130101,2.43992
yes (during pregnancy),0.042254,2556,-0.048099,0.46765






Unnamed: 0_level_0,mean,count,diff,risk
physical_activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.138588,71305,0.048235,1.53385
1,0.075426,230412,-0.014927,0.834791






Unnamed: 0_level_0,mean,count,diff,risk
gen_health,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.341131,11286,0.250778,3.775537
1,0.204363,34659,0.11401,2.261826
2,0.104177,91239,0.013824,1.153001
3,0.050842,104796,-0.039511,0.562701
4,0.025026,59737,-0.065327,0.276985






Unnamed: 0_level_0,mean,count,diff,risk
sleep_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.108893,551,0.01854,1.205196
2,0.162437,788,0.072084,1.797802
3,0.174197,1992,0.083844,1.927961
4,0.152652,7730,0.062299,1.689509
5,0.106277,19101,0.015924,1.176245
6,0.083752,64655,-0.006601,0.926946
7,0.070244,89445,-0.020109,0.777444
8,0.092027,90202,0.001674,1.018526
9,0.10692,15853,0.016567,1.183358
10,0.149704,7782,0.059352,1.656886






Unnamed: 0_level_0,mean,count,diff,risk
asthma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.08619,259066,-0.004162,0.953931
1,0.115636,42651,0.025283,1.279829






Unnamed: 0_level_0,mean,count,diff,risk
kidney_disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.08211,289941,-0.008243,0.908768
1,0.293308,11776,0.202956,3.246254






Unnamed: 0_level_0,mean,count,diff,risk
skin_cancer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.081832,272425,-0.008521,0.90569
1,0.169603,29292,0.07925,1.877114






## Mutual information

In [12]:
def mutual_info_score_heart_disease(series):
    """Return the mutual information between `series` and `df.heart_disease`.

    Thin wrapper around `mutual_info_score` with the second argument fixed to
    the notebook-level `df.heart_disease`, so it can be handed directly to
    `DataFrame.apply`.
    """
    target = df.heart_disease
    return mutual_info_score(series, target)

In [13]:
# score every selected feature against heart_disease, then rank from most to
# least informative
feature_frame = df[categorical]
mi = feature_frame.apply(mutual_info_score_heart_disease)
mi.sort_values(ascending=False)

age_category         0.033523
gen_health           0.027151
diff_walking         0.015175
diabetic             0.012960
stroke               0.011955
physical_health      0.011333
kidney_disease       0.006925
smoking              0.005384
physical_activity    0.004011
skin_cancer          0.003431
sleep_time           0.002811
sex                  0.002771
race                 0.001976
mental_health        0.001800
alcohol_drinking     0.000756
asthma               0.000601
dtype: float64

# Saving the dataset

In [14]:
# write the preprocessed frame to disk; index=False leaves the row index out
df.to_csv(path_or_buf='../data/heart_2020_cleaned_preproc_ordinal.csv', index=False)