In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math


import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the cardiovascular dataset; the file uses ';' as its field separator.
cardio_df = pd.read_csv('../input/cardiovascular-disease-dataset/cardio_train.csv',sep=';')
# Display the first rows as the cell output.
cardio_df.head()

This dataset contains 13 columns: an id plus 12 data columns. Age is the number of days a person has been alive. For gender, 1 means the person is female and 2 means male. Height and weight are self-explanatory. ap_hi is the systolic blood pressure, measured while the heart is contracting, and ap_lo is the diastolic blood pressure, measured while the heart muscle relaxes. The cholesterol column ranges from 1-3, where 1 is normal, 2 is above normal, and 3 is well above normal. Glucose follows the same scale. The smoking, alcohol, and physical activity columns indicate whether a person smokes, drinks, or is physically active. Cardio indicates whether a person has cardiovascular disease (CVD) or not.

Let's drop id since it does not provide any additional information, and rename the columns with more descriptive names.

In [None]:
# Remove the 'id' column in place, then relabel the 12 remaining columns.
# The assignment is positional, so the list order must match the column order of the file.
cardio_df.drop(columns=['id'], inplace=True)
cardio_df.columns = ['age(day)', 'gender', 'height(cm)', 'weight(kg)', 'systolic', 'diastolic', 
                  'cholesterol', 'glucose', 'smoker', 'alcohol', 'physical activity', 'CVD']

As you can see, measuring age in days is a bit unusual. Let's convert days to years so we can get a better idea of someone's age. Since we don't know exactly when this dataset was collected, it is hard to figure out how many leap years each person has lived through. As a result we will simply divide the age by 365 days. The drawback of this method is that it ignores the extra leap days, which makes the calculated age(years) slightly larger, so people can appear a little older than they are.

In [None]:
# Convert age from days to whole years, rounding down and ignoring leap days.
# Floor division works on the whole column at once, so no per-row Python call is needed.
cardio_df['age(years)'] = cardio_df['age(day)'] // 365

## EDA

Now that we have our columns ready, let's begin to do some EDA and try to answer a few basic questions.

In [None]:
# Import relevant libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# How many people in this dataset have CVD?
# Pie chart of how often each value of the 'CVD' column occurs.
cardio_df['CVD'].value_counts().plot.pie(figsize=(5, 5))
plt.title("Number of people with CVD vs not CVD")
plt.show()

### Does smoking increase risk of CVD?

In [None]:
# Group rows by smoker status; the grouped object is reused by a later cell.
smoker_group = cardio_df.groupby(['smoker'], as_index=False)
# Rows per group, and number of CVD cases per group (summing the 'CVD' column).
count = smoker_group.count()['CVD']
num_smokers = smoker_group.sum()['CVD']
# Percentage of each group that has CVD.
percentage_smokers_with_cvd = num_smokers / count * 100
plt.bar(x=[0,1], height=percentage_smokers_with_cvd, align='center', tick_label=["Non-Smoker", "Smoker"])
plt.title("Percentage of Non-smokers vs Smokers with CVD")
plt.show()

Common knowledge dictates that smoking increases the risk of CVD but in our data set it seems the opposite is true. This could be due to an underlying bias in our dataset.

We should also look at the effect of smoking on blood pressure. From clinical knowledge I would expect smokers to have higher blood pressure than non-smokers on average.

In [None]:
# Mean systolic and diastolic pressure for each smoker group (shown as the cell output).
smoker_group.mean()[['systolic', 'diastolic']]

We don't need a bar chart to see that, on average, smoking does not increase blood pressure in our dataset, which is quite interesting since studies have shown that smoking increases blood pressure.

### Does cholesterol correlate with CVD?

In [None]:
# Percentage of people with CVD at each cholesterol level.
cholesterol_group = cardio_df.groupby(by=['cholesterol'], as_index=False)
cholesterol_count = cholesterol_group.count()['CVD']
cholesterol_sum = cholesterol_group.sum()['CVD']
cholesterol_percentage = cholesterol_sum / cholesterol_count * 100

# One bar per cholesterol level, labelled with the level code.
plt.bar([0, 1, 2], cholesterol_percentage, align='center', tick_label=["1", "2", "3"])
plt.xlabel("Cholesterol Level")
plt.ylabel("% of people with CVD")
plt.title("Percentage of Cholesterol level with CVD")
plt.show()

We can see that if someone has higher level of cholesterol, they are more likely to have CVD. 

### Does a particular gender have a higher risk of CVD?

In [None]:
# Percentage of people with CVD for each gender code.
gender_group = cardio_df.groupby(['gender'], as_index=False)
gender_count = gender_group.count()['CVD']
gender_sum = gender_group.sum()['CVD']
gender_percentage = gender_sum / gender_count * 100

# Bar position 0 -> gender code 1 (women)
# Bar position 1 -> gender code 2 (men)
# NOTE(review): yerr is the standard deviation taken across the group percentages,
# i.e. one scalar applied to every bar, not a per-group uncertainty estimate.
plt.bar(x=[0,1], height=gender_percentage, yerr=gender_percentage.std(), align='center', tick_label=["Women", "Men"])
plt.title("Gender vs CVD")
plt.ylabel("% of people with CVD")
plt.xlabel("Gender")
plt.show()

Looks like men are slightly more likely than women to have CVD, which is in line with current clinical evidence. However, the difference does not appear to be statistically significant.

### Whats the effect of alcohol on CVD?

In [None]:
# Percentage of people with CVD for non-drinkers (0) and drinkers (1).
alcohol_group = cardio_df.groupby(['alcohol'], as_index=False)
alcohol_count = alcohol_group.count()['CVD']
alcohol_sum = alcohol_group.sum()['CVD']
alcohol_percentage = alcohol_sum / alcohol_count * 100

# NOTE(review): yerr is the standard deviation taken across the group percentages,
# i.e. one scalar applied to every bar, not a per-group uncertainty estimate.
plt.bar(x=[0,1], height=alcohol_percentage, yerr=alcohol_percentage.std(), align='center', tick_label=["Non-Drinkers", "Drinkers"])
plt.title("Effect of Alcohol on CVD")
plt.ylabel("% of people with CVD")
plt.xlabel("Drinking Status")
plt.show()

Looks like alcohol has almost no impact on CVD. This is not surprising since the current evidence for alcohol's impact on cardiovascular health is controversial. Some say it's detrimental others say it has protective effects.

### Whats the effect of glucose levels on CVD?

In [None]:
# Percentage of people with CVD at each glucose level.
glucose_group = cardio_df.groupby(['glucose'], as_index=False)
glucose_count = glucose_group.count()['CVD']
glucose_sum = glucose_group.sum()['CVD']
glucose_percentage = glucose_sum / glucose_count * 100

# NOTE(review): yerr is the standard deviation taken across the group percentages,
# i.e. one scalar applied to every bar, not a per-group uncertainty estimate.
plt.bar(x=[0,1,2], height=glucose_percentage, yerr=glucose_percentage.std(), align='center', tick_label=["1", "2", "3"])
plt.title("Effect of Glucose levels on CVD")
plt.ylabel("% of people with CVD")
plt.xlabel("Glucose Levels")
plt.show()

This is expected as higher glucose levels can mean person has bad diet and possibly diabetes. Also Diabetes usually coexists with CVD.

### Does Physicial activity have a protective effect against CVD?

In [None]:
# Percentage of people with CVD for non-active (0) and active (1) people.
physical_group = cardio_df.groupby(['physical activity'], as_index=False)
physical_count = physical_group.count()['CVD']
physical_sum = physical_group.sum()['CVD']
physical_percentage = physical_sum / physical_count * 100

# NOTE(review): yerr is the standard deviation taken across the group percentages,
# i.e. one scalar applied to every bar, not a per-group uncertainty estimate.
plt.bar(x=[0,1], height=physical_percentage, yerr=physical_percentage.std(), align='center', tick_label=["non-active", "active"])
plt.title("Effect of Physical Activity on CVD")
plt.ylabel("% of people with CVD")
plt.xlabel("Activity Status")
plt.show()

Not surprising here that active people have lower CVD rates. But I expected the effect to be much more pronounced. It seems that the effect might not be statistically significant at all. This could be due to how the authors defined physical activity when they collected the info.

### Does high SBP or DBP affect CVD? For the sake of simplicity we will assume no one has diabetes or any additional risk factors for CVD?

In [None]:
def get_high_SBP(sbp):
    """Flag a systolic blood pressure reading as high.

    Returns 1 when ``sbp`` is strictly greater than 140, otherwise 0.
    """
    return 1 if sbp > 140 else 0
    
def get_high_DBP(dbp):
    """Flag a diastolic blood pressure reading as high.

    Returns 1 when ``dbp`` is strictly greater than 90, otherwise 0.
    """
    return 1 if dbp > 90 else 0

# Add 0/1 flags for high systolic and high diastolic pressure.
cardio_df['high SBP'] = cardio_df['systolic'].apply(get_high_SBP)
cardio_df['high DBP'] = cardio_df['diastolic'].apply(get_high_DBP)

# Percentage of people with CVD, split by the high-SBP flag.
high_sbp_group = cardio_df.groupby(['high SBP'], as_index=False)
high_sbp_count = high_sbp_group.count()['CVD']
high_sbp_sum = high_sbp_group.sum()['CVD']
high_sbp_percentage = high_sbp_sum / high_sbp_count * 100

# Percentage of people with CVD, split by the high-DBP flag.
high_dbp_group = cardio_df.groupby(['high DBP'], as_index=False)
high_dbp_count = high_dbp_group.count()['CVD']
high_dbp_sum = high_dbp_group.sum()['CVD']
high_dbp_percentage = high_dbp_sum / high_dbp_count * 100

# Two stacked bar charts: SBP on top, DBP below.
fig = plt.figure(figsize=(6,10))
ax1 = fig.add_subplot(2,1,1)
ax2 = fig.add_subplot(2,1,2)
ax1.bar(x=[0,1], height=high_sbp_percentage, yerr=high_sbp_percentage.std(), align='center', tick_label=["not high", "high"])
ax1.set_title('High SBP and CVD')
ax1.set_ylabel("% of people with CVD")
ax1.set_xlabel("SBP status")

ax2.bar(x=[0,1], height=high_dbp_percentage, yerr=high_dbp_percentage.std(), align='center', tick_label=["not high", "high"])
ax2.set_title('High DBP and CVD')
ax2.set_ylabel("% of people with CVD")
ax2.set_xlabel("DBP status")

# Use plt.show() like the other cells; Figure.show() warns on non-GUI backends.
plt.show()

High DBP and High SBP have similar effect on CVD. This is because when someone's blood pressure rises, both the SBP and DBP rise together. However there are rare cases where SBP rises on its own. 

We won't be able to look at trends for height and weight on their own since it's hard to determine whether a person is overweight for their height. Instead we will look at BMI, which has cut-offs that tell us if a person is overweight.

## Feature Generation

Now that we have explored the basic data we can start to generate additional features.

Body Mass Index or BMI is a measure of body fat based on height and weight that applies to adult men and women. BMI offers a quick solution to figure out if a person is obese or not and as a result if the person has a high risk of CVD.
BMI is calculated by:
    
        BMI = weight(kg) / (height(m) ^ 2)

In [None]:
# BMI = weight (kg) / height (m)^2; height is stored in cm, so divide by 100 first.
cardio_df['bmi'] = cardio_df['weight(kg)'] / (cardio_df['height(cm)'] / 100)**2

Next we need to group the BMI numbers into the weight classes that were developed by the NIH.

In [None]:
def get_bmi_groups(bmi):
    """Map a BMI value to its weight-class label.

    Every BMI below 18.5 is labelled "Underweight". Previously values below
    16 matched no branch and came back as None, which dropped the most
    underweight people from any grouping on this label.

    Returns None only when ``bmi`` cannot be compared (e.g. NaN).
    """
    # Upper bounds are exclusive; the checks run in ascending order so each
    # one only needs the upper bound of its class.
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal weight"
    if bmi < 30:
        return "Overweight"
    if bmi < 35:
        return "Obese Class I (Moderately obese)"
    if bmi < 40:
        return "Obese Class II (Severely obese)"
    if bmi < 45:
        return "Obese Class III (Very severely obese)"
    if bmi < 50:
        return "Obese Class IV (Morbidly Obese)"
    if bmi < 60:
        return "Obese Class V (Super Obese)"
    if bmi >= 60:
        return "Obese Class VI (Hyper Obese)"
    # NaN fails every comparison above and stays unclassified.
    return None
    
    
# Label every row with its BMI weight class and store the labels as a categorical column.
cardio_df["bmi_group"] = cardio_df['bmi'].apply(get_bmi_groups)
cardio_df["bmi_group"] = cardio_df["bmi_group"].astype('category')

# Let's visualize the results: percentage of people with CVD in each BMI group.

bmi_group_groups = cardio_df.groupby(['bmi_group'], as_index=False)
bmi_group_count = bmi_group_groups.count()['CVD']
bmi_group_sum = bmi_group_groups.sum()['CVD']
bmi_group_percentage = bmi_group_sum / bmi_group_count * 100

# The string below is a bare literal used as a note: it lists which bar position
# is expected to hold which BMI group.
# NOTE(review): presumably this is the sorted order of the category labels - confirm.
"""
0 = Normal Weight
1 = Obese Class 1
2 = Obese Class 2
3 = Obese Class 3
4 = Obese Class 4
5 = Obese Class 5
6 = Obese Class 6
7 = Overweight
8 = Underweight
"""

# Nine bar positions and nine tick labels are hard-coded, so the plot expects
# exactly nine groups in the order listed above.
plt.figure(figsize=(18,4))
plt.bar(x=range(0,9), height=bmi_group_percentage, yerr=bmi_group_percentage.std(), align='center',
       tick_label=["Normal Weight", "Class 1", "Class 2", "Class 3", "Class 4", "Class 5", "Class 6", "Overweight", "Underweight"])
plt.title("BMI Class vs CVD rate")
plt.xlabel("BMI Group")
plt.ylabel("% of people with CVD")
plt.show()

We can see that the normal-weight and underweight BMI groups have the lowest probability of CVD. This is because a person with low BMI would have low body fat composition and most likely low cholesterol as well. What's interesting here is that class 6, the most obese class, has lower CVD risk than overweight or class 5. I cannot find a reasonable explanation for this, but I believe it could be due to data input error.

BMI does have a few shortcomings. It underestimates fat composition for some ethnicities, such as South Asians. Pregnant women, body builders, and children will also have inaccurate BMIs.

Another way to measure CVD risk is through waist circumference (WC). WC is a measure of abdominal obesity. A person can have a normal BMI but still have a large amount of fat stored in the abdomen. Abdominal obesity is a well-researched risk factor for CVD, and it has been suggested that WC be used in conjunction with BMI to determine a person's CVD risk.

According to Bozeman et al., WC can be accurately predicted from a person's age, BMI, and race. Since we do not know the origin of this dataset or its racial makeup, we will assume everyone is Caucasian. Bozeman et al. were able to reasonably predict WC from a dataset derived mostly from Caucasian and Black individuals.

Their equation for men is:
        
        WCi = b0 + b1BMIi + b2AGEi + b3BLACKi + b4HISPi
        
While the equation for women is:
    
        WCi = c0 + c1BMIi + c2I{AGEi ≥ 35} + c3AGEi × I{AGEi ≥ 35} + c4BLACKi + c5HISPi

Lets implement these equations and predict WC.

Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3441760/

In [None]:
def predicted_men_waist(bmi, age):
    """Predict waist circumference for a man from BMI and age.

    Evaluates the linear model ``intercept + k_bmi * bmi + k_age * age``
    with fixed coefficients and no race terms.
    """
    intercept = 22.61306
    bmi_term = 2.520738 * bmi
    age_term = 0.1583812 * age
    return intercept + bmi_term + age_term

def predicted_women_waist(bmi, age):
    """Predict waist circumference for a woman from BMI and age.

    Implements the women's equation with the race terms set to zero:

        WC = c0 + c1*BMI + c2*I{AGE >= 35} + c3*AGE*I{AGE >= 35}

    Three departures from that equation are corrected here:
    * the indicator is inclusive at 35 (``>=``, not ``>``);
    * the age slope applies only when the indicator is 1;
    * the extra ``-0.6570163 * I{AGE >= 35}`` term is removed, because it
      has no counterpart in the equation once the race terms are zero.
      NOTE(review): confirm the coefficients against the source paper's table.
    """
    c0 = 28.81919
    c1 = 2.218007
    c2 = -3.688953
    c3 = 0.125975

    # Indicator I{AGE >= 35}
    age_35 = 1 if age >= 35 else 0

    return c0 + c1 * bmi + c2 * age_35 + c3 * age * age_35

def man_or_woman_waist(row):
    """Predict waist circumference for one row using the gender-specific model.

    Rows whose 'gender' equals 1 use the women's predictor; every other
    value uses the men's predictor. Both receive the row's 'bmi' and
    'age(years)'.
    """
    predictor = predicted_women_waist if row['gender'] == 1 else predicted_men_waist
    return predictor(row['bmi'], row['age(years)'])

# Predict a waist circumference for every row (row-wise apply) and store it as 'waist(cm)'.
cardio_df['waist(cm)'] = cardio_df.apply(man_or_woman_waist, axis=1)

In [None]:
# Lets visualize the effect of waist circumference on CVD using cut offs from guidelines
# Cut off for men is > 103 cm
# Cut off for women is > 88 cm

def get_if_waist_over_cutoff(row):
    """Return 1 when the row's waist exceeds the cut-off for its gender, else 0.

    Gender code 1 uses a cut-off of 88 cm and gender code 2 uses 103 cm;
    the comparison is strict. Any other gender code yields 0.
    """
    cutoff_by_gender = {1: 88, 2: 103}
    limit = cutoff_by_gender.get(row['gender'])
    if limit is not None and row['waist(cm)'] > limit:
        return 1
    return 0

# Flag every row whose predicted waist exceeds the cut-off for its gender.
cardio_df['waist cut off'] = cardio_df.apply(get_if_waist_over_cutoff, axis=1)

# Visualizing the results

# Select 'CVD' before aggregating. By this point the frame also holds the
# categorical 'bmi_group' column, and summing every column of the frame
# raises a TypeError on recent pandas versions.
waist_cut_off_group = cardio_df.groupby(['waist cut off'], as_index=False)
waist_cut_off_group_count = waist_cut_off_group['CVD'].count()['CVD']
waist_cut_off_group_sum = waist_cut_off_group['CVD'].sum()['CVD']
waist_cut_off_group_percentage = waist_cut_off_group_sum / waist_cut_off_group_count * 100

plt.bar(x=[0,1], height=waist_cut_off_group_percentage, yerr=waist_cut_off_group_percentage.std(), align='center', 
        tick_label=["Below Cut Off", "Over Cut Off"])
plt.title("Waist Cut Off vs CVD")
plt.ylabel("% of people with CVD")
plt.xlabel("Waist Cut Off or Not")
plt.show()

The authors' equation reasonably predicted the WC for men; however, for women their equation was a little less accurate. WC also stops being useful when the person is extremely obese. For instance, for a person with a BMI of 40, WC would no longer be a useful measure.

Being over the cut off is a risk factor for CVD.

Mean Arterial Pressure (MAP) is the average arterial blood pressure level through one cardiac cycle. Studies have shown that MAP can be a good predictor of CVD in young men. MAP may not be true for our population. Either way we will explore the predictive power of MAP on CVD.

MAP is mathematically defined as:

    SBP = Systolic blood pressure
    DBP = Diastolic blood pressure  
    MAP = (SBP + 2 DBP) / 3
    
DBP is multiplied by 2 because the heart contraction cycle spends 2/3 of its time in diastolic phase and 1/3 in the systolic phase. 

Source: https://www.ahajournals.org/doi/pdf/10.1161/01.HYP.36.5.801

In [None]:
def mean_arterial_pressure(row):
    """Return the mean arterial pressure for one row.

    Computed as ``(systolic + 2 * diastolic) / 3`` from the row's
    'systolic' and 'diastolic' entries.
    """
    systolic, diastolic = row['systolic'], row['diastolic']
    return (systolic + 2*diastolic) / 3

# Compute the mean arterial pressure for every row (row-wise apply) and store it as 'map'.
cardio_df['map'] = cardio_df.apply(mean_arterial_pressure, axis=1)

Pulse Pressure (PP) is the difference in pressure between the SBP and DBP. Studies have shown that PP can be a risk factor for CVD. However, one drawback of PP is that its predictive power breaks down for people with normal blood pressure (= 120/80 mmHg). For instance, a person with a BP of 160/120 has a PP of 40, while another person can have the same PP with a normal BP of 120/80.

Nonetheless, we will explore the effects of PP on CVD.

PP is defined as:

        PP = SBP - DBP

In [None]:
def get_pulse_pressure(row):
    """Return the pulse pressure for one row: 'systolic' minus 'diastolic'."""
    return row['systolic'] - row['diastolic']

# Compute the pulse pressure for every row (row-wise apply) and store it as 'pulse_pressure'.
cardio_df['pulse_pressure'] = cardio_df.apply(get_pulse_pressure, axis=1)

## Model Prediction

Let's see if we can build a model to predict whether a person has CVD or not. We will use a random forest classifier as our model because it handles categorical variables well and can provide a high level of model explainability.

We will use F1 score as our metric because we want the model to have a high level of both precision and recall (in other words, we don't want the model to misdiagnose people).

In [None]:
# Encode the BMI group as integer category codes so our classifier can handle the data.
cardio_df['bmi_group'] = cardio_df['bmi_group'].astype('category')
cardio_df['bmi_group_cat'] = cardio_df['bmi_group'].cat.codes

# Show the full list of columns as the cell output.
cardio_df.columns

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score

# Seed the forest so repeated runs of the notebook give the same model.
rf_clf = RandomForestClassifier(random_state=42)

## Let's just use all feature columns and see what happens
X = cardio_df[['gender', 'height(cm)', 'weight(kg)', 'systolic',
               'diastolic', 'cholesterol', 'glucose', 'smoker', 'alcohol',
               'physical activity', 'age(years)', 'high SBP', 'high DBP', 'bmi',
               'waist(cm)', 'waist cut off', 'map', 'pulse_pressure',
               'bmi_group_cat']]

y = cardio_df['CVD']

# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore the reported F1 score, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train on the training split and predict the held-out rows.
rf_clf.fit(X_train, y_train)
predictions = rf_clf.predict(X_test)

# F1 score of the predictions against the true labels; kept in `f1` for later comparison.
f1 = f1_score(y_test, predictions) 

print(f1)

When we put all our variables into the model we get an F1 score of ~0.67, which is not that great at all. Let's see if we can do some feature selection to improve our model.

For our feature selection, let's automate the process using recursive feature elimination. This selector works by repeatedly fitting the model on a shrinking subset of features and determining which features have the most predictive power.

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: drop one feature per step until 3 remain.
# n_features_to_select is passed by keyword; current scikit-learn rejects
# the positional form.
rfe = RFE(rf_clf, n_features_to_select=3, step=1)
# Fit on the training split only so the held-out rows do not influence
# which features are selected.
rfe.fit(X_train, y_train)

# Pair each feature name with its rank (1 = selected). The names are taken from
# the fitted frame so they cannot drift out of sync with the data.
feature_rank = pd.DataFrame()
feature_rank['Features'] = list(X_train.columns)
feature_rank['Ranking'] = rfe.ranking_
feature_rank

This is very interesting. The model confirms the current scientific evidence that BMI, waist circumference and systolic blood pressure are the biggest risk factors for CVD. Whats also interesting to note is BMI groups, waist cut offs, smoker status, and alcohol status were not as important.

Let's build another model from the top 3 features and compare its F1 score with the original.

In [None]:
# Seed the forest so repeated runs of the notebook give the same model.
top_3_features_rf_clf = RandomForestClassifier(random_state=42)

# Restrict the features to the three selected columns.
X = cardio_df[['systolic', 'bmi', 'waist(cm)']]
y = cardio_df['CVD']

# Fixed seed: the split (and the score computed from it) is reproducible
# instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

top_3_features_rf_clf.fit(X_train, y_train)
predictions = top_3_features_rf_clf.predict(X_test)

f1_top_3 = f1_score(y_test, predictions)

# Print the all-features score next to the top-3 score for comparison.
print(f1)
print(f1_top_3)

We can see that there is a ~0.02 difference between the two models. It is great that we don't need that many features to get the same predictive power, but bad that the F1 score is still pretty low.

Lets try building an ensemble model with catBoost, XGB and LightGBM and see if that improves our model.

## Ensemble Method

In [None]:
import xgboost as xgb
import catboost as cb
import lightgbm as lgb

# XGB model
data_dmatrix = xgb.DMatrix(data=X, label=y)  # not used by the code in this cell
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)
xgb_predictions_train = xgb_clf.predict(X_train)

# CatBoost model
cb_clf = cb.CatBoostClassifier(silent=True)
cb_clf.fit(X_train, y_train)
cb_predictions_train = cb_clf.predict(X_train)

# LightGBM model
lg_clf = lgb.LGBMClassifier(silent=True)
lg_clf.fit(X_train, y_train)
lg_predictions_train = lg_clf.predict(X_train)

# Build the meta-model input: the original features plus each base model's prediction.
# Work on a copy so the prediction columns are not written into X_train itself
# (plain assignment would alias the training frame and mutate it).
ensemble_df = X_train.copy()
ensemble_df['xgb'] = xgb_predictions_train
ensemble_df['cb'] = cb_predictions_train
ensemble_df['lg'] = lg_predictions_train

# Final ensemble model will be XGB
# NOTE(review): the base-model predictions above are made on the same rows the
# base models were trained on; out-of-fold predictions may be a better input
# for the final model.
final_clf = xgb.XGBClassifier()
final_clf.fit(ensemble_df, y_train)

In [None]:
# Show the first rows of the meta-model input as the cell output.
ensemble_df.head()

In [None]:
def ensemble_predict(X_test, xgb_clf, cb_clf, lg_clf, final_clf):
    """Predict with the stacked ensemble.

    Each base classifier predicts on ``X_test``; the predictions are added
    to a deep copy of ``X_test`` as the columns 'xgb', 'cb' and 'lg' (in
    that order), the resulting column index is printed, and the final
    classifier predicts on the augmented frame. ``X_test`` is not modified.

    Returns whatever ``final_clf.predict`` returns.
    """
    stacked = X_test.copy(deep=True)
    base_models = (('xgb', xgb_clf), ('cb', cb_clf), ('lg', lg_clf))
    for column, model in base_models:
        stacked[column] = model.predict(X_test)
    print(stacked.columns)
    return final_clf.predict(stacked)


# Run the stacked ensemble on the held-out rows.
final_predictions = ensemble_predict(X_test, xgb_clf, cb_clf, lg_clf, final_clf)

In [None]:
# F1 score of the ensemble's predictions on the held-out rows.
ensemble_f1 = f1_score(y_test, final_predictions)
print(ensemble_f1)

Unfortunately, ensembling did not increase the F1 score at all. In fact, the F1 score dropped compared with the individual F1 scores of the gradient-boosted algorithms.