dataset: https://www.kaggle.com/rashikrahmanpritom/heart-attack-analysis-prediction-dataset
Objective: build a predictor of heart-attack likelihood from the dataset linked above.

In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from pandas_profiling import ProfileReport

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import  LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier

Splitting the data into training and test sets

In [2]:
# Read the raw dataset from "heart.csv" in the current working directory.
df = pd.read_csv(filepath_or_buffer="heart.csv")

In [3]:
# Shuffle the rows with a fixed seed and hold out a quarter of them for testing
# (0.25 is the value train_test_split uses when no size is given).
df_train, df_test = train_test_split(
    df,
    test_size=0.25,
    shuffle=True,
    random_state=123,
)

In [4]:
# Plain-text preview of the first five training rows, followed by the column
# index (displayed as the cell's output value).
print(df_train.head(n=5))
df_train.columns

     age  sex  cp  trtbps  chol  fbs  restecg  thalachh  exng  oldpeak  slp  \
36    54    0   2     135   304    1        1       170     0      0.0    2   
148   44    1   2     120   226    0        1       169     0      0.0    2   
21    44    1   2     130   233    0        1       179     1      0.4    2   
187   54    1   0     124   266    0        0       109     1      2.2    1   
161   55    0   1     132   342    0        1       166     0      1.2    2   

     caa  thall  output  
36     0      2       1  
148    0      2       1  
21     0      2       1  
187    1      3       0  
161    0      2       1  


Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

In [5]:
# Columns left out of the feature matrix (original author's note: unsure what
# these columns represent).
drop_cols = ['oldpeak', 'slp', 'thall']

# The target is the 'output' column of each split.
y_train = df_train['output']
y_test = df_test['output']

# Features: everything except the target and the excluded columns.
X_train = df_train.drop(columns=['output']).drop(columns=drop_cols)
X_test = df_test.drop(columns=['output']).drop(columns=drop_cols)

X_train.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,caa
36,54,0,2,135,304,1,1,170,0,0
148,44,1,2,120,226,0,1,169,0,0
21,44,1,2,130,233,0,1,179,1,0
187,54,1,0,124,266,0,0,109,1,1
161,55,0,1,132,342,0,1,166,0,0


In [6]:
# Every retained column currently goes through the numeric branch; the
# categorical branch receives no columns.
# NOTE(review): 'sex', 'cp', 'fbs', 'restecg', 'exng' and 'caa' look like
# category codes rather than measurements — confirm whether scaling them as
# numbers is intended.
numeric_features = ['age', 'trtbps', 'chol', 'thalachh', 'sex','cp','fbs','restecg','exng','caa']
categorical_features = []

# Numeric branch: fill gaps with the column median, then standardise.
numeric_preprocessing = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='median')),
    ('standardscaler', StandardScaler()),
])

# Categorical branch: fill gaps with the constant "?", then one-hot encode into
# a dense array, ignoring categories unseen during fit.
# NOTE(review): newer scikit-learn releases spell `sparse` as `sparse_output` —
# confirm against the installed version before upgrading.
categorical_preprocessing = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value="?")),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route each column list to its branch; columns in neither list are dropped
# (ColumnTransformer's default for the remainder).
preprocessing = ColumnTransformer(transformers=[
    ('numeric', numeric_preprocessing, numeric_features),
    ('categorical', categorical_preprocessing, categorical_features),
])

In [7]:
# Baseline: logistic regression with default settings behind the shared
# preprocessing step. fit() returns the pipeline itself, so it can be chained.
lr = LogisticRegression()
lr_pipe = make_pipeline(preprocessing, lr).fit(X_train, y_train)

# Accuracy measured on the same rows the model was trained on.
print(lr_pipe.score(X_train, y_train))


0.8237885462555066


In [8]:
# LightGBM with default settings behind the shared preprocessing step.
lgbm = LGBMClassifier()
lgbm_pipe = make_pipeline(preprocessing, lgbm)

# Last expression of the cell: the fitted pipeline is displayed as its output.
lgbm_pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['age', 'trtbps', 'chol',
                                                   'thalachh', 'sex', 'cp',
                                                   'fbs', 'restecg', 'exng',
                                                   'caa']),
                                                 ('categorical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='?',
       

Doing some basic cross-validation (CV) to tune the hyperparameters

In [9]:
# Candidate LightGBM settings for the grid search. The "lgbmclassifier__"
# prefix routes each value to the LGBMClassifier step of the pipeline.
# NOTE(review): these num_leaves values look large for a dataset of this size —
# consider whether a smaller range would be more informative.
param_choices = dict([
    ("lgbmclassifier__num_leaves", [120, 300, 500]),
    ("lgbmclassifier__max_bin", [100, 150, 200]),
])

In [10]:
# Exhaustive search over param_choices with 10-fold cross-validation on the
# training split, using all available cores.
grid_search = GridSearchCV(
    estimator=lgbm_pipe,
    param_grid=param_choices,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination, then its mean cross-validated score.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.2s


{'lgbmclassifier__max_bin': 150, 'lgbmclassifier__num_leaves': 120}
0.7798418972332015


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    3.7s finished


Fitting a new LGBM model with the best hyperparameters found by cross-validation

In [11]:
# Rebuild the LightGBM pipeline with whichever hyperparameters the grid search
# selected, instead of copying them by hand from a previous run's printed
# output (which silently goes stale if the grid or the data changes).
new_lgbm = LGBMClassifier()
new_lgbm_pipe = make_pipeline(preprocessing, new_lgbm)

# best_params_ keys carry the "lgbmclassifier__" step prefix, which matches the
# step name make_pipeline assigns here, so they can be applied directly.
new_lgbm_pipe.set_params(**grid_search.best_params_)
new_lgbm_pipe.fit(X_train, y_train)

# Accuracy on the held-out test split (displayed as the cell output).
new_lgbm_pipe.score(X_test, y_test)

0.7763157894736842

Test-set score from the tuned LGBMClassifier: 0.776

In [12]:
# Random forest with default settings behind the shared preprocessing step.
# The forest is seeded (same seed as the train/test split) so that this fit,
# and the grid search that clones this pipeline, give the same result on every
# run.
rf = RandomForestClassifier(random_state=123)
rf_pipe = make_pipeline(preprocessing, rf)

# Last expression of the cell: the fitted pipeline is displayed as its output.
rf_pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['age', 'trtbps', 'chol',
                                                   'thalachh', 'sex', 'cp',
                                                   'fbs', 'restecg', 'exng',
                                                   'caa']),
                                                 ('categorical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='?',
       

In [13]:
# Candidate random-forest settings for the grid search. The
# "randomforestclassifier__" prefix routes each value to the forest step of
# the pipeline.
rfparam_choices = dict([
    ("randomforestclassifier__max_depth", [None, 20, 40, 60, 80]),
    ("randomforestclassifier__n_estimators", [100, 200, 400, 800, 1200]),
])

In [14]:
# Exhaustive search over rfparam_choices with 10-fold cross-validation on the
# training split, using all available cores.
rfgrid_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=rfparam_choices,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
rfgrid_search.fit(X_train, y_train)

# Winning parameter combination, then its mean cross-validated score.
print(rfgrid_search.best_params_, rfgrid_search.best_score_, sep="\n")

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   29.6s


{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 100}
0.8369565217391303


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   52.2s finished


In [15]:
# Rebuild the random-forest pipeline with whichever hyperparameters the grid
# search selected, instead of copying them by hand from a previous run's
# printed output. The forest is seeded (same seed as the train/test split) so
# the reported test score and the feature importances are reproducible.
new_rf = RandomForestClassifier(random_state=123)
new_rf_pipe = make_pipeline(preprocessing, new_rf)

# best_params_ keys carry the "randomforestclassifier__" step prefix, which
# matches the step name make_pipeline assigns here, so they apply directly.
new_rf_pipe.set_params(**rfgrid_search.best_params_)
new_rf_pipe.fit(X_train, y_train)

# Accuracy on the held-out test split (displayed as the cell output).
new_rf_pipe.score(X_test, y_test)

0.75

Test-set score from the tuned RFClassifier: 0.75 in the run shown above (an earlier run reported 0.789).

In [16]:
# One importance value per input column, taken from the fitted forest (the
# second pipeline step). Labelling rows with numeric_features relies on the
# preprocessing step emitting exactly those columns, in that order.
rf_importances = pd.DataFrame(
    {"Importance": new_rf_pipe[1].feature_importances_},
    index=numeric_features,
)

# Most influential features first.
rf_importances.sort_values("Importance", ascending=False)

Unnamed: 0,Importance
caa,0.189364
cp,0.169412
thalachh,0.152336
age,0.130469
chol,0.114833
trtbps,0.090168
exng,0.073834
sex,0.047249
restecg,0.019262
fbs,0.013074


TODO: tune the models further, and check for and deal with outliers.

In [19]:
# Export step: hand this notebook's file name and the target format "html" to
# canvasutils' convert_notebook helper.
# NOTE(review): presumably writes an HTML copy of the notebook for submission —
# confirm the output location against the canvasutils documentation.
from canvasutils.submit import convert_notebook
convert_notebook("HeartAttackPred.ipynb", "html")

Notebook successfully converted! 
