In [1]:
import pandas as pd
import numpy as np
import os

pwd = os.getcwd()
pwd

'/home/sting/Documents/Data_Science'

In [3]:
df1 = pd.read_csv(pwd + "/Heart_Failure_Prediction/heart.csv")
df1.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
#dummies for categorical values
df1 = pd.get_dummies(df1, drop_first=True)
df1.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


In [5]:
X = df1.drop(["HeartDisease"], axis=1).values
y = df1.HeartDisease

X.shape

#X - features, y - target

(918, 15)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train.shape

(688, 15)

In [7]:
#1. Decision Tree model
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

DecisionTreeClassifier()

In [8]:
#2. Random Forest model
from sklearn.ensemble import RandomForestClassifier

forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train)


RandomForestClassifier()

In [9]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

model_list = [forest_model, tree_model]

def model_eval(model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("metrics for " + str(model))
    print("accuracy : {}".format(accuracy) + "\n" 
      +   "precision : {}".format(precision) + "\n"
      +   "recall : {}".format(recall) + "\n")

for model in model_list:
    model_eval(model)

metrics for RandomForestClassifier()
accuracy : 0.8652173913043478
precision : 0.8854961832061069
recall : 0.8787878787878788

metrics for DecisionTreeClassifier()
accuracy : 0.8043478260869565
precision : 0.8320610687022901
recall : 0.8257575757575758



In [10]:
#cross-validation test on training set for Linear Reg model, recall scoring
from sklearn.model_selection import cross_val_score

cvl_score = cross_val_score(tree_model, X_train, y_train, cv=3, scoring="recall")
cvl_score

array([0.73809524, 0.808     , 0.784     ])

In [11]:
#cross-validation test on training set for Random Forest model, recall scoring
from sklearn.model_selection import cross_val_score

cvl_score = cross_val_score(forest_model, X_train, y_train, cv=3, scoring="recall")
cvl_score

array([0.8968254, 0.936    , 0.92     ])

In [12]:
from sklearn.model_selection import RandomizedSearchCV

#parameter grid. We will try and take the most useful parameters
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
    'bootstrap' : bootstrap
}

#defining the search model 
random_rf = RandomizedSearchCV(forest_model, param_grid,n_iter=100, scoring='recall', verbose=2, cv=3)

#fitting the grid search model
random_rf.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1800; total time=  10.3s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1800; total time=  11.7s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1800; total time=  11.9s
[CV] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   4.5s
[CV] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   4.6s
[CV] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   4.6s
[CV] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimat

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   scoring='recall', verbose=2)

In [13]:
random_rf.best_score_

0.8840236686390531

In [14]:
best_forest = random_rf.best_estimator_

In [15]:
model_eval(best_forest)

metrics for RandomForestClassifier(max_depth=60, max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=10, n_estimators=600)
accuracy : 0.9304347826086956
precision : 0.9461538461538461
recall : 0.9318181818181818



In [16]:
#using joblib

import joblib

joblib.dump(best_forest, "heart_disease_model.pkl", compress=5)

['heart_disease_model.pkl']