In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import prepare
import model
# load the stroke dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')

In [2]:
# hand the loaded frame to the project's prep script, then drop the
# 'age_range' and 'id' columns from its result
df = prepare.prep_data(df).drop(['age_range', 'id'], axis=1)
# look up the dtype of the stroke column (shown as the cell output)
df.dtypes['stroke']

dtype('O')

In [3]:
# convert the stroke column to 64-bit integers; every other column keeps its dtype
df = df.astype({'stroke': 'int64'})
# confirm the frame's dimensions
df.shape

(5109, 11)

In [4]:
# preview the first three rows after the prep and cast steps
df.head(n=3)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,30.640331,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1


In [5]:
# columns that will be replaced by indicator (dummy) columns
col_list = [
    'gender',
    'ever_married',
    'work_type',
    'residence_type',
    'smoking_status',
]
# one-hot encode those columns, dropping the first level of each
df = pd.get_dummies(data=df, columns=col_list, drop_first=True)

In [6]:
# two-stage split, reusing the random state from the explore stage:
# first hold out 20% as test, then take 25% of the remainder as validate
trainvalidate, test = train_test_split(df, random_state=777, test_size=0.2)
train, validate = train_test_split(trainvalidate, random_state=777, test_size=0.25)
# confirm the size of each partition
tuple(part.shape for part in (train, validate, test))

((3065, 16), (1022, 16), (1022, 16))

In [7]:
# separate the feature columns from the target ('stroke') for each partition
X_train = train.drop(columns='stroke')
y_train = train['stroke']

X_validate = validate.drop(columns='stroke')
y_validate = validate['stroke']

X_test = test.drop(columns='stroke')
y_test = test['stroke']

In [8]:
# scale the three feature sets with the project's Min_Max_Scaler helper;
# it returns four values, the first of which is kept as `scaler`
# NOTE(review): presumably the scaler is fit on X_train only - confirm in model.py
(
    scaler,
    X_train_scaled,
    X_validate_scaled,
    X_test_scaled,
) = model.Min_Max_Scaler(X_train, X_validate, X_test)

In [9]:
# build and fit the candidate models through the project's helper, which
# returns one predictions frame for the train split and one for validate
y_train_predictions, y_validate_predictions = model.classification_shotgun(
    X_train_scaled,
    y_train,
    X_validate_scaled,
    y_validate,
)

In [10]:
# lift pandas' display limits so no rows or columns are truncated
# (these are global options and stay in effect for later cells)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# show the train-split predictions frame
y_train_predictions

Unnamed: 0,in_actuals,baseline,cv_tree_maxdepth1,cv_tree_maxdepth2,cv_tree_maxdepth3,cv_tree_maxdepth5,cv_tree_maxdepth10,rf_depth1,rf_depth2,rf_depth3,rf_depth5,rf_depth10,logit,nb_vsmooth1e-09,nb_vsmooth1e-08,nb_vsmooth1e-07,nb_vsmooth1e-06,nb_vsmooth1e-05,nb_vsmooth0.0001,nb_vsmooth0.001,nb_vsmooth0.01,nb_vsmooth10,nb_vsmooth100,knn_n3,knn_n5,knn_n10,knn_n25,knn_n75
448,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0
4128,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
4699,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0
5035,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0
3007,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
3212,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
2154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1695,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0
4451,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
2380,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# score the in-sample (train) and out-of-sample (validate) predictions with
# the project's helper; the result holds accuracy and recall per model
running_df = model.print_classification_results(
    y_train_predictions,
    y_validate_predictions,
)

In [12]:
# list every model's accuracy and recall, ranked from highest to lowest
# out-of-sample recall
running_df.sort_values('OutSample_Recall', ascending=False)

Unnamed: 0,Model,InSample_Accuracy,OutSample_Accuracy,InSample_Recall,OutSample_Recall
13,nb_vsmooth1e-08,0.377162,0.383562,0.980132,1.0
12,nb_vsmooth1e-09,0.351387,0.354207,0.980132,1.0
14,nb_vsmooth1e-07,0.424144,0.428571,0.953642,0.931818
15,nb_vsmooth1e-06,0.497879,0.510763,0.92053,0.909091
16,nb_vsmooth1e-05,0.578467,0.588063,0.89404,0.886364
17,nb_vsmooth0.0001,0.688091,0.695695,0.807947,0.795455
18,nb_vsmooth0.001,0.769984,0.785714,0.695364,0.659091
19,nb_vsmooth0.01,0.825122,0.83953,0.569536,0.545455
5,cv_tree_maxdepth10,0.980098,0.949119,0.609272,0.136364
22,knn_n3,0.956281,0.951076,0.172185,0.022727
