In [1]:
import pandas as pd

import prepare
import model

In [2]:
# Read the raw stroke CSV, then pass it through prepare.prep_data to obtain
# the explore-stage frame that the rest of the notebook works from.
raw_strokes = pd.read_csv('healthcare-dataset-stroke-data.csv')
df = prepare.prep_data(raw_strokes)
# Preview the first three rows of the prepared frame.
df.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_range
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,65-70
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,30.6,never smoked,1,60-65
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,75-80


In [3]:
# prepare.model_prep hands back the six modeling splits in this order.
# Per the original note it encodes, splits, scales, and applies SMOTE to the
# explore-stage data (the helper prints its own before/after SMOTE shapes).
(X_train, y_train,
 X_validate, y_validate,
 X_test, y_test) = prepare.model_prep(df)

# Sanity check: report the shape of every split, one line per split.
split_report = [
    ('X_train with SMOTE:', X_train),
    ('y_train with SMOTE:', y_train),
    ('X_validate:', X_validate),
    ('y_validate:', y_validate),
    ('X_test:', X_test),
    ('y_test:', y_test),
]
for label, split in split_report:
    print(label, split.shape)

Before SMOTE applied: (3065, 31) (3065,)
After SMOTE applied: (5812, 31) (5812,)
X_train with SMOTE: (5812, 31)
y_train with SMOTE: (5812,)
X_validate: (1022, 31)
y_validate: (1022,)
X_test: (1022, 31)
y_test: (1022,)


In [4]:
# Fit several classification models in one pass ("shotgun" approach) and keep
# the two results the helper returns: one for the training split and one for
# the validation split.
shotgun_results = model.classification_shotgun(
    X_train, y_train, X_validate, y_validate
)
y_train_predictions, y_validate_predictions = shotgun_results

In [5]:
# Pick the baseline guess by hand rather than deriving it from the data.
# NOTE(review): this cell rebinds the same names it takes as input, so
# re-running it without first re-running the shotgun cell feeds already
# baseline-augmented predictions back in -- confirm manual_baseline tolerates that.
baseline_guess = 1  # presumably the positive `stroke` label -- TODO confirm
(y_train_predictions,
 y_validate_predictions) = model.manual_baseline(
    y_train_predictions, y_validate_predictions, baseline_guess
)

In [6]:
# Score every set of predictions: accuracy and recall, both in-sample (train)
# and out-of-sample (validate). The helper's return value is kept as running_df.
running_df = model.print_classification_results(
    y_train_predictions,
    y_validate_predictions,
)

In [7]:
# Rank the models for display: best out-of-sample recall first, with ties on
# recall broken by out-of-sample accuracy (both descending).
ranking_columns = ['OutSample_Recall', 'OutSample_Accuracy']
running_df.sort_values(by=ranking_columns, ascending=False)

Unnamed: 0,Model,InSample_Accuracy,OutSample_Accuracy,InSample_Recall,OutSample_Recall
14,nb_vsmooth1e-07,0.701136,0.449119,0.990021,1.0
13,nb_vsmooth1e-08,0.691672,0.434442,0.990365,1.0
12,nb_vsmooth1e-09,0.691156,0.432485,0.990709,1.0
27,manual_baseline,0.5,0.043053,1.0,1.0
3,tree_maxdepth3,0.813145,0.783757,0.879215,0.977273
1,tree_maxdepth1,0.787853,0.746575,0.868548,0.977273
2,tree_maxdepth2,0.787853,0.746575,0.868548,0.977273
19,nb_vsmooth0.01,0.747763,0.59002,0.945974,0.977273
18,nb_vsmooth0.001,0.748624,0.585127,0.955609,0.977273
17,nb_vsmooth0.0001,0.752409,0.582192,0.966621,0.977273
