In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import prepare
import model
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [2]:
# prepare data using script
df = prepare.prep_data(df).drop(columns=['age_range', 'id'])
df.stroke.dtype

dtype('O')

In [3]:
# cast stroke column as int
df['stroke'] = df['stroke'].astype('int64')
# check work
df.shape

(5109, 11)

In [4]:
# check work
df.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,30.640331,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1


In [5]:
# set list of columns to one-hot encode
col_list = ['gender','ever_married','work_type','residence_type','smoking_status']
# apply one-hot encoding using above list
df = pd.get_dummies(df, columns=col_list, drop_first=True)

In [6]:
# split using same random state as explore stage
trainvalidate, test = train_test_split(df, test_size=.2, random_state=777)
train, validate = train_test_split(trainvalidate, test_size=.25, random_state=777)
# check work
train.shape, validate.shape, test.shape

((3065, 16), (1022, 16), (1022, 16))

In [7]:
# isolate target
X_train, y_train = train.drop(columns='stroke'), train.stroke
X_validate, y_validate = validate.drop(columns='stroke'), validate.stroke
X_test, y_test = test.drop(columns='stroke'), test.stroke

In [8]:
# apply scaling using a MinMaxScaler
scaler, X_train_scaled, X_validate_scaled, X_test_scaled = model.Min_Max_Scaler(X_train, X_validate, X_test)

In [9]:
# build & fit models, append model predictions to the train and validate actuals dataframes
y_train_predictions,\
y_validate_predictions = model.classification_shotgun(X_train_scaled, y_train, X_validate_scaled, y_validate)

In [None]:
y_train_predictions,\
y_validate_predictions = model.manual_baseline(y_train_predictions, y_validate_predictions, 1)

In [None]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
y_train_predictions

In [None]:
# calculate accuracy and recall for in- and out-sample predictions
running_df = model.print_classification_results(y_train_predictions, y_validate_predictions)

In [None]:
# display accuracies and recalls for each model, sorting for best out-sample recall
running_df.sort_values(by='OutSample_Recall', ascending=False)

1. Use SMOTE on train and re-calc models
2. Plot ROC curves for train/validate

Might want to think about giving hospitals notice about who is most at-risk!!!

- PCA - Dimensionality Reduction (in relation to feature engineering) (avoid for now)
    * Will reduce 'visibility' of drivers, not nearly as good as 'intentional' design

ROC Curve (definitely implement)
- Done after modeling, another score to use
- https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

Probabiltiy can be returned from classification, in this case, it's the risk

In [21]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize

In [24]:
y = label_binarize(df.stroke, classes=[0,1])
X = df.drop(columns=['stroke'])
n_classes = y.shape[1]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .4, random_state=123)

In [28]:
clf = OneVsRestClassifier(SVC(kernel="linear", probability=True, random_state=123))
y_score = clf.fit(X_train, y_train).decision_function(X_test)

In [31]:
y_train[:,0]

array([-1.02805908, -1.03012164, -1.02069689, ..., -1.01695888,
       -1.02173508, -1.02841081])

In [30]:
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_train[:,i], y_score[:,i])
    roc_auc[i] = auc(fpr[i], tpr[i])

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
fpr, tpr, thresholds = roc_curve(y_train_predictions.in_actuals, y_train_predictions['nb_vsmooth1e-05'])

In [None]:
roc_auc = roc_auc_score(y_train_predictions.in_actuals, y_train_predictions['nb_vsmooth1e-05'])

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier()

dt = dt.fit(X_train_scaled, y_train)
y_score = dt.predict_proba(X_train_scaled)



In [None]:
roc_curve(y_train, y_score[:,0])

In [None]:
fpr[0], tpr[0], _ = roc_curve(y_train, y_score[:,0])
roc_auc[0] = auc(fpr[0], tpr[0])

In [None]:
roc_curve(y_train.ravel(), y_score.ravel())

In [None]:
import matplotlib.pyplot as plt

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='orange', lw=2, label ="ROC Curve (area = %0.2f)" % roc_auc)
# plt.plot([0, 1], [0, 1], color = 'navy')
# plt.xlim([0, 1])
# plt.ylim([0, 1.05])