# Stroke Prediction Exploration
Dataset from Kaggle: https://www.kaggle.com/fedesoriano/stroke-prediction-dataset

In [None]:
# Imports
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Data Parsing
# Read the stroke CSV into a DataFrame, preview the first rows and report the row count.
stroke_df: pd.DataFrame = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
display(stroke_df.head())
print(f'Total rows: {stroke_df.shape[0]}')

In [None]:
# NaN/NA analysis
# Show the per-column missing-value counts before and after imputing bmi.
print('NA counts:')
display(stroke_df.isna().sum())
# Fill bmi NaN with mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the `stroke_df.bmi` accessor: that form is a chained
# in-place update, which pandas warns about and which leaves the frame
# unchanged when copy-on-write is active.
stroke_df['bmi'] = stroke_df['bmi'].fillna(stroke_df['bmi'].mean())
print('NA counts:')
display(stroke_df.isna().sum())

In [None]:
# Columns treated as categorical (one bar chart each below, one-hot encoded later).
categorical_cols = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# Uniques
print('Categorical features uniques count:')
display(stroke_df[categorical_cols].nunique())

plt.style.use('ggplot')
fig, ax = plt.subplots(len(categorical_cols), 1, figsize=(16,4*len(categorical_cols)))

for ii, col in enumerate(categorical_cols):
    # Labels and heights must come from the SAME value_counts() result.
    # Pairing .unique() (order of first appearance) with .value_counts()
    # (sorted by frequency) attaches counts to the wrong category whenever
    # the two orderings differ.
    counts = stroke_df[col].value_counts()
    ax[ii].set_title(col)
    ax[ii].bar(x=[str(category) for category in counts.index], height=counts.to_numpy(), color='tab:blue')

In [None]:
# Continuous features explored in this cell.
numeric_cols = ['age', 'avg_glucose_level', 'bmi']

# Plot inter-feature correlations
print('Inter-feature correlations:')
heatmap_side = len(numeric_cols)/1.5
plt.figure(figsize=(heatmap_side, heatmap_side))
corr = stroke_df[numeric_cols].corr()
# Hide the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(
    corr,
    mask=mask,
    cmap="RdBu",
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.set(font_scale=1)

# Plot numeric distributions (histograms)
plt.style.use('ggplot')
fig, ax = plt.subplots(len(numeric_cols), 1, figsize=(16,4*len(numeric_cols)))
for axis, col in zip(ax, numeric_cols):
    axis.set_title(col)
    axis.hist(x=stroke_df[col], color='tab:blue')

In [None]:
# Cleaning
# Build clean_stroke_df from a deep copy of stroke_df so the raw frame stays untouched.
print(f'Uncleaned row count: {len(stroke_df)}')

clean_stroke_df = stroke_df.copy(deep=True)

# Remove low count categories (optional filters, currently disabled)
# clean_stroke_df = clean_stroke_df[clean_stroke_df.gender != 'Other']
# clean_stroke_df = clean_stroke_df[clean_stroke_df.work_type != 'Never_worked']

# Remove NA
clean_stroke_df = clean_stroke_df.dropna()

# Remove Outliers - set threshold to None to not remove
extreme_outlier_thresholds = {
    'age': None,
    'avg_glucose_level': None,
    'bmi': None
}
for k, v in extreme_outlier_thresholds.items():
    # Compare against None explicitly: a plain truthiness test would also
    # skip a legitimate numeric threshold of 0.
    if v is not None:
        clean_stroke_df = clean_stroke_df[clean_stroke_df[k] <= v]

# Reset index
clean_stroke_df = clean_stroke_df.reset_index(drop=True)

print(f'Cleaned row count: {len(clean_stroke_df)}')

In [None]:
# Feature Engineering
# z-score the three numeric columns, then add their pairwise and three-way products.
for source_col, z_col in (
    ('age', 'age_stdised'),
    ('bmi', 'bmi_stdised'),
    ('avg_glucose_level', 'gluc_stdised'),
):
    values = clean_stroke_df[source_col]
    clean_stroke_df[z_col] = (values - values.mean()) / values.std()

age_z = clean_stroke_df['age_stdised']
bmi_z = clean_stroke_df['bmi_stdised']
gluc_z = clean_stroke_df['gluc_stdised']

clean_stroke_df['age_bmi'] = age_z * bmi_z
clean_stroke_df['age_gluc'] = age_z * gluc_z
clean_stroke_df['gluc_bmi'] = gluc_z * bmi_z
clean_stroke_df['age_gluc_bmi'] = age_z * gluc_z * bmi_z

# One Hot Encoding
print('Unencoded Data:')
display(clean_stroke_df)
enc_stroke_df = pd.get_dummies(data=clean_stroke_df, columns=categorical_cols)
print('One Hot Encoded Data:')
display(enc_stroke_df)

In [None]:
# Over sample minority class
# Every encoded column except the target and the row id is a model feature.
feature_cols = [x for x in enc_stroke_df.columns if x not in {'stroke', 'id'}]
target_col = 'stroke'

# Seeded so the synthetic samples are the same on every run
# (73 matches the random_state used by the models in this notebook).
# NOTE(review): this resamples the full dataset; any evaluation that splits
# smoted_df into train/test afterwards will see synthetic rows on both sides.
smote = SMOTE(random_state=73)
x_smote, y_smote = smote.fit_resample(enc_stroke_df[feature_cols], enc_stroke_df[[target_col]])

smoted_df = pd.concat([x_smote, y_smote], axis=1)

# Class balance before and after oversampling.
plt.style.use('ggplot')
fig, ax = plt.subplots(2, 1, figsize=(16,10))
for axis, title, frame in zip(ax, ['Original', 'SMOTE'], [stroke_df, smoted_df]):
    # Take labels and heights from one value_counts() result so each bar is
    # labelled with its own class; .unique() follows order of appearance and
    # does not line up with value_counts()' frequency ordering.
    class_counts = frame[target_col].value_counts().sort_index()
    axis.bar(x=[str(label) for label in class_counts.index], height=class_counts.to_numpy(), color='tab:blue')
    axis.set_title(title)

plt.show()

In [None]:
# Test train splits
# Split the real (not oversampled) rows first, stratified on the target so both
# partitions keep the original class ratio. Splitting an already-oversampled
# frame would put synthetic rows in the test set and let synthetic neighbours
# of test rows into training, which inflates every reported score.
train_real_df, test_df = train_test_split(
    enc_stroke_df,
    test_size=0.2,
    stratify=enc_stroke_df[target_col],
    random_state=73
)

# Oversample the minority class in the training partition only; the test
# partition stays made of untouched, real observations.
x_train_smote, y_train_smote = SMOTE(random_state=73).fit_resample(
    train_real_df[feature_cols],
    train_real_df[[target_col]]
)
train_df = pd.concat([x_train_smote, y_train_smote], axis=1)

# Convert to numpy arrays
# dtype=float keeps the feature matrix numeric even when the one-hot columns
# are boolean (mixed bool/float columns would otherwise give an object array).
x_train = train_df[feature_cols].to_numpy(dtype=float)
y_train = train_df[target_col].to_numpy()
x_test = test_df[feature_cols].to_numpy(dtype=float)
y_test = test_df[target_col].to_numpy()

# Initialise list to hold model f1 scores
f1_results = list()

# Boolean flag to perform grid search (True) or use predetermined parameters (False) 
grid_search = False

In [None]:
# Define function for generating confusion matrix with Precision-Recall and ROC curves
def score_model(model,
                train_label,
                test_label,
                train_pred,
                test_pred,
                train_pred_proba,
                test_pred_proba):
    """Print classification metrics and plot PR / ROC curves for one model.

    For the training and the test set this prints a 2x2 confusion matrix,
    precision, recall, ROC AUC and average precision, then draws the
    precision-recall curves (top) and ROC curves (bottom) of both sets.

    Everything is computed from the arguments only; no notebook-level
    variables are read.

    Parameters
    ----------
    model : fitted estimator
        Kept for backward compatibility with existing calls; not used.
    train_label, test_label : array-like of {0, 1}
        True labels of the training / test set.
    train_pred, test_pred : array-like of {0, 1}
        Hard class predictions for the training / test set.
    train_pred_proba, test_pred_proba : array-like of float
        Positive-class scores for the training / test set.

    Raises
    ------
    ValueError
        Propagated from scikit-learn, e.g. when ROC AUC is requested for
        labels that contain a single class.
    """
    splits = [
        ('TRAINING SET', 'Train', 'b', train_label, train_pred, train_pred_proba),
        ('TEST SET', 'Test', 'r', test_label, test_pred, test_pred_proba),
    ]

    for heading, _, _, label_actual, label_pred, label_pred_prob in splits:
        print('\n{:s}'.format(heading))
        # labels=[0, 1] always yields a 2x2 matrix with rows/columns in
        # (no, yes) order, even if only one class is present in the data.
        cm = metrics.confusion_matrix(label_actual, label_pred, labels=[0, 1])
        df_cm = pd.DataFrame(cm, index=['Actual no', 'Actual yes'], columns=['Predicted no', 'Predicted yes'])
        display(df_cm)
        print('Precision: {:,.1f}%'.format(metrics.precision_score(label_actual, label_pred) * 100))
        print('Recall: {:,.1f}%'.format(metrics.recall_score(label_actual, label_pred) * 100))
        print('ROC AUC: {:.2f}'.format(metrics.roc_auc_score(label_actual, label_pred_prob)))
        print('Average Precision: {:.2f}'.format(metrics.average_precision_score(label_actual, label_pred_prob)))
        print('')

    fig, ax = plt.subplots(2, 1, figsize=(16,10))

    # Plot Precision-Recall Curve
    # Baseline = share of positives in the test labels (precision of a
    # classifier that flags everything as positive).
    baseline_precision = float(np.mean(np.asarray(test_label) == 1))
    ax[0].plot([0, 1], [baseline_precision, baseline_precision], 'k', linestyle='--', label='Baseline (AP: {:.2f})'.format(baseline_precision))
    for _, name, colour, label_actual, _, label_pred_prob in splits:
        precision, recall, _ = metrics.precision_recall_curve(label_actual, label_pred_prob)
        avg_precision = metrics.average_precision_score(label_actual, label_pred_prob)
        ax[0].plot(recall, precision, color=colour, drawstyle='steps-post', label='{:s} (AP: {:.2f})'.format(name, avg_precision))
    ax[0].legend()
    ax[0].set_xlabel('Recall (True Positive Rate)', fontsize=12)
    ax[0].set_ylabel('Precision', fontsize=12)
    ax[0].set_title('Precision-Recall Curves', fontsize=14)
    ax[0].set_xlim([-.05,1.05])
    ax[0].set_ylim([-.05,1.05])
    ax[0].tick_params(labelsize=10)

    # Plot ROC curve
    ax[1].plot([0, 1], [0, 1], 'k', linestyle='--', label='Baseline (ROC score: 0.5)')
    for _, name, colour, label_actual, _, label_pred_prob in splits:
        fpr, tpr, _ = metrics.roc_curve(label_actual, label_pred_prob)
        roc_auc = metrics.roc_auc_score(label_actual, label_pred_prob)
        ax[1].plot(fpr, tpr, colour, label='{:s} (ROC score: {:.2f})'.format(name, roc_auc))
    ax[1].legend()
    ax[1].set_xlabel('False Positive Rate', fontsize=12)
    ax[1].set_ylabel('True Positive Rate (Recall)', fontsize=12)
    ax[1].set_title('ROC Curves', fontsize=14)

    plt.show()

In [None]:
# XgBoost Model
# Fit an XGBoost classifier (grid-searched or with predetermined parameters),
# score it on train/test and record its test-set F1.
print('XgBoost Model')

if grid_search:
    param_grid = {
        'learning_rate': [x / 10 for x in range(2, 9, 1)],
        'max_depth': range(5, 30, 5),
        'min_child_weight': range(1, 3, 1),
        'subsample': [x / 10 for x in range(7, 11, 1)]
    }

    init_clf = xgb.XGBClassifier(
        n_estimators=300,
        use_label_encoder=False,
        objective='binary:logistic',
        n_jobs=-1, 
        verbosity=0
    )

    gscv = GridSearchCV(
        estimator=init_clf,
        param_grid=param_grid,
        scoring='f1',
        n_jobs=1,
        refit=True,
        cv=5,
        verbose=4
    )
    gscv.fit(x_train, y_train)

    bst = gscv.best_estimator_
    print(f'Best Score (f1): {gscv.best_score_}')
    print('Grid searched parameters:')
    display(gscv.best_params_)
else:
    bst = xgb.XGBClassifier(
        n_estimators=300,
        use_label_encoder=False,
        objective='binary:logistic', 
        learning_rate=0.3,
        max_depth=20,
        min_child_weight=1,
        subsample=0.8,
        n_jobs=-1, 
        verbosity=1
    )

    bst.fit(x_train, y_train)

# Prediction
# XGBClassifier.predict() returns hard class labels, so the probabilities must
# come from predict_proba(); column 1 is the positive class. Feeding labels
# into the ROC / average-precision metrics collapses the curves to one point.
y_train_pred_proba = bst.predict_proba(x_train)[:, 1]
y_test_pred_proba = bst.predict_proba(x_test)[:, 1]
y_train_pred = [int(p > 0.5) for p in y_train_pred_proba]
y_test_pred = [int(p > 0.5) for p in y_test_pred_proba]

y_train = list(train_df[target_col])
y_test = list(test_df[target_col])

score_model(bst, y_train, y_test, y_train_pred, y_test_pred, y_train_pred_proba, y_test_pred_proba)

xgb.plot_importance(bst)
f1_results.append(('XgBoost', metrics.f1_score(y_test, y_test_pred)))

In [None]:
# Random Forest
# Fit a random forest (predetermined parameters, or grid-searched when
# grid_search is True), score it and record its test-set F1.
print('Random Forest Model')

if not grid_search:
    # Predetermined hyper-parameters.
    rfc = RandomForestClassifier(
        n_estimators=300,
        criterion='gini',
        max_depth=15,
        min_samples_split=3,
        min_samples_leaf=1,
        max_features='sqrt',
        ccp_alpha=0.0,
        bootstrap=True,
        random_state=73,
        n_jobs=-1,
        verbose=1
    )
    rfc.fit(x_train, y_train)
else:
    param_grid = dict(
        criterion=['gini'],
        max_depth=[15, 20, 25],
        min_samples_split=[2, 3, 4],
        min_samples_leaf=[1, 2, 3],
        max_features=['sqrt', None],
        ccp_alpha=[step / 1000 for step in range(0, 25, 5)],
        max_samples=[0.9, None],
    )

    init_rfc = RandomForestClassifier(
        n_estimators=300,
        bootstrap=True,
        random_state=73,
        n_jobs=-1,
        verbose=0
    )

    # 5-fold search optimising F1; the best configuration is refit on all of x_train.
    gscv = GridSearchCV(init_rfc, param_grid, scoring='f1', n_jobs=1, refit=True, cv=5, verbose=4)
    gscv.fit(x_train, y_train)

    rfc = gscv.best_estimator_
    print(f'Best Score (f1): {gscv.best_score_}')
    print('Grid searched parameters:')
    display(gscv.best_params_)

# Prediction
# Column 1 of predict_proba is the positive class.
y_train_pred_proba = rfc.predict_proba(x_train)[:, 1]
y_test_pred_proba = rfc.predict_proba(x_test)[:, 1]
y_train_pred = rfc.predict(x_train)
y_test_pred = rfc.predict(x_test)

y_train = train_df[target_col].tolist()
y_test = test_df[target_col].tolist()

score_model(rfc, y_train, y_test, y_train_pred, y_test_pred, y_train_pred_proba, y_test_pred_proba)

# Feature importances, most important first.
rf_importances = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rfc.feature_importances_
})
display(rf_importances.sort_values(by='Importance', ascending=False))

rf_test_f1 = metrics.f1_score(y_test, y_test_pred)
f1_results.append(('Random Forest', rf_test_f1))

In [None]:
# Logistic Regression
# Fit a logistic regression (predetermined parameters, or grid-searched when
# grid_search is True), score it and record its test-set F1.
print('Logistic Regression Model')

if not grid_search:
    # Predetermined hyper-parameters.
    clf = LogisticRegression(
        tol=0.00005,
        max_iter=800,
        random_state=73,
        n_jobs=-1,
        verbose=1
    )
    clf.fit(x_train, y_train)
else:
    param_grid = dict(
        tol=[0.00005, 0.0001, 0.00015],
        max_iter=list(range(100, 1100, 100)),
    )

    init_clf = LogisticRegression(
        random_state=73,
        n_jobs=-1,
        verbose=0
    )

    # 5-fold search optimising F1; the best configuration is refit on all of x_train.
    gscv = GridSearchCV(init_clf, param_grid, scoring='f1', n_jobs=1, refit=True, cv=5, verbose=4)
    gscv.fit(x_train, y_train)

    clf = gscv.best_estimator_
    print(f'Best Score (f1): {gscv.best_score_}')
    print('Grid searched parameters:')
    display(gscv.best_params_)

# Prediction
# Column 1 of predict_proba is the positive class.
y_train_pred_proba = clf.predict_proba(x_train)[:, 1]
y_test_pred_proba = clf.predict_proba(x_test)[:, 1]
y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test)

y_train = train_df[target_col].tolist()
y_test = test_df[target_col].tolist()

score_model(clf, y_train, y_test, y_train_pred, y_test_pred, y_train_pred_proba, y_test_pred_proba)

display(clf.get_params())

# One coefficient per feature, in feature_cols order.
lr_coefficients = pd.DataFrame({
    'Feature': feature_cols,
    'Coefficient': clf.coef_[0]
})
display(lr_coefficients)

lr_test_f1 = metrics.f1_score(y_test, y_test_pred)
f1_results.append(('Logistic Regression', lr_test_f1))

In [None]:
# Rank the (model name, F1) pairs from highest to lowest score, updating the
# list in place, then show the leaderboard.
f1_results[:] = sorted(f1_results, key=lambda entry: entry[1], reverse=True)
display(f1_results)