In [None]:
'''General Header for Python Operations'''
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Apply the 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')
# Use the fully qualified option name: the bare 'precision' pattern is
# ambiguous on pandas versions that also define 'styler.format.precision'
# and raises OptionError there. 'display.precision' names the same option.
pd.set_option('display.precision', 3)
# Match the numpy array repr to the same three decimal places.
np.set_printoptions(precision=3)

def hide_warnings():
    """Install a blanket 'ignore' filter so warnings are no longer shown.

    Returns:
        None
    """
    from warnings import filterwarnings

    filterwarnings('ignore')

In [None]:
# Silence all warnings from this point on (installs an 'ignore' filter).
hide_warnings()

In [None]:
# Import Dataset
# Path is relative to the notebook's working directory.
DATASET_CSV = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
raw_df = pd.read_csv(DATASET_CSV)

In [None]:
# Descriptive Stats and Info
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
raw_df.info()
# Missing-value count per column.
display(raw_df.isna().sum())
# Summary statistics of the numeric columns.
display(raw_df.describe())

In [None]:
# Cast Categorical Data as Categorical Datatype and drop ID
cat_data = ['gender','hypertension','heart_disease','ever_married',
            'work_type','Residence_type','smoking_status','stroke']
# drop() and astype() each return a new frame, so raw_df is left untouched
# and no in-place mutation is needed.
df = (
    raw_df
    .drop(columns='id')
    .astype({col: 'category' for col in cat_data})
)
# info() prints and returns None; calling it bare avoids displaying "None".
df.info()
display(df.head())

In [None]:
# Check Balance of Stroke Indicator
# 'stroke' is stored as a category; cast back to int so it can be summed.
strokeStrat = df.stroke.astype(int).sum()
print(f'Number of Stroke Labels {strokeStrat}')
# Divide by the actual row count rather than a hard-coded dataset size.
print(f'Percent Data Stroke {strokeStrat/len(df)*100:0.2f} %')

Only 5% of the data is labeled as having a stroke. So if a classifier just guessed no stroke regardless of input, it would be correct 95% of the time using the full dataset. 

### Examine Numeric Features


In [None]:
# Visualize Numeric Data Distributions for Labels
num_data = ['age','avg_glucose_level','bmi']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# One box plot per numeric column, each split by the stroke label.
box_cols = [*num_data, 'stroke']
raw_df[box_cols].boxplot(by='stroke', ax=axes, return_type='axes')
plt.tight_layout()

Age looks like a very good feature; glucose level is also interesting; BMI perhaps not as much.

### Examine Categorical Features

In [None]:
# Check Unique Values in each Column
for col in cat_data:
    # Frequency of every distinct value in this categorical column.
    counts = raw_df[col].value_counts()
    print(f'\n==== {col}')
    print(counts)

We can throw out the "Other" category in gender to make the variable binary, since there is only one entry for it. Smoking status "Unknown" is a problem, as it is really a NaN entry and makes up almost a third of the entries. We examine further...

In [None]:
# Check number of stroke victims in smoking status unknown
# Boolean mask of rows whose smoking status is the literal 'Unknown'.
ssUnknown = raw_df['smoking_status'].eq('Unknown')
print('Smoking Status Unknown Stroke Counts')
raw_df.loc[ssUnknown, 'stroke'].value_counts()

About 20% of our stroke victims are in the unknown category. For now, we will continue under the assumption that "Unknown" constitutes its own category rather than a NaN.

### Cull Other From Gender

In [None]:
# Boolean mask that keeps every row whose gender is not 'Other'.
clean_idx = raw_df['gender'].ne('Other')
print(clean_idx.value_counts())
# Gender counts among the rows that survive the mask.
print(raw_df.loc[clean_idx, 'gender'].value_counts())

### Build Pipeline for Data Preparation

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler,LabelBinarizer, LabelEncoder
from sklearn.impute import SimpleImputer

# Build Selector Class
class DataFrameSelector(BaseEstimator,TransformerMixin):
    """Pipeline step that extracts the configured column(s) from a DataFrame.

    From Hands-On Machine Learning with Sklearn and Tensorflow.

    Parameters
    ----------
    column_names : a column label or list of labels used to index the frame.
    """

    def __init__(self,column_names):
        self.column_names = column_names

    def fit(self,X,y=None):
        """Nothing is learned; return the selector itself."""
        return self

    def transform(self,X,y=None):
        """Return the ``.values`` of ``X`` indexed by ``column_names``."""
        selected = X[self.column_names]
        return selected.values

# Build Binarizer Class
class CustomBinarizer(BaseEstimator,TransformerMixin):
    """Pipeline-compatible wrapper around ``LabelBinarizer`` for one column.

    The class mapping is learned in ``fit`` and reused by ``transform``, so
    data transformed later is encoded with the same mapping as the data the
    step was fitted on. ``fit_transform`` (inherited from ``TransformerMixin``)
    therefore yields the same result as fitting and transforming in one go.
    """

    def __init__(self):
        self.encoder = LabelBinarizer()

    def fit(self,X,y=None):
        """Learn the label classes from ``X`` and return ``self``."""
        self.encoder.fit(X)
        return self

    def transform(self,X,y=None):
        """Encode ``X`` with the fitted mapping as a single column.

        The result is reshaped to ``(-1, 1)``; this presumes a two-class
        column (one indicator per row) -- verify before using it on a column
        with more than two classes.
        """
        return self.encoder.transform(X).reshape(-1,1)

# Build Encoder Class
class CustomEncoder(BaseEstimator,TransformerMixin):
    """Pipeline-compatible wrapper around ``LabelEncoder`` for one column.

    The label-to-integer mapping is learned in ``fit`` and reused by
    ``transform``, so later data receives the same codes as the data the
    step was fitted on instead of a freshly re-learned mapping.
    """

    def __init__(self):
        self.encoder = LabelEncoder()

    def fit(self,X,y=None):
        """Learn the label classes from ``X`` and return ``self``."""
        self.encoder.fit(X)
        return self

    def transform(self,X,y=None):
        """Encode ``X`` with the fitted mapping, reshaped to one column."""
        return self.encoder.transform(X).reshape(-1,1)
    
# Numeric data: select the numeric columns, fill NaNs (BMI) with the
# column median, then standardize.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_data)),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Label Encoder Data (Non-binary categories): one small pipeline per column
enc_data = ['work_type','smoking_status']
enc_pipes = [
    (col, Pipeline([
        ('selector', DataFrameSelector(col)),
        ('encoder', CustomEncoder()),
    ]))
    for col in enc_data
]

# Label Binarizer Data: every remaining categorical column
bin_data = [col for col in cat_data if col not in enc_data]
bin_pipes = [
    (col, Pipeline([
        ('selector', DataFrameSelector(col)),
        ('binarizer', CustomBinarizer()),
    ]))
    for col in bin_data
]

# Final Pipeline: numeric block first, then encoded, then binarized columns
t_list = [('numerics', num_pipeline), *enc_pipes, *bin_pipes]
prep_pipeline = FeatureUnion(transformer_list=t_list)


In [None]:
# Prepare Dataset
# Column order mirrors the transformer order in the FeatureUnion.
prepared_cols = [*num_data, *enc_data, *bin_data]
# Only the rows kept by the gender mask are transformed.
prepared_data = prep_pipeline.fit_transform(raw_df[clean_idx])
print(prepared_data.shape)
print(prepared_cols)
prepared_data

### Separate Dataset for Model Construction

In [None]:
from sklearn.model_selection import train_test_split

# Separate Targets and Features
# Locate the target by name instead of a hard-coded column index, and make
# sure it is the last column, as the feature slice below relies on that.
TARGET_COL = 'stroke'
target_idx = prepared_cols.index(TARGET_COL)
assert target_idx == len(prepared_cols) - 1, 'target must be the last prepared column'
X = prepared_data[:, :target_idx]
y = prepared_data[:, target_idx]
print(prepared_cols[:target_idx])
print(f'X shape: {X.shape}')
print(prepared_cols[target_idx])
print(f'y shape: {y.shape}')
print()

# Stratified test train split
# Seed numpy's global RNG so the split is repeatable on re-run.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.15)
print(f'X Train {X_train.shape}')
print(f'X Test {X_test.shape}')
print(f'y Train {y_train.shape}')
print(f'y Test {y_test.shape}')

In [None]:
# Check Stratification Original Data was ~ 5% stroke victims
# The mean of the 0/1 labels is the share of stroke cases in each set.
for set_name, labels in (('Full Dataset', y), ('Test Set', y_test), ('Train Set', y_train)):
    print(f'{set_name} Stroke Victims {labels.mean()*100:0.2f}%')

In [None]:
# Undersample the majority class to correct the label imbalance
from imblearn.under_sampling import RandomUnderSampler
# Seed numpy's global RNG before resampling so the draw is repeatable.
np.random.seed(42)
resample = RandomUnderSampler()
X_train_smpl, y_train_smpl = resample.fit_resample(X_train,y_train)
for set_name, arr in (('X', X_train_smpl), ('y', y_train_smpl)):
    print(f'{set_name} Train Undersample {arr.shape}')
print()
print(f'y Train Undersample Set Stroke Victims {y_train_smpl.mean()*100:0.2f}%')


### Feature Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Every prepared column except the last one (the target) is a candidate.
features = prepared_cols[:-1]

# Select with Forward Feature Selection and Logistic Regression
lgr = LogisticRegression(penalty ='none')
k_range = (1, len(features))
sfs = SequentialFeatureSelector(lgr, k_features=k_range, forward=True,
                                scoring='roc_auc', cv=30)
sfs.fit(X_train_smpl,y_train_smpl,custom_feature_names=features)
# One row per subset size evaluated by the selector.
sfs_metrics = sfs.get_metric_dict()
pd.DataFrame.from_dict(sfs_metrics).T

In [None]:
# Select Features from Random Forest Classifier
rfc = RandomForestClassifier()
k_range = (3, len(features))
sfs = SequentialFeatureSelector(rfc, k_features=k_range, forward=True,
                                scoring='roc_auc', cv=10)
sfs.fit(X_train_smpl,y_train_smpl,custom_feature_names=features)
# One row per subset size evaluated by the selector.
sfs_metrics = sfs.get_metric_dict()
pd.DataFrame.from_dict(sfs_metrics).T

In [None]:
from xgboost import XGBClassifier
# Select Features from XGBoost
# NOTE(review): this selector scores with 'accuracy' whereas the other three
# selectors use 'roc_auc' -- confirm that the difference is intended.
xgbc = XGBClassifier(eval_metric='logloss')
k_range = (3, len(features))
sfs = SequentialFeatureSelector(xgbc, k_features=k_range, forward=True,
                                scoring='accuracy', cv=10)
sfs.fit(X_train_smpl,y_train_smpl,custom_feature_names=features)
# One row per subset size evaluated by the selector.
sfs_metrics = sfs.get_metric_dict()
pd.DataFrame.from_dict(sfs_metrics).T

In [None]:
from sklearn.svm import SVC
# Select Features from Support Vector Machine (linear kernel)
svc = SVC(kernel='linear')
k_range = (3, len(features))
sfs = SequentialFeatureSelector(svc, k_features=k_range, forward=True,
                                scoring='roc_auc', cv=10)
sfs.fit(X_train_smpl,y_train_smpl,custom_feature_names=features)
# One row per subset size evaluated by the selector.
sfs_metrics = sfs.get_metric_dict()
pd.DataFrame.from_dict(sfs_metrics).T

### Build Unoptimized Models for Selection

In [None]:
from sklearn.metrics import roc_curve, RocCurveDisplay, roc_auc_score,\
confusion_matrix, ConfusionMatrixDisplay
# Column positions of the features used for logistic regression
lgr_idx = (0, 3, 6)

# Logistic Regression
# Train on the undersampled set; predict on the untouched test set.
lgr = LogisticRegression(penalty='none').fit(X_train_smpl[:,lgr_idx],y_train_smpl)
lgr_pred = lgr.predict(X_test[:,lgr_idx])

def mcr(y_true,y_pred):
    """Misclassification rate: the mean of ``y_true != y_pred``."""
    mismatches = y_true != y_pred
    return np.mean(mismatches)

def clfr_perfomance(y_true,y_pred):
    """Print and plot performance for one set of predictions.

    Prints the ROC AUC and the misclassification rate computed from the
    arguments, then draws the confusion matrix (left) and ROC curve (right).

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels or scores for the same samples, passed
        unchanged to ``roc_auc_score``, ``confusion_matrix`` and ``roc_curve``.
    """
    print(f'Area under curve: {roc_auc_score(y_true,y_pred):0.4f}')
    # Use the arguments here, not notebook globals, so each model reports
    # its own misclassification rate.
    print(f'Misclassification Rate: {mcr(y_true,y_pred)*100:0.2f}%')
    # Plot Results
    f,(ax1,ax2) = plt.subplots(nrows=1,ncols=2, figsize=(15,7))
    cm = confusion_matrix(y_true,y_pred)
    ConfusionMatrixDisplay(cm).plot(ax=ax1)
    ax1.tick_params(axis='both', which='both',length=0)
    # Turn the grid off explicitly on the confusion matrix; the positional
    # form works regardless of how the visibility keyword is named.
    ax1.grid(False)
    fpr, tpr, _ = roc_curve(y_true,y_pred)
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax2)

clfr_perfomance(y_test,lgr_pred)


In [None]:
# Random Forest
# Column positions of the best features for random forest
rfc_idx = (0, 4, 7, 8)

# Train on the undersampled set; predict on the untouched test set.
rfc = RandomForestClassifier(random_state=42).fit(X_train_smpl[:,rfc_idx],y_train_smpl)
rfc_pred = rfc.predict(X_test[:,rfc_idx])

clfr_perfomance(y_test,rfc_pred)

In [None]:
# XGBoost
# Column positions of the features used for XGBoost
xgb_idx = (0, 7, 8)

# Train on the undersampled set; predict on the untouched test set.
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train_smpl[:,xgb_idx],y_train_smpl)
xgb_pred = xgb.predict(X_test[:,xgb_idx])

clfr_perfomance(y_test,xgb_pred)