In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import datasets, neighbors, linear_model
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from xgboost import plot_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_transformer
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import SMOTE 

import warnings  
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw stroke dataset from the Kaggle input directory and preview its first rows.
df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
df.head()

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
# Work on an independent copy of the raw data, excluding rows whose gender is 'Other'.
# The trailing .copy() makes df_ex its own frame, so column assignments made on it
# later do not operate on a filtered selection of df.
df_ex = df[df.gender != 'Other'].copy()

# Count the positive cases directly as integers. Deriving the count back from the
# floating-point percentage and formatting it with %d could truncate it by one.
n_total = df_ex.shape[0]
n_pos = int((df_ex.stroke == 1).sum())
fp = 100 * n_pos / n_total
print("The data has %f%% positive sample or a total of %d positive cases out of %d total cases" 
      % (fp, n_pos, n_total))

Let us plot the distributions of the numerical variables, split by gender and by stroke outcome, to check that the two gender groups behave similarly.

In [None]:
def splt1(ft, data, bins, lsc):
    """Show a 2x2 grid of histograms of one column, split by gender and stroke.

    Rows are the genders ('Male' on top, 'Female' below); columns are the stroke
    outcome (0 on the left in green, 1 on the right in red). Each panel is a
    probability-normalised histogram with a KDE curve overlaid.

    Parameters
    ----------
    ft : str
        Name of the column of `data` to plot; its title-cased form labels the x-axis.
    data : DataFrame
        Must contain the columns 'gender', 'stroke' and `ft`.
    bins
        Forwarded to ``sns.histplot`` as ``bins``.
    lsc
        Forwarded to ``sns.histplot`` as ``log_scale``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    rows = ['Male', 'Female']
    stroke_colors = {0: 'green', 1: 'red'}

    fig, axs = plt.subplots(2, 2, sharey = True, figsize = (8,8))
    for r, gender in enumerate(rows):
        for c, stroke in enumerate((0, 1)):
            panel = axs[r, c]
            shade = stroke_colors[stroke]
            in_group = (data['gender'] == gender) & (data['stroke'] == stroke)

            sns.histplot(data = data,
                         x = data[in_group][ft],
                         bins = bins,
                         stat = 'probability',
                         kde = True,
                         log_scale = lsc,
                         line_kws = {"color": shade, "lw": 3},
                         color = shade,
                         label = 'stroke = ' + str(stroke),
                         ax = panel)

            panel.tick_params(axis = 'x', which = 'major',
                              labelsize = 14, pad = 10)
            panel.set_yticks([])
            panel.legend(prop = {'size': 12})

            # Only the bottom row keeps its x-axis label and ticks.
            if r == 1:
                panel.set_xlabel(ft.title(), fontsize = 14)
            else:
                panel.set_xlabel(None)
                panel.set_xticks([])

            # Only the left column carries the gender as its y-axis label.
            if c == 0:
                panel.set_ylabel(gender.title(), fontsize = 14)
            else:
                panel.set_ylabel(None)

    plt.tight_layout()
    plt.show()

In [None]:
# Age distributions per gender and stroke outcome (30 bins, no log scaling).
splt1('age', df_ex, 30, False)

In [None]:
# BMI distributions per gender and stroke outcome (30 bins, no log scaling).
# Missing 'bmi' values have not been imputed yet at this point in the notebook.
splt1('bmi', df_ex, 30, False)

In [None]:
# Average glucose level distributions per gender and stroke outcome (30 bins, no log scaling).
splt1('avg_glucose_level', df_ex, 30, False)

The distributions of the numerical variables look similar across the two gender groups.  
Note: the bottom row of the subplots should show "Female" as its y-label; this renders correctly when I run the notebook on my laptop, even if it does not appear here.

In [None]:
# Correlation heatmap of the numeric columns. The first column is skipped
# (presumably the 'id' column -- confirm against df.head()).
# Numeric dtypes are selected explicitly: df_ex still holds string-valued
# categorical columns here, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently dropping it.
fig, ax = plt.subplots(figsize = (7,7))
numeric_part = df_ex.iloc[:,1:].select_dtypes(include = 'number')
sns.heatmap(numeric_part.corr(), annot=True, ax = ax)
plt.show()

Let us mark the categorical variables

In [None]:
# Names of the object-dtype columns of df_ex, used as the categorical variables below.
cat_cols = df_ex.select_dtypes(include = ['object']).columns.values
print("The categorical variables are\n%s" % (cat_cols,))

We change the smoking_status of "formerly smoked" to "smokes" and replace the missing 'bmi' values with the median

In [None]:
# Fold 'formerly smoked' into 'smokes' and fill missing BMI values with the
# column median, producing both cleaned columns in a single step.
df_ex = df_ex.assign(
    smoking_status = df_ex['smoking_status'].replace(['formerly smoked'], 'smokes'),
    bmi = df_ex['bmi'].fillna(df_ex['bmi'].median()),
)

Next we define the feature matrix, X (dropping 'id' and 'stroke') and the target, Y ('stroke'). The categorical variable values are transformed to numerical by applying LabelEncoder 

In [None]:
# Target vector: the 'stroke' column.
Y = df_ex['stroke']
# Feature matrix: every column except the identifier and the target.
X = df_ex.drop(columns = ['id', 'stroke'])

# Replace each categorical column by integer codes. One encoder object is reused
# and refit per column, so afterwards it only remembers the last column's classes.
lb_enc = LabelEncoder()
for col in cat_cols:
    X[col] = lb_enc.fit_transform(X[col])

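This is a case of *imbalanced classification*, so we resample the data to obtain a balanced distribution of the target (stroke = 0 / 1). We will compare random under-sampling of the majority class (**RandomUnderSampler**) with synthetic over-sampling of the minority class (**SMOTE**). Note that the resampling below is applied to the full dataset before the train/test split, so both the train and the test sets are balanced; the scores reported later therefore describe performance on resampled data, not on the original class distribution.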

In [None]:
r_state = 0  # random seed passed to the sampler below via random_state
rand_us = RandomUnderSampler(random_state = r_state)
# NOTE(review): the sampler is applied to the full X, Y rather than to a training
# subset, so any test set split off from X_urs / Y_urs is taken from resampled data.
X_urs, Y_urs = rand_us.fit_resample(X, Y)
# Class counts after under-sampling.
print(Y_urs.value_counts())

In [None]:
rand_smote = SMOTE(random_state = r_state)
# NOTE(review): SMOTE is applied to the full X, Y rather than to a training subset,
# so any test set split off from X_ors / Y_ors can contain synthetic samples.
X_ors, Y_ors = rand_smote.fit_resample(X, Y)
# Class counts after over-sampling.
print(Y_ors.value_counts())

We will evaluate the models using the f1-score, while keeping track of precision & recall scores and the confusion matrix. Let us split the data into train & test sets, and define the model evaluation

In [None]:
# Hold out 25% of each resampled dataset for testing; both splits share one seed.
split_opts = dict(test_size = 0.25, random_state = r_state)

# Under-sampled data.
X_train_urs, X_test_urs, Y_train_urs, Y_test_urs = train_test_split(X_urs, Y_urs, **split_opts)

# SMOTE over-sampled data.
X_train_ors, X_test_ors, Y_train_ors, Y_test_ors = train_test_split(X_ors, Y_ors, **split_opts)

def m_run(model, _name, X_train, Y_train, X_test, Y_test):
    """Fit `model`, report its scores on the test split and plot its confusion matrix.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    _name : str
        Label printed as the classifier type.
    X_train, Y_train
        Training features and binary targets.
    X_test, Y_test
        Held-out features and binary targets used for scoring and for the plot.

    Returns
    -------
    None. Prints the F1, precision, recall (binary averaging) and accuracy scores
    and draws the confusion matrix of `model` on the test split.
    """
    model.fit(X_train, Y_train)
    m_pred = model.predict(X_test)
    m_acc = accuracy_score(Y_test, m_pred)
    m_prc = precision_score(Y_test, m_pred, average = 'binary')
    m_rec = recall_score(Y_test, m_pred, average = 'binary')
    m_f1s = f1_score(Y_test, m_pred, average = 'binary')
    print("Classifier Type: %s" % _name)
    print("\n\
    F1 score: %f\n\
    Precision score: %f\n\
    Recall score: %f\n\
    Accuracy score: %f" % (m_f1s, m_prc, m_rec, m_acc))
    # Plot the matrix for the estimator that was just evaluated. This used to
    # reference the notebook-level `xgb` variable, which plotted the wrong model
    # for any other estimator and failed when `xgb` was not defined.
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
    # ConfusionMatrixDisplay.from_estimator is its replacement.
    plot_confusion_matrix(model, X_test, Y_test)

In [None]:
# Draw the learning rate from 0.10-0.49 using a generator seeded with r_state, so
# every run of the notebook trains the same model. The earlier unseeded draw made
# the reported scores change between runs; a random rate does not by itself
# guarantee reaching a global minimum.
lr_rng = np.random.RandomState(r_state)
xgb = XGBClassifier(learning_rate = lr_rng.randint(10,50)/100,
                    n_estimators = 200, 
                    max_depth = 10,
                    objective='binary:logistic', eval_metric = 'auc',
                    random_state = r_state)

In [None]:
# Fit and evaluate xgb on the under-sampled train/test split.
m_run(xgb, "XGBoost with random under sampling", X_train_urs, Y_train_urs, X_test_urs, Y_test_urs)

In [None]:
# Fit the same xgb object again, this time on the SMOTE over-sampled split, and evaluate it.
m_run(xgb, "XGBoost with random over sampling", X_train_ors, Y_train_ors, X_test_ors, Y_test_ors)

Lastly, we would like to find out which features are most important in our model.

In [None]:
# Fit xgb on the SMOTE training split before plotting, so the importances shown
# belong to that model regardless of what xgb was last fitted on.
_train = xgb.fit(X_train_ors, Y_train_ors)
plot_importance(xgb)
plt.show()

## Conclusion:
The XGBoost classifier coupled with SMOTE over-sampling provides an F1-score of 0.96, predicting the occurrence of stroke with ~97% accuracy and the negative cases with 95% accuracy. These scores are measured on a test set drawn from the SMOTE-resampled data, so they should be read as performance on balanced data. The key features affecting the prediction are (1) avg_glucose_level, (2) age, and (3) bmi. 

From the exploratory analysis, the data distributions for both genders appear quite similar. However, it might be useful to segment the data by age group and run the classifier on each group. First, predicting the vulnerable cases in a younger age group (say, < 50) would improve the usefulness of the model, because people in that age group may not go for regular health check-ups. Second, it might increase the accuracy of the model when applied to each of the age groups.