In [None]:
import numpy as np, pandas as pd, xgb_wrapper as xgbw, lgbm_wrapper as lgbmw, project_tools as project, seaborn as sns, matplotlib.pyplot as plt, xgboost as xgb
import sklearn, imblearn
import sklearn.compose, sklearn.naive_bayes
from pprint import pprint as display
from tqdm.notebook import tqdm
import warnings
# use seaborn's 'dark' style for the figures drawn in this notebook
sns.set_style('dark')

In [None]:
# NOTE(review): this installs a blanket 'ignore' filter, so deprecation and
# convergence warnings from the libraries below are hidden as well — consider
# narrowing it to specific warning categories
warnings.filterwarnings('ignore')

<a name='start'></a>
# Stroke Prediction

Before exploring any of the data, I have decided to conduct some brief research into the medical condition and its risk factors.

*A stroke is a serious medical condition where the blood flow to part of the brain is cut off.*

Some of the lifestyle-related risk factors for stroke are:
- smoking
- high blood pressure
- high blood cholesterol levels
- obesity
- a diet high in saturated fats and salt...
    - ... and low in fruit, fibre and vegetables
- diabetes
- heavy alcohol intake
- insufficient regular exercise
- heart disorders like coronary heart disease

This will tell us what to expect in our data, for example people who smoke and drink alcohol heavily will have a higher chance of having a stroke.

With this classification problem, a false negative is far more dangerous than a false positive.

<a name='obs'></a>
## Data Observation

In [None]:
# load the raw dataset, using the `id` column as the row index
data = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv',
    index_col='id',
)

# hold out half of the rows for the final evaluation; stratifying on the target
# keeps the stroke / no-stroke ratio the same in both halves
train_full, test_full = sklearn.model_selection.train_test_split(
    data,
    test_size=0.5,
    random_state=1,
    stratify=data['stroke'],
)

In [None]:
# separate the held-out features from the held-out target
test = test_full.drop('stroke', axis=1)
test_y = test_full['stroke']

In [None]:
# explore a copy so that train_full itself is never modified
train = train_full.copy()

In [None]:
# column dtypes and non-null counts of the training split
train.info()

There are 10 features in the dataset, and the features relevant to the risk factors mentioned earlier are: 
- `bmi`
- `avg_glucose_level`
- `heart_disease`
- `smoking_status`
- `hypertension` (aka high blood pressure)

In [None]:
# summary statistics of the training split
train.describe()

There are some missing values in the `bmi` column.

In [None]:
# feature groups: each list routes its columns to the matching plots and
# preprocessing branch further down
numerical = [
    'age',
    'bmi',
    'avg_glucose_level',
]
binary = [
    'hypertension',
    'heart_disease',
]
categorical = [
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
    'gender',
]

<a name='vis'></a>
## Data Visualisation with Seaborn and Matplotlib

### Univariate

In [None]:
# one count plot per categorical/binary column plus the target, laid out on a 4x2 grid
fig, axs = plt.subplots(4, 2, figsize=(16*2, 9*4))
for ax, col in zip(axs.flat, categorical + binary + ['stroke']):
    sns.countplot(x=train[col], ax=ax)
plt.show()

More people have been married than not, more people live in an urban area than a rural area, more people do not have hypertension or heart disease, and most of the people have never smoked and have private jobs. Also, more people in the dataset are female than male.

The binary target is extremely skewed, so stratified cross validation would be the best choice - oversampling could be used.

In [None]:
# pair plot of the numerical features (histogram + KDE on the diagonal);
# drawn from the training split only, like every other exploratory plot here,
# so the held-out test rows are not inspected during exploration
g = sns.pairplot(data=train[numerical], diag_kind='hist', kind='scatter', diag_kws={'kde': True}, aspect=16/9, height=18)

The distributions of the `bmi` and `avg_glucose_level` features are right-tailed, whereas the distribution of the `age` feature is left-tailed.

### Bivariate

In [None]:
# mean stroke rate per level of each categorical/binary column
# (seven columns on a 4x2 grid, so the last panel stays empty)
fig, axs = plt.subplots(4, 2, figsize=(16*2, 9*4))
for ax, col in zip(axs.flat, categorical + binary):
    sns.barplot(x=train[col], y=train['stroke'], ax=ax)
plt.show()

According to the data, a person's chances of having a stroke are higher if:
- they have heart disease
- they live in an urban area
- they are male
- they have high blood pressure
- they have formerly smoked or are presently smoking
- they have been married
- they are self employed

In [None]:
# logistic fit of the stroke outcome against each numerical feature
# (three features on a 2x2 grid, so the last panel stays empty)
fig, axs = plt.subplots(2, 2, figsize=(16*2,9*2))
for ax, col in zip(axs.flat, numerical):
    # ci=None is seaborn's documented way to disable the confidence band;
    # ci=False is not None, so the bootstrap refits would still be computed
    sns.regplot(x=train[col], y=train['stroke'], ax=ax, logistic=True, ci=None)
plt.show()

The logistic regression plots show that the chances of having a stroke increase with the increase of `avg_glucose_level`, `age` and `bmi`.

<a name='prep'></a>
## Data Preprocessing with Scikit-Learn

In [None]:
from sklearn.experimental import enable_iterative_imputer

In [None]:
# categorical branch: one-hot encode, with handle_unknown='ignore' so categories
# not seen during fit do not raise at transform time
ctrans = sklearn.pipeline.Pipeline(steps=[
    ('encode', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# numerical branch: impute missing values, then apply a power transform
ntrans = sklearn.pipeline.Pipeline(steps=[
    ('impute', sklearn.impute.IterativeImputer()),
    ('transfom', sklearn.preprocessing.PowerTransformer()),
])

In [None]:
# route each feature group to its branch; binary flags are passed through unchanged,
# and so is any column not named in the three groups
preprocess = sklearn.compose.ColumnTransformer(
    transformers=[
        ('categorical', ctrans, categorical),
        ('binary', 'passthrough', binary),
        ('numerical', ntrans, numerical),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [None]:
# SMOTE oversampler used to balance the stroke classes.
# n_jobs is no longer passed: it is deprecated on SMOTE and rejected by newer
# imbalanced-learn releases, and it only affected parallelism, not the result.
# NOTE(review): SMOTE interpolates between neighbours, so on integer-coded
# categorical columns it can produce in-between codes that are not real
# categories — SMOTENC is the variant meant for mixed data; confirm and switch.
resampler = imblearn.over_sampling.SMOTE(sampling_strategy='all', random_state=1)

In [None]:
# features and target of the training split, as input for the resampler
df_X = train.drop('stroke', axis=1)
df_y = train.stroke

In [None]:
# replace every categorical/binary column with integer codes
# (factorize numbers the values in order of first appearance)
# NOTE(review): these codes exist only in df_X — any frame later passed to a model
# fitted on this data needs the same encoding; verify the test split gets it
df_X[categorical+binary] = df_X[categorical+binary].apply(
    lambda column: column.factorize()[0]
)

In [None]:
# replace every missing value in df_X with 0 (presumably so the resampler gets a NaN-free frame)
# NOTE(review): the notebook reports missing values in `bmi`; filling with 0 here means
# data derived from df_X carries no NaN for the IterativeImputer step to impute — confirm intended
df_X = df_X.fillna(0)

In [None]:
# oversample the training data, then glue features and target back into one frame
X_resampled, y_resampled = resampler.fit_resample(df_X, df_y)
train_balanced = pd.concat([X_resampled, y_resampled], axis=1)

In [None]:
# display the resampled training frame
train_balanced

In [None]:
# Scratch cross-check: compute the stroke rate per `ever_married` level in three
# ways (boolean index on raw arrays, pandas groupby, groupby on a rebuilt frame).
# raw values of the categorical columns, in the order of the `categorical` list
arr = train[categorical].values

In [None]:
# column 0 is 'ever_married' (first entry of `categorical`)
arr[:, 0]

In [None]:
# positions of the rows whose ever_married value is 'No'
index = np.where(arr[:, 0] == 'No')

In [None]:
# target values as a plain array, aligned row-for-row with arr
arry = train['stroke'].values

In [None]:
# reference result: mean stroke rate per ever_married level via pandas
train[['stroke', 'ever_married']].groupby('ever_married').mean()

In [None]:
# append the target as a sixth column and keep only ever_married (0) and stroke (5)
s = pd.DataFrame(np.concatenate([arr, arry.reshape(-1, 1)], axis=1))[[0, 5]]

In [None]:
# cast the stroke column back to int (presumably it comes out of the concatenation
# as a non-numeric dtype — TODO confirm)
s[5] = s[5].astype(int)

In [None]:
# same per-level mean as the groupby above, computed on the rebuilt frame
s.groupby(0).mean()

In [None]:
# stroke rate among the ever_married == 'No' rows, straight from the arrays
np.mean(arry[index])

## Dimensionality Reduction - PCA

In [None]:
# project the preprocessed, balanced training data onto two principal components
X = train_balanced.drop('stroke', axis=1)
y = train_balanced['stroke']

pca = sklearn.pipeline.make_pipeline(
    preprocess,
    sklearn.decomposition.PCA(n_components=2, random_state=1),
)
X_red = pca.fit_transform(X)

# rebuild a frame (components in columns 0 and 1) aligned with the target by index
reduced_X = pd.DataFrame(X_red, index=X.index)
reduced_train = pd.concat([reduced_X, y], axis=1)

# scatter of the two components, coloured by the stroke label
plt.figure(figsize=(32, 18))
sns.scatterplot(data=reduced_train, x=1, y=0, hue='stroke')
plt.show()

<a name='tune'></a>
## Hyperparameter Tuning 

In [None]:
# 1000-row subsample of the balanced data used for the hyperparameter search;
# random_state makes the subsample, and therefore everything tuned on it,
# reproducible (the global NumPy seed is only set in a later cell)
tune_train = train_balanced.sample(n=1000, random_state=1)

In [None]:
# features and target of the tuning subsample
X = tune_train.drop('stroke', axis=1)
y = tune_train.stroke

In [None]:
# seed the global NumPy generator, then draw 100 uniform values in [0, 1)
# that serve as the candidate pool for the continuous hyperparameters
np.random.seed(0)
dist = np.random.uniform(0, 1, 100)

In [None]:
# Candidate models and the hyperparameter values sampled for each of them.
# Each entry holds the estimator under 'model' and its search space under 'params'.
models = {
    'logistic': {
        'model': sklearn.linear_model.LogisticRegression(penalty='l2', solver='liblinear', class_weight='balanced', random_state=1),
        'params': {
            'C': dist}},
    'random-forest': {
        'model': sklearn.ensemble.RandomForestClassifier(random_state=1),
        'params': {'bootstrap': [True, False],
                   'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                   # 'auto' is no longer accepted by current scikit-learn and used to be an
                   # alias of 'sqrt' for classifiers, so it is replaced by a distinct option
                   'max_features': ['sqrt', 'log2'],
                   'min_samples_leaf': [1, 2, 4],
                   'min_samples_split': [2, 5, 10],
                   'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}},
    'xgb': {
        'model': xgbw.get_wrapper('XGBClassifier', random_state=1, booster='gbtree', n_estimators=1000, learning_rate=0.01, use_label_encoder=False),
        'params': {
            'reg_alpha': dist,
            'reg_lambda': dist,
            'gamma': dist,
            'min_child_weight': dist,
            'max_depth': [3, 4, 5, 6, 7]}},
    'lgbm': {
        'model': lgbmw.get_wrapper('LGBMClassifier', random_state=1, n_estimators=1000, learning_rate=0.01),
        'params': {
            'reg_alpha': dist,
            'reg_lambda': dist,
            'min_split_gain': dist,
            'min_child_weight': dist,
            'max_depth': [3, 4, 5, 6, 7]}}}

In [None]:
# Randomised hyperparameter search for every candidate model on the tuning sample.
# The best estimator of each search is wrapped in a fresh preprocess + model pipeline.
# NOTE(review): `preprocess` sits in front of the search, so it is fitted on all of X
# before the inner cross-validation splits are made — confirm this is acceptable.
model_best = {}
for name, model in tqdm(models.items()):
    clf = model['model']
    pdist = model['params']
    # random_state pins the sampled candidates, so re-running this cell on its own
    # repeats the same search instead of depending on the global NumPy RNG state
    pipeline = sklearn.pipeline.make_pipeline(preprocess, sklearn.model_selection.RandomizedSearchCV(estimator=clf, param_distributions=pdist, cv=10, n_iter=30, scoring='f1_weighted', n_jobs=-1, random_state=1))
    pipeline.fit(X, y)
    model_best[name] = sklearn.pipeline.make_pipeline(preprocess, pipeline['randomizedsearchcv'].best_estimator_)

In [None]:
# switch from the tuning subsample back to the full balanced training data
X, y = train_balanced.drop('stroke', axis=1), train_balanced.stroke

In [None]:
# 10-fold cross-validation of every tuned pipeline on the balanced training data,
# collecting the weighted F1 and the (negated) false-negative rate per fold
scores = {}
fnscores = {}

cv_scoring = {
    'f1': sklearn.metrics.get_scorer('f1_weighted'),
    'fnr': project.neg_false_negative_rate,
}

for name, pipeline in tqdm(model_best.items()):
    results = sklearn.model_selection.cross_validate(
        pipeline, X, y, cv=10, scoring=cv_scoring, n_jobs=-1,
    )
    scores[name] = results['test_f1']
    fnscores[name] = results['test_fnr']

In [None]:
# turn the per-model fold scores into frames: one column per model, one row per fold
scores = pd.DataFrame(scores)
fnscores = pd.DataFrame(fnscores)

In [None]:
# mean and standard deviation of the weighted F1 across folds, per model
scores.agg(['mean', 'std'])

The highest weighted F1 score belongs to the random forest model, followed by the LightGBM model. However, the LightGBM model's scores have a lower standard deviation than those of the random forest model, suggesting that its performance is more consistent across folds.

In [None]:
# mean and standard deviation of the negated false-negative rate across folds, per model
fnscores.agg(['mean', 'std'])

In [None]:
# distribution of the per-fold weighted F1, one panel per model
fig, axs = plt.subplots(2, 2, figsize=(32, 18))
for ax, model in zip(axs.flat, scores.columns):
    sns.histplot(data=scores, x=model, kde=True, ax=ax)
plt.show()

In [None]:
# distribution of the per-fold negated false-negative rate, one panel per model
fig, axs = plt.subplots(2, 2, figsize=(32, 18))
for ax, model in zip(axs.flat, fnscores.columns):
    sns.histplot(data=fnscores, x=model, kde=True, ax=ax)
plt.show()

In [None]:
# bar chart of the cross-validated weighted F1, one bar per model
plt.figure(figsize=(32,18))
sns.barplot(data=scores)
plt.show()

In [None]:
# bar chart of the cross-validated false-negative rate, one bar per model
# (the stored scores are negated, so the sign is flipped back for plotting)
plt.figure(figsize=(32, 18))
sns.barplot(data=-fnscores)
plt.show()

In [None]:
# refit every tuned pipeline on the full balanced training data;
# fit() works in place, so the entries of model_best are the fitted pipelines
for name, pipeline in tqdm(model_best.items()):
    pipeline.fit(X, y)

In [None]:
# empty results table: one row per metric, one column per model added by the evaluation cell
# (the 'accuracy' row is filled with balanced accuracy there)
final_model_scores = pd.DataFrame(index=['accuracy', 'f1', 'auc', 'fnr'])

In [None]:
# Score every refitted model on the held-out test split and write its predictions to CSV.
#
# The pipelines were fitted on features whose categorical/binary columns had been
# replaced by factorize() codes and whose missing values had been filled with 0.
# The raw test frame still holds strings and NaN, so it is given the same encoding
# here; otherwise the fitted encoder is applied to values it never saw during fit.
test_encoded = test.copy()
for col in categorical + binary:
    # factorize() on the untouched training column reproduces the code order used
    # for training; get_indexer() maps values unseen in training to -1
    train_categories = train[col].factorize()[1]
    test_encoded[col] = train_categories.get_indexer(test[col])
test_encoded = test_encoded.fillna(0)

with tqdm(model_best.keys()) as bar:
    for model_name in bar:
        bar.set_description(model_name)
        clf = model_best[model_name]
        preds = clf.predict(test_encoded)
        # row order matches the index of final_model_scores: accuracy, f1, auc, fnr
        final_model_scores[model_name] = (sklearn.metrics.get_scorer('balanced_accuracy')(clf, test_encoded, test_y),
                                          sklearn.metrics.get_scorer('f1_weighted')(clf, test_encoded, test_y),
                                          sklearn.metrics.get_scorer('roc_auc')(clf, test_encoded, test_y),
                                          # the project scorer returns the negated rate; flip the sign back
                                          project.neg_false_negative_rate(clf, test_encoded, test_y) * -1)
        # column names follow this dataset (row id and stroke label) rather than the
        # 'PassengerId' / 'Survived' headers left over from another project
        output = pd.DataFrame({
            'id': test.index,
            'stroke': preds
        })
        file_name = '{}-preds.csv'.format(model_name)
        output.to_csv(file_name, index=False)

In [None]:
# test-set metrics for every model (rows: metrics, columns: models)
final_model_scores