In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Checking data

In [None]:
# Load the stroke-prediction CSV from the Kaggle input mount and preview the first rows.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
raw_data = pd.read_csv(csv_path)
raw_data.head()

In [None]:
# Print column dtypes and non-null counts to spot columns with missing values.
raw_data.info()

- `id` column is not needed
- `bmi` contains missing values

In [None]:
# Column groups used throughout the notebook: categorical vs. continuous predictors.
cat_features = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
cont_features = [
    'age',
    'avg_glucose_level',
    'bmi',
]

In [None]:
# Print the level frequencies of every categorical feature and of the target.
for column in [*cat_features, 'stroke']:
    print(f'{column}:')
    print(raw_data[column].value_counts(), '\n')

In [None]:
# Share of each target class (normalize=True yields proportions instead of counts).
raw_data['stroke'].value_counts(normalize = True)

- target variable `stroke` is highly imbalanced: stroke events represent only 5% of the data
- `stroke` events represent individuals that had experienced stroke in the past, not those who are likely to experience it in the future
- features are also imbalanced; the rare levels `Other` (gender) and `Never_worked` (work type) may be worth dropping
- `Unknown` value of smoking status is basically a missing value
- smoking status could be treated as an ordinal category - never smoked -> formerly smoked -> smokes

# Data cleaning

In [None]:
# Work on a separate frame without the unneeded `id` column; `raw_data` stays untouched.
data = raw_data.drop(columns='id')

In [None]:
# replace Unknown with NA

# Assign the result back rather than calling `replace(..., inplace=True)` on the
# selected column: the in-place call acts on an intermediate Series, which under
# pandas copy-on-write leaves `data` unchanged (and is flagged as chained
# assignment by recent pandas versions).
data['smoking_status'] = data['smoking_status'].replace('Unknown', np.nan)

In [None]:
# Cast every categorical feature to the pandas `category` dtype in a single call
# so the `.cat` accessor is available for later manipulation.
data = data.astype({feature: 'category' for feature in cat_features})

In [None]:
# Make smoking_status an ordered category: never smoked < formerly smoked < smokes.
smoking_order = ['never smoked', 'formerly smoked', 'smokes']
data['smoking_status'] = data['smoking_status'].cat.reorder_categories(smoking_order, ordered=True)

## Missing values analysis

In [None]:
# Number of missing values per column after the cleaning steps above.
data.isna().sum()

In [None]:
# Report the share of missing values in `bmi` and `smoking_status`, then show how
# the rows with a missing value split across the target classes.
na_summary = data.isna().sum()
total_count = len(data)

print('missing data ratio')
print('bmi:            {:.2f}'.format(na_summary['bmi'] / total_count))
print('smoking_status: {:.2f}'.format(na_summary['smoking_status'] / total_count))

for column in ('bmi', 'smoking_status'):
    print()
    print(f'missing {column} vs. stroke')
    print(data.loc[data[column].isna(), 'stroke'].value_counts())

- `bmi` contains missing values - 4% of DS
- `smoking_status` contains missing values - 30% of DS

Both groups of "missing values" contain rare positive stroke events. Therefore simple removal of these rows will also remove valuable stroke data. Filling NAs with fake values might be OK for bmi. Missing smoking statuses however represent a big chunk of data and fakes might have bad impact on this predictor.

# Exploratory data analysis

In [None]:
# One bar chart per categorical feature showing how often each level occurs.
for feature in cat_features:
    sns.countplot(data=data, x=feature)
    plt.show()

Plots show how imbalanced some features are. There are also visible minority classes like `Other` or `Never_worked`

In [None]:
# Pairwise plots of the continuous features, coloured by the target.
# Rows are sorted by `stroke` first — presumably so the rare stroke points are
# drawn last and stay visible.
pair_columns = ['stroke'] + cont_features
pair_frame = data[pair_columns].sort_values('stroke')
sns.pairplot(pair_frame, hue='stroke', height=4)

- age seems to have the biggest impact on stroke events
- bmi looks less important
- 2 clusters of glucose level visible (low, high)

In [None]:
# Integer-encode the categorical features on a separate frame for the correlation analysis.
# NOTE: `.cat.codes` represents missing values as -1.
data_label_enc = data.assign(
    **{feature: data[feature].cat.codes for feature in cat_features}
)

In [None]:
# Annotated heatmap of the pairwise correlations between all label-encoded columns.
corr_matrix = data_label_enc.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True)

- correlation heatmap shows higher correlation between stroke and age, glucose level, hypertension, heart disease, ever married
- lower correlation between stroke and gender, work type, residence type, bmi, smoking status
- ever married is highly correlated with age, which is natural since older people are more likely to have been married at some point

# Data preparation

## Feature selection

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Rank the features with a chi-squared test against the `stroke` target.
# NOTE(review): the selector is fitted on all complete rows, i.e. without any
# train/test separation in this cell — confirm that is acceptable.

# chi2 cannot handle missing values, so only complete rows are used here.
data_fs = data.dropna().copy()

for f in cat_features:
    data_fs[f] = data_fs[f].cat.codes

# Select features and target by name rather than by column position, and keep
# the feature names so they line up one-to-one with the scores below.
feature_names = data_fs.columns.drop('stroke')

# chi2 requires non-negative inputs; min-max scaling maps every feature to [0, 1].
X_fs = MinMaxScaler().fit_transform(data_fs[feature_names])
Y_fs = data_fs['stroke'].to_numpy()

fit = SelectKBest(score_func = chi2, k = 4).fit(X_fs, Y_fs)

# One row per feature (the target is not listed), sorted by descending score.
feature_scores = pd.DataFrame({'feature': feature_names, 'score': fit.scores_})
fs_results = feature_scores.nlargest(20, 'score')
fs_results

The most important features are hypertension, heart disease, age and glucose level. These results support the feature-importance assumptions made during EDA. Although ever married seems somewhat important, I will not use it since it is highly correlated with age (and age looks like a better predictor).

In [None]:
# Keep the four best-scoring features and restrict the cleaned data to them.
final_features = fs_results['feature'].head(4).values
data_final_features = data[final_features].copy()
data_final_features.head()

## Missing values

Luckily I don't need to bother with missing values since `bmi` and `smoking_status` features were not selected

In [None]:
# Count the missing values per selected feature.
data_final_features.isna().sum()

## Encoding categorical data

Selected categorical features are binary so I will use simple label encoding

In [None]:
# Replace each category-typed column of the selected features with its integer codes.
category_columns = data_final_features.select_dtypes('category').columns
data_enc = data_final_features.assign(
    **{column: data_final_features[column].cat.codes for column in category_columns}
)

## train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Encoded predictors and target; hold out 33% of the rows, stratified on the
# target so both splits keep the same class ratio.
X = data_enc.copy()
y = data['stroke']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

## Data resampling

Stroke events are quite rare (5% of the data) and therefore the dataset needs to be resampled. To do that I have decided to use a method that combines both over- and under-sampling.

In [None]:
# Size and positive-class share of the training split before resampling.
print(f"dataset size: {y_train.size}")
print(f"stroke ratio: {y_train.sum() / y_train.size}")

In [None]:
from imblearn.combine import SMOTEENN

# Rebalance the training split only (the test split is left as-is), then report
# the new size and positive-class share.
smt = SMOTEENN(sampling_strategy=0.7, random_state=42)
X_train, y_train = smt.fit_resample(X_train, y_train)

print(f"dataset size: {y_train.size}")
print(f"stroke ratio: {y_train.sum() / y_train.size}")

The dataset now contains 40% of stroke events

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training data only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Modeling

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings and predict the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report


def plot_confusion_matrix(estimator, X, y_true, **kwargs):
    """Plot the confusion matrix of *estimator* on ``X`` / ``y_true``.

    ``sklearn.metrics.plot_confusion_matrix`` was removed in scikit-learn 1.2.
    This wrapper keeps the old name available and delegates to
    ``ConfusionMatrixDisplay.from_estimator``; extra keyword arguments are
    passed through unchanged.

    Returns the ``ConfusionMatrixDisplay`` produced by ``from_estimator``.
    """
    return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)


# Confusion matrix and per-class precision/recall/F1 on the held-out test split.
plot_confusion_matrix(model, X_test, y_test)

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Tune the random forest with a grid search scored by F1, then predict the
# held-out rows with the refitted best estimator.
param_grid = {
    'bootstrap': [True, False],
    'n_estimators': [100, 200, 400],
}

forest = RandomForestClassifier(random_state=42)
model = GridSearchCV(forest, param_grid, scoring='f1').fit(X_train, y_train)
y_pred = model.predict(X_test)

# Show the winning hyper-parameter combination.
model.best_estimator_

In [None]:
# `plot_confusion_matrix` was removed in scikit-learn 1.2; use the
# `ConfusionMatrixDisplay.from_estimator` replacement. Imported here so this
# cell does not rely on names imported by an earlier cell.
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Confusion matrix and per-class precision/recall/F1 on the held-out test split.
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

print(classification_report(y_test, y_pred))

# Summary

I have trained two classification models on stroke-prediction dataset.

As a result of EDA and feature selection I have picked four features to train models:

- categorical: `hypertension`, `heart_disease`
- continuous: `age`, `avg_glucose_level`

## Logistic regression classifier

Model performed with **f1-score: 0.22**. Recall of this model is 0.73 and precision 0.13 which means there is a higher chance to predict stroke but with more false positive predictions.


## Random forest classifier

Model performed with **f1-score: 0.24**. Recall was, however, much lower compared to logistic regression - 0.59. Precision of 0.15 is slightly higher. This results in fewer false positive predictions but more missed stroke events (false negatives).

# Conclusion

I would say that for stroke prediction, higher recall is much more important than higher precision. Stroke observations represent individuals who **had already experienced a stroke** in the past, not those who will experience one in the future. Therefore it is likely that some of the false positive predictions are actually individuals at risk of experiencing a stroke in the future.