# Stroke Prediction and Analysis

- Someone in the United States has a stroke every 40 seconds.
- Every year, more than 795,000 people in the United States have a stroke.

![](https://images.medicinenet.com/images/article/main_image/stroke-symptoms-and-treatment.jpg)

# The purpose of this notebook

In this notebook, I will analyze a dataset of people who have been tested for stroke.


# About this dataset

- `id`: unique identifier
- `gender`: "Male", "Female" or "Other"
- `age`: age of the patient
- `hypertension`: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
- `heart_disease`: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
- `ever_married`: "No" or "Yes"
- `work_type`: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
- `Residence_type`: "Rural" or "Urban"
- `avg_glucose_level`: average glucose level in blood
- `bmi`: body mass index
- `smoking_status`: "formerly smoked", "never smoked", "smokes" or "Unknown" ("Unknown" means the smoking information is unavailable for this patient)
- `stroke`: 1 if the patient had a stroke or 0 if not

## Importing libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Charts
import matplotlib.pyplot as plt
import seaborn as sns
from scikitplot.estimators import plot_learning_curve
from sklearn.metrics import plot_confusion_matrix
from keras.utils.vis_utils import plot_model

#  Models
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPool2D, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgbm
import catboost as ctb


# Preprocessing
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek 
from sklearn.preprocessing import LabelEncoder

# Scoring
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, confusion_matrix

# Hyperparameters and features importance
from sklearn.model_selection import GridSearchCV
import eli5
from eli5.sklearn import PermutationImportance

# suppress version-related UserWarnings
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

## Loading the dataset

In [None]:
# Directory of the Kaggle input dataset (keeps its trailing slash so the file
# name can simply be appended).
path = '/kaggle/input/stroke-prediction-dataset/'

# Read the raw stroke data into a DataFrame.
df_stroke = pd.read_csv(f'{path}healthcare-dataset-stroke-data.csv')

## The size of the dataset

In [None]:
# (number of rows, number of columns) of the loaded frame.
df_stroke.shape

# Exploratory data analysis

## Basic info about data

In [None]:
# Column names, non-null counts and dtypes.
df_stroke.info()

## Sample data

In [None]:
# Show 15 randomly drawn rows.
df_stroke.sample(15)

## Checking missing values

In [None]:
# Total number of missing cells: per-column counts first, then their sum.
df_stroke.isnull().sum().sum()

## Removing missing values

In [None]:
# Drop every row that contains at least one missing value, modifying
# df_stroke in place.
df_stroke.dropna(inplace=True)

## Checking duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_stroke.duplicated().sum()

#### Colors for the charts

In [None]:
# Colour list passed as `color=` to the bar charts further down.
mycolors = ['red', 'blue', 'brown', 'orange']

#### Dividing the features into categorical, continuous and label columns

In [None]:
# Feature groups used throughout the notebook.
categorical_cols = [ 'gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'] # 7
continous_cols = ['age', 'avg_glucose_level', 'bmi'] # 3
label_col = ['stroke'] #1

# Integer codes for the text-valued categorical columns; applied with
# Series.map when the numeric copies of the frame are built.
# hypertension and heart_disease have no entry here (the column description
# lists them as already being 0/1).
to_numeric = {
    'gender':{'Male': 0, 'Female': 1, 'Other': 2},
    'ever_married':{'No':0, 'Yes':1},
    'work_type': {'children': 0, 'Govt_job': 1, 'Never_worked': 2, 'Private': 3, 'Self-employed': 4},
    'Residence_type': {'Rural': 0, 'Urban': 1},
    'smoking_status': {'formerly smoked': 0, 'never smoked': 1, 'smokes': 2, 'Unknown': 3}
}

# Human-readable tick labels for the 0/1 columns in the bar charts.
# Keys are strings because the plotting code looks values up via str(value).
name_change = {
    'hypertension': {'0': "patient doesn't have hypertension", '1': 'patient has hypertension'}, 
    'heart_disease': {'0': "patient doesn't have any heart diseases", '1': "patient has a heart disease"},
}

## Statistics of the continuous columns

In [None]:
# Summary statistics of the continuous columns, transposed so that each
# feature is one row.
df_stroke[continous_cols].describe().T

### Distribution of continuous features

In [None]:
# One histogram with a KDE overlay per continuous feature, each on its own
# figure (plots_per_row = 1 puts a single subplot on every figure).
plots_per_row = 1
for idx, feature in enumerate(continous_cols):
    plt.figure(idx // plots_per_row, figsize=(25, 8))
    plt.subplot(1, plots_per_row, idx % plots_per_row + 1)
    plt.title(f'Distribution of {feature} variable', fontsize=20)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel(feature, fontsize=16)
    plt.ylabel('Count', fontsize=16)
    # `kde` is an on/off flag; the previous value 50 only worked because it
    # is truthy.
    sns.histplot(df_stroke[feature], bins=50, kde=True)

In [None]:
# Density of every continuous feature, drawn separately for stroke = 0 and
# stroke = 1 (common_norm=False normalises each group on its own).
plots_per_row = 1
for idx, feature in enumerate(continous_cols):
    plt.figure(idx // plots_per_row, figsize=(25, 8))
    plt.subplot(1, plots_per_row, idx % plots_per_row + 1)
    plt.title(feature, fontsize=20)
    plt.xlabel(feature, fontsize=16)
    plt.ylabel('Density', fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    sns.kdeplot(
        data=df_stroke,
        x=feature,
        hue="stroke",
        fill=True,
        common_norm=False,
        alpha=.5,
        linewidth=0,
    )

## Boxplot of continuous features

In [None]:
# One boxplot per continuous feature, each on its own figure.
plots_per_row = 1
for idx, feature in enumerate(continous_cols):
    plt.figure(idx // plots_per_row, figsize=(25, 8))
    plt.subplot(1, plots_per_row, idx % plots_per_row + 1)
    plt.title(feature, fontsize=20)
    plt.xlabel(feature, fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    sns.boxplot(data=df_stroke[feature])
    sns.despine(offset=10, trim=True)

## Barplot of the categorical features


In [None]:
# Bar chart of the category frequencies for every categorical feature.
plots_per_row = 1
for idx, feature in enumerate(categorical_cols):
    frequencies = df_stroke[feature].value_counts()
    labels = frequencies.index
    # Swap the raw 0/1 codes for readable labels where a translation exists.
    if feature in name_change:
        labels = [name_change[feature][str(code)] for code in labels]
    plt.figure(idx // plots_per_row, figsize=(25, 8))
    plt.subplot(1, plots_per_row, idx % plots_per_row + 1)
    plt.title(feature, fontsize=20)
    plt.xlabel(feature, fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.bar(labels, frequencies.values, color=mycolors)

#### There are more women than men in the data set

In [None]:
# Numeric copy of the data for the density / correlation plots: the `id`
# column is removed and text categories are replaced by their integer codes.
temp_df = df_stroke.copy().drop(columns=['id'])
for col in categorical_cols:
    if col not in to_numeric:
        continue
    codes = to_numeric[col]
    temp_df[col] = temp_df[col].map(lambda raw: codes[raw])

In [None]:
# Density of each integer-coded categorical feature, split by the stroke
# label (each group normalised independently).
plots_per_row = 1
for idx, feature in enumerate(categorical_cols):
    plt.figure(idx // plots_per_row, figsize=(25, 8))
    plt.subplot(1, plots_per_row, idx % plots_per_row + 1)
    plt.title(feature, fontsize=20)
    plt.xlabel(feature, fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    sns.kdeplot(
        data=temp_df,
        x=feature,
        hue="stroke",
        fill=True,
        common_norm=False,
        alpha=.5,
        linewidth=0,
    )

## Count of the target

In [None]:
# Turn the 0/1 target into readable labels (anything that is not 1 gets the
# "0 = ..." label) and count each label once.
outcome = pd.Series(
    np.where(df_stroke['stroke'] == 1, '1 = patient had a stroke', "0 = patient hadn't a stroke"),
    index=df_stroke.index,
)
outcome_counts = outcome.value_counts()

plt.figure(figsize=(18, 10))
plt.title('Count of the target', size=20)
plt.xlabel('output', size=16)
plt.ylabel('Count', size=16)
plt.tick_params(labelsize=16)
plt.bar(outcome_counts.index, outcome_counts.values, color=mycolors)
plt.show()

#### The dataset contains more cases with stroke = 0 than with stroke = 1


## Correlation Matrix

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the
# numeric copy of the data.
correlations = temp_df.corr()
plt.figure(figsize=(24, 24))
sns.heatmap(correlations, annot=True, fmt='.1f', cmap="coolwarm", linewidths=0.1)
plt.title('Correlation Matrix', size=26)
plt.xticks(rotation=90, size=16)
plt.yticks(rotation=0, size=16)
plt.show()

In [None]:
# Same heatmap layout, but on a boolean mask: a cell is True where the
# correlation is at least 0.5.
strong_correlation = temp_df.corr() >= 0.5
plt.figure(figsize=(24, 24))
sns.heatmap(strong_correlation, annot=True, fmt='.1f', cmap="coolwarm", linewidths=0.1)
plt.title('Correlation Matrix', size=26)
plt.xticks(rotation=90, size=16)
plt.yticks(rotation=0, size=16)
plt.show()

#### As we can see, the variables are only weakly correlated with each other

In [None]:
# Pairwise relationships between the columns of temp_df, coloured by the
# stroke label.
sns.pairplot(temp_df, hue='stroke');

# Training model

#### Map the categorical columns to integer codes, rebalance the classes with SMOTETomek and scale the continuous columns

In [None]:
# Work on a copy without `id`, which is only a row identifier.
df_stroke_tr = df_stroke.copy()
df_stroke_tr = df_stroke_tr.drop(columns=['id'])

# Replace the text categories with the integer codes from `to_numeric`.
for x in categorical_cols:
    if x in to_numeric:
        df_stroke_tr[x] = df_stroke_tr[x].map(lambda a: to_numeric[x][a])


X = df_stroke_tr.drop(['stroke'], axis=1)
y = df_stroke_tr['stroke']

# Split BEFORE resampling. Resampling the full data set first lets synthetic
# samples interpolated from test rows end up in the training fold and
# evaluates the models on an artificially balanced test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2021, stratify=y)

# Balance the classes in the training fold only; the test fold keeps the
# original class distribution.
# NOTE(review): SMOTE interpolates the integer-coded categorical columns as if
# they were continuous — consider SMOTENC; confirm this is acceptable.
sm = SMOTETomek(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)


# Variant 1: features left unscaled.
X_train_raw = X_train.copy()
X_test_raw = X_test.copy()
y_train_raw = y_train.copy()
y_test_raw = y_test.copy()

# Variant 2: continuous columns min-max scaled (scaler fitted on train only).
X_train_norm = X_train.copy()
X_test_norm = X_test.copy()
y_train_norm = y_train.copy()
y_test_norm = y_test.copy()
norm = MinMaxScaler()
X_train_norm[continous_cols] = norm.fit_transform(X_train_norm[continous_cols])
X_test_norm[continous_cols] = norm.transform(X_test_norm[continous_cols])

# Variant 3: continuous columns standardised (scaler fitted on train only).
X_train_stand = X_train.copy()
X_test_stand = X_test.copy()
y_train_stand = y_train.copy()
y_test_stand = y_test.copy()
scaler = StandardScaler()
X_train_stand[continous_cols] = scaler.fit_transform(X_train_stand[continous_cols])
X_test_stand[continous_cols] = scaler.transform(X_test_stand[continous_cols])

### Helper functions that make life easier

In [None]:
def train_model(model, X, y):
    """Fit ``model`` on ``(X, y)`` and return that same model object.

    The return value of ``model.fit`` is deliberately ignored; the object
    that was passed in is what gets returned.
    """
    model.fit(X, y)
    return model


def predict_model(model, X, proba=False):
    """Return predictions of ``model`` for ``X``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` and, when ``proba`` is true,
        ``predict_proba``.
    X : array-like
        Samples to predict.
    proba : bool, default False
        If false, return ``model.predict(X)``. If true, call
        ``model.predict_proba(X)`` and return the index of the most probable
        column for every row.

    Returns
    -------
    Predictions as returned by ``predict``, or an array of column indices
    when ``proba`` is true.
    """
    # `not`, not `~`: bitwise inversion of a bool yields -1 or -2, both of
    # which are truthy, so the probability branch used to be unreachable.
    if not proba:
        return model.predict(X)

    y_pred_proba = model.predict_proba(X)
    return np.argmax(y_pred_proba, axis=1)


# One dict per evaluated (model, feature-scaling) pair; filled by run_model.
list_scores = []


def run_model(name, model, X_train, X_test, y_train, y_test, fc, proba=False):
    """Fit ``model``, evaluate it on the test split and record the scores.

    Prints the model name, the feature-scaling label, accuracy / recall /
    precision / F1 and the classification report, shows a confusion matrix
    and a learning curve, and appends one row to ``list_scores``.

    Parameters
    ----------
    name : str
        Label stored under 'Model Name'.
    model : estimator
        Estimator to fit on the training split.
    X_train, X_test, y_train, y_test
        Train / test features and labels.
    fc : str
        Label stored under 'Feature Scaling'.
    proba : bool, default False
        Forwarded to ``predict_model``.
    """
    print(name)
    print(fc)

    fitted = train_model(model, X_train, y_train)
    predictions = predict_model(fitted, X_test, proba)

    # Keys double as the column names of the summary table.
    metrics = {
        'Accuracy': accuracy_score(y_test, predictions),
        'Recall': recall_score(y_test, predictions),
        'Precision': precision_score(y_test, predictions),
        'F1': f1_score(y_test, predictions),
    }

    print('accuracy: ', metrics['Accuracy'])
    print('recall: ', metrics['Recall'])
    print('precision: ', metrics['Precision'])
    print('f1: ', metrics['F1'])
    print(classification_report(y_test, predictions))

    plot_confusion_matrix(fitted, X_test, y_test, cmap='Blues')
    plt.show()
    plot_learning_curve(fitted, X_train, y_train, cv=3, scoring='f1')
    plt.show()

    list_scores.append({'Model Name': name, 'Feature Scaling': fc, **metrics})

In [None]:
# Scaling label -> (X_train, X_test, y_train, y_test); every model below is
# run once per entry.
feature_scaling = {
    'Raw':(X_train_raw, X_test_raw, y_train_raw, y_test_raw),
    'Normalization':(X_train_norm, X_test_norm, y_train_norm, y_test_norm),
    'Standardization':(X_train_stand, X_test_stand, y_train_stand, y_test_stand),
}

## Running some models on this data

In [None]:
# Support-vector classifier with default hyperparameters, evaluated once per
# feature-scaling variant (the same estimator object is refitted each time).
model_svc = SVC(random_state=2021)

for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    run_model('SVC', model_svc, X_train, X_test, y_train, y_test, fc=fc_name)

In [None]:
# Logistic regression (lbfgs, up to 1000 iterations), evaluated once per
# feature-scaling variant with proba=True forwarded to run_model.
logreg = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=2021,
)

for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    run_model(
        'Logistic Regression',
        logreg,
        X_train,
        X_test,
        y_train,
        y_test,
        fc_name,
        proba=True,
    )

In [None]:
# k-nearest neighbours: pick n_neighbors per feature-scaling variant, then
# evaluate the chosen model with run_model.
candidate_ks = range(2, 50)

for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    # Select k by cross-validated accuracy on the TRAINING split only, so the
    # test split is not used for model selection and the reported scores
    # stay an honest estimate.
    cv_accuracy = [
        cross_val_score(
            KNeighborsClassifier(n_neighbors=k),
            X_train,
            y_train,
            cv=5,
            scoring='accuracy',
        ).mean()
        for k in candidate_ks
    ]
    best_k = candidate_ks[int(np.argmax(cv_accuracy))]

    # run_model fits the estimator itself, so no separate fit is needed here.
    knn = KNeighborsClassifier(n_neighbors=best_k)
    run_model(f'KNeighbors Classifier n_neighbors = {best_k}', knn, X_train, X_test, y_train, y_test, fc_name)

In [None]:
# Decision tree: for every feature-scaling variant, grid-search max_depth
# over 1..4 (random_state pinned through the grid) and evaluate the search
# object as the model.
for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    parameters = {
        'max_depth': np.arange(1, 5),
        'random_state': [2021],
    }
    dt = DecisionTreeClassifier()
    searcher = GridSearchCV(dt, parameters)

    run_model('DecisionTree Classifier', searcher, X_train, X_test, y_train, y_test, fc=fc_name)

In [None]:
# Random forest: 200 trees of depth 3, evaluated once per feature-scaling
# variant. The depth is part of the logged name so this run can be told apart
# from the other RandomForest rows in the summary table.
rf_max_depth = 3
rf = RandomForestClassifier(n_estimators=200, max_depth=rf_max_depth, random_state=2021)

for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    run_model(f'RandomForest Classifier max_depth = {rf_max_depth}', rf, X_train, X_test, y_train, y_test, fc_name)

In [None]:
# Random forest: 200 trees of depth 2, evaluated once per feature-scaling
# variant. The depth is part of the logged name so this run can be told apart
# from the other RandomForest rows in the summary table.
rf_max_depth = 2
rf = RandomForestClassifier(n_estimators=200, max_depth=rf_max_depth, random_state=2021)

for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    run_model(f'RandomForest Classifier max_depth = {rf_max_depth}', rf, X_train, X_test, y_train, y_test, fc_name)

In [None]:
# Random forest: 200 trees of depth 2, evaluated once per feature-scaling
# variant, with the depth included in the logged name.
# NOTE(review): these hyperparameters are identical to the preceding
# RandomForest cell, so this cell only adds duplicate rows to list_scores —
# confirm whether a different configuration was intended.
rf_max_depth = 2
rf = RandomForestClassifier(n_estimators=200, max_depth=rf_max_depth, random_state=2021)

for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    run_model(f'RandomForest Classifier max_depth = {rf_max_depth}', rf, X_train, X_test, y_train, y_test, fc_name)

In [None]:
# Gradient boosting: 200 depth-3 trees, each fitted on 80% of the rows and
# considering 20% of the features per split; evaluated once per
# feature-scaling variant.
gbt = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=3,
    subsample=0.8,
    max_features=0.2,
    random_state=2021,
)
for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    run_model('GradientBoosting Classifier', gbt, X_train, X_test, y_train, y_test, fc=fc_name)

In [None]:
# XGBoost: 200 boosting rounds of depth-2 trees, a fresh model per
# feature-scaling variant. The hyperparameters are part of the logged name so
# the different XGBoost runs can be told apart in the summary table.
xgb_n_estimators, xgb_max_depth = 200, 2
for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    xgb_model = xgb.XGBClassifier(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth, random_state=2021, use_label_encoder=False, eval_metric='mlogloss')

    run_model(f'XGBoost Classifier n_estimators = {xgb_n_estimators}, max_depth = {xgb_max_depth}', xgb_model, X_train, X_test, y_train, y_test, fc_name)

In [None]:
# XGBoost: 100 boosting rounds of depth-3 trees, a fresh model per
# feature-scaling variant. The hyperparameters are part of the logged name so
# the different XGBoost runs can be told apart in the summary table.
xgb_n_estimators, xgb_max_depth = 100, 3
for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    xgb_model = xgb.XGBClassifier(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth, random_state=2021, use_label_encoder=False, eval_metric='mlogloss')

    run_model(f'XGBoost Classifier n_estimators = {xgb_n_estimators}, max_depth = {xgb_max_depth}', xgb_model, X_train, X_test, y_train, y_test, fc_name)

In [None]:
# XGBoost: 200 boosting rounds of depth-2 trees, a fresh model per
# feature-scaling variant, with the hyperparameters included in the logged name.
# NOTE(review): these hyperparameters are identical to the earlier
# n_estimators=200, max_depth=2 XGBoost cell, so this cell only adds duplicate
# rows to list_scores — confirm whether a different configuration was intended.
xgb_n_estimators, xgb_max_depth = 200, 2
for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    xgb_model = xgb.XGBClassifier(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth, random_state=2021, use_label_encoder=False, eval_metric='mlogloss')

    run_model(f'XGBoost Classifier n_estimators = {xgb_n_estimators}, max_depth = {xgb_max_depth}', xgb_model, X_train, X_test, y_train, y_test, fc_name)

In [None]:
# XGBoost: 500 boosting rounds of depth-2 trees, a fresh model per
# feature-scaling variant. The hyperparameters are part of the logged name so
# the different XGBoost runs can be told apart in the summary table.
xgb_n_estimators, xgb_max_depth = 500, 2
for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    xgb_model = xgb.XGBClassifier(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth, random_state=2021, use_label_encoder=False, eval_metric='mlogloss')

    run_model(f'XGBoost Classifier n_estimators = {xgb_n_estimators}, max_depth = {xgb_max_depth}', xgb_model, X_train, X_test, y_train, y_test, fc_name)

In [None]:
# LightGBM: 500 depth-2 trees with subsample=0.8, evaluated once per
# feature-scaling variant (the same estimator object is refitted each time).
lgbm_model = lgbm.LGBMClassifier(
    max_depth=2,
    n_estimators=500,
    subsample=0.8,
    random_state=2021,
)
for fc_name, (X_train, X_test, y_train, y_test) in feature_scaling.items():
    run_model('Lightgbm Classifier', lgbm_model, X_train, X_test, y_train, y_test, fc=fc_name)

## Summary scores

In [None]:
# One row per recorded run; the maximum of every column is highlighted in
# light green.
df_scores = pd.DataFrame(list_scores)
df_scores.style.highlight_max(color = 'lightgreen', axis = 0)

# Summary

#### We learned a lot of interesting things about stroke.

#### I would love to hear your comments and notes about this.

#### If you liked it, make sure to vote :)

#### I'm going to make the next notebook soon.

<font size="6">
    <div style="text-align: center"> <b> Author </b> </div>
</font>

<font size="5">
    <div style="text-align: center"> Jędrzej </div>
    <div style="text-align: center"> Dudzicz </div>
</font>