# Stroke Binary Classification
## by Dahlia Weinberg

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat

from pprint import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so prefer the new name and fall back to the old
# one on installs that do not know it yet.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)

In [None]:
df = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [None]:
df.head(3)

# Exploratory Data Analysis

In [None]:
df.shape

In [None]:
round(df.describe()).T

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
plt.figure(dpi=120)
# Correlate the numeric columns only: the raw frame still holds text columns
# (gender, smoking_status, ...) and DataFrame.corr() raises on them in pandas >= 2.0.
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True)
plt.title('correlation matrix', weight='bold')
plt.show()

### Stroke has some correlation with age. 
### Age and BMI have some correlation.

In [None]:
df.stroke.value_counts(normalize=True)

### The target is only 5% of the dataset.  
### This means the dataset is unbalanced.

In [None]:
sns.boxplot(x='stroke', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

### The median age for people with stroke is significantly higher.

In [None]:
df.gender.value_counts(normalize=True)

In [None]:
sns.boxplot(x='gender', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

### There are more males than females in this dataset but it's not overwhelmingly skewed. 
### The age distribution is pretty similar.

In [None]:
df.hypertension.value_counts(normalize=True)

In [None]:
sns.boxplot(x='hypertension', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

In [None]:
df.heart_disease.value_counts(normalize=True)

In [None]:
sns.boxplot(x='heart_disease', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

In [None]:
df.smoking_status.value_counts(normalize=True)

In [None]:
sns.boxplot(x='smoking_status', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

In [None]:
df.ever_married.value_counts()

In [None]:
sns.boxplot(x='ever_married', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

In [None]:
df.Residence_type.value_counts()

In [None]:
sns.boxplot(x='Residence_type', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

In [None]:
df.work_type.value_counts()

In [None]:
sns.boxplot(x='work_type', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

### Histograms for numerical features

In [None]:
df.hist('age', bins=30)
plt.title("Age Distribution", weight='bold')
plt.show()

In [None]:
df.hist('avg_glucose_level', bins=30)
plt.xlabel("Average Glucose Level")
plt.ylabel("Count")
plt.show()

In [None]:
df.hist('bmi', bins=30)

In [None]:
plt.figure(dpi=120)
df.groupby('stroke')['age'].plot(kind='hist', bins=30, legend=True,
                                             alpha=0.7, title='Stroke by Age' )
plt.xlabel('Age')
plt.legend(shadow=True, frameon=True)
plt.show()

In [None]:
plt.figure(dpi=120)
df_stroke = df[df.stroke==1]
plt.hist(df_stroke.age, bins=70)
plt.title('Age distribution of people with stroke')
plt.axvline(40, color='red', linestyle='dashed', linewidth=2)
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

### Stroke rises with age and is barely present under the age of 40.

# Data Cleaning

### 1. Gender Feature

In [None]:
df.gender.value_counts()

In [None]:
df.drop(df[df['gender']=='Other'].index,inplace=True)

In [None]:
df[df['gender']=='Other'].index

In [None]:
# Encode gender numerically (Male=1, Female=2).
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form triggers a warning in recent pandas
# and is not guaranteed to write through to df.
df['gender'] = df['gender'].replace({'Male': 1, 'Female': 2})

### 2. Stroke and Age - cleaning outliers

In [None]:
sns.boxplot(x='stroke', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

In [None]:
df.query('age < 20 and stroke==1')

In [None]:
df.drop(index=[162, 245], axis=0, inplace=True)

In [None]:
sns.boxplot(x='stroke', y='age', data=df, palette='rainbow')
plt.ylim(-10,100)

### 3. Data imputation of missing values in BMI feature.
The NaN (missing) values were replaced with the mean BMI of their respective age bins. 

In [None]:
plt.figure(dpi=120)
sns.heatmap(df.isnull(), yticklabels=False, cbar=False)
plt.title("Missing Values Heat Map")

In [None]:
df.age.max()

In [None]:
# create age bins
bins = [0, 40, 45, 50, 55, 60, 65, 70, 75, 82]
df['age_group']=pd.cut(df.age, bins)
df[['id', 'age', 'age_group']].sample(5)

In [None]:
# calculate average BMI for each age bin
# Select 'bmi' before aggregating: averaging the whole grouped frame first also
# tries to average the text columns, which raises in pandas >= 2.0.
# observed=False keeps every age bin in the result, including empty ones.
age_group_to_meanbmi = df.groupby('age_group', observed=False)['bmi'].mean()
age_group_to_meanbmi

In [None]:
# replace missing values with age group mean BMI values
# Select 'bmi' before aggregating so only that numeric column is averaged
# (averaging the whole grouped frame fails on the text columns in pandas >= 2.0).
age_group_to_meanbmi = df.groupby('age_group', observed=False)['bmi'].mean()

# transform('mean') returns, for every row, the mean BMI of that row's age group,
# so the missing values can be filled in one vectorized step instead of a row loop.
group_mean_bmi = df.groupby('age_group', observed=False)['bmi'].transform('mean')
df['bmi'] = df['bmi'].fillna(group_mean_bmi)

plt.figure(dpi=120)
sns.heatmap(df.isnull(), yticklabels=False, cbar=False)

### 4. DF over 40 - making dataset more balanced.

In [None]:
df_over_40 = df[df.age >= 40]
df_over_40.sample()
#df_over_40.shape

In [None]:
# now the target is 8% rather than less than 5%
df_over_40.stroke.value_counts(normalize=True)

### 5. Turning smoking status into numerical features using dummy variables

In [None]:
dummy_smoker = pd.get_dummies(df.smoking_status, drop_first=False, prefix='Smoker')
dummy_smoker.head()

In [None]:
dummy_smoker.shape, df.shape

In [None]:
df = pd.concat([df, dummy_smoker], axis=1)
df.head()

# Function for data cleaning

In [None]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, model-ready copy of the raw stroke dataset.

    Steps, in order:
      1. Drop rows whose gender is 'Other' and encode gender as Male=1, Female=2.
      2. Drop stroke cases younger than 20, which the analysis treats as age outliers.
      3. Add an ``age_group`` column and fill missing ``bmi`` values with the mean
         BMI of the row's age group.
      4. Keep only rows with age >= 40.
      5. Append one-hot ``Smoker_*`` columns built from ``smoking_status``.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: Raw data containing at least the columns ``gender``, ``age``,
            ``stroke``, ``bmi`` and ``smoking_status``.

    Returns:
        A new DataFrame holding the original columns plus ``age_group`` and the
        ``Smoker_*`` indicator columns, restricted to rows with age >= 40.
    """
    # remove unknown gender (a single row in the original data); copy so the
    # caller's frame is never modified
    cleaned = df.loc[df['gender'] != 'Other'].copy()

    # turn the gender feature from a string into a number
    # NOTE: any label other than 'Male'/'Female' becomes NaN
    cleaned['gender'] = cleaned['gender'].map({'Male': 1, 'Female': 2})

    # remove age outliers for stroke. The rows are selected by condition rather
    # than by hard-coded row labels, so this also works on a re-indexed frame,
    # a different extract, or a second call.
    # NOTE(review): the notebook queried `age < 20 and stroke == 1` right before
    # dropping labels 162 and 245 - confirm those are exactly the matching rows.
    is_young_stroke = (cleaned['age'] < 20) & (cleaned['stroke'] == 1)
    cleaned = cleaned.loc[~is_young_stroke].copy()

    # fill each missing bmi with the mean bmi of the row's age group
    bins = [0, 40, 45, 50, 55, 60, 65, 70, 75, 82]
    cleaned['age_group'] = pd.cut(cleaned['age'], bins)
    # Select 'bmi' before aggregating so only that column is averaged; averaging
    # the whole grouped frame fails on text columns in pandas >= 2.0.
    # Rows whose age falls outside the bins have no group and keep a missing bmi.
    group_mean_bmi = (
        cleaned.groupby('age_group', observed=True)['bmi'].transform('mean')
    )
    cleaned['bmi'] = cleaned['bmi'].fillna(group_mean_bmi)

    # limit the data to age 40 and up
    cleaned = cleaned.loc[cleaned['age'] >= 40]

    # make smoking status a numerical feature (one indicator column per status)
    dummy_smoker = pd.get_dummies(
        cleaned['smoking_status'], drop_first=False, prefix='Smoker'
    )
    return pd.concat([cleaned, dummy_smoker], axis=1)

### Data cleaning function

In [None]:
df_raw = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [None]:
df_final = clean_data(df_raw)

In [None]:
df_final.head()

In [None]:
df_final.shape

# Data Analysis

In [None]:
df_final.columns

### The features with the most visible correlation with stroke are age, glucose level and BMI. 

In [None]:
sns.pairplot(df_final[['age', 'avg_glucose_level', 'bmi', 'stroke',]], hue='stroke')
plt.show()

In [None]:
# showing the distribution of stroke in the 40+ population
df_final.groupby('stroke')['age'].plot(kind='hist', bins=30, legend=True,
                                             alpha=0.7, title='Stroke by Age' )

plt.show()

# Training the stroke model

## Features found to be of significance in the Data Analysis


In [None]:
X_features = ['gender','age', 'hypertension', 'heart_disease','avg_glucose_level', 'bmi', 'Smoker_Unknown', 
              'Smoker_formerly smoked', 'Smoker_never smoked', 'Smoker_smokes']

### Baseline Models - Decision Tree was found to be the model with the highest performance

In [None]:
# Baseline: an untuned decision tree evaluated on a 67/33 train/test split.
# The tree is seeded so the baseline scores are reproducible between runs,
# matching the random_state=42 used by the tuned trees further down.
dtree = DecisionTreeClassifier(random_state=42)

model = dtree


X = df_final[X_features]
y = df_final.stroke
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
model.fit(X_train, y_train)

### Baseline Train

In [None]:
y_pred = model.predict(X_train)
confusion_matrix(y_train, y_pred)

In [None]:
print(classification_report(y_train, y_pred))

### Baseline test

In [None]:
y_pred_test = model.predict(X_test)
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

### The decision tree overfit the training set and scored poorly on the test set. Yet it was the most effective of the classifiers.  

## Finding the features of greatest importance.

# Hyperparameter Tuning

# Decision Tree Grid Search

In [None]:
dtree = DecisionTreeClassifier()

# Look at parameters used by our current dtree
print('Parameters currently in use:\n')
pprint(dtree.get_params())

In [None]:
# Search space for the decision-tree grid search.
# min_samples_split must be at least 2 when given as an integer, so the grid
# starts at 2; a value of 1 makes every candidate that uses it fail to fit.
param_grid = {'splitter': ["best", "random"],
              'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
              'min_samples_split': [2, 3, 4],
              'min_samples_leaf': [2, 5, 8, 10]}

In [None]:
# Exhaustive search over param_grid, scored on recall; refit=True retrains the
# best candidate on the whole training set.
# The estimator is seeded because the grid includes splitter='random': without a
# fixed random_state the winning parameters can change from run to run.
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid,
                    scoring='recall', refit=True, verbose=1)
grid.fit(X_train, y_train)

In [None]:
grid.best_score_

In [None]:
grid.best_params_

# Cost complexity pruning - Hyperparameter tuning 

In [None]:
# Sweep cost-complexity pruning strengths: for each ccp_alpha, fit a tree on the
# training split and record its recall on the training and on the test split.
# NOTE(review): the test split is used here to compare alphas, so the later
# test-set scores are no longer an unbiased estimate - consider a separate
# validation split or cross-validation for choosing alpha.
# NOTE(review): max_depth / min_samples_split / min_samples_leaf in this sweep
# (100 / 8 / 2) differ from the final model cell below (20 / 2 / 3) - confirm
# which set is intended.
alphas = [0.001, 0.0013, 0.0017, 0.002, 0.0023, 0.0027, 0.003, 0.0033, 0.0037, 
         0.004, 0.0043, 0.0047, 0.005, 0.0053, 0.0057, 0.006, 0.01]

# Recall per alpha, in the same order as `alphas` (plotted in the next cell).
train_recall_scores = []
test_recall_scores = []

for alpha in alphas:
    # Only ccp_alpha varies between iterations; every other setting is fixed.
    dtree = DecisionTreeClassifier(
        ccp_alpha = alpha, class_weight='balanced', criterion='gini', random_state=42, 
        max_depth=100, min_samples_split=8, min_samples_leaf=2, splitter='best')
    
    dtree.fit(X_train, y_train)
    # Recall on the data the tree was trained on.
    y_pred = dtree.predict(X_train)
    train_recall_score = recall_score(y_train, y_pred)
    train_recall_scores.append(train_recall_score)
    
    # Recall on the held-out split.
    y_pred_test = dtree.predict(X_test)
    test_recall_score = recall_score(y_test, y_pred_test)
    test_recall_scores.append(test_recall_score)

In [None]:
plt.figure(dpi=120)

plt.plot(alphas, train_recall_scores, marker='o', label="train",
        drawstyle="steps-post")
plt.plot(alphas, test_recall_scores, marker='o', label="test",
        drawstyle="steps-post")
plt.xlabel("alpha")
plt.ylabel("recall")
plt.title("recall vs alpha for training and testing sets")
plt.legend(loc='center right', shadow=True, frameon=True)
plt.show()

## *** The graph shows that the ideal ccp_alpha for both train and test was 0.0053 

In [None]:
# Final tuned tree.
# class_weight was tried both as explicit weights ({0: 10, 1: 90}) and as
# 'balanced'; 'balanced' was the more effective of the two.
# criterion='entropy' caused overfitting, so 'gini' is used.
final_tree_params = {
    'ccp_alpha': 0.0053,
    'class_weight': 'balanced',
    'criterion': 'gini',
    'random_state': 42,
    'max_depth': 20,
    'min_samples_split': 2,
    'min_samples_leaf': 3,
    'splitter': 'best',
}
dtree = DecisionTreeClassifier(**final_tree_params)

model = dtree

In [None]:
X = df_final[X_features]
y = df_final.stroke
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
model.fit(X_train, y_train)

# Training the Decision Tree

In [None]:
y_pred = model.predict(X_train)
confusion_matrix(y_train, y_pred)

In [None]:
print(classification_report(y_train, y_pred))

# Test Decision Tree Model

In [None]:
y_pred_test = model.predict(X_test)
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

In [None]:
# Pair every training column with the importance the fitted model assigned to it,
# then rank the features from most to least important.
feature_names = list(X_train.columns)
importances = list(model.feature_importances_)

data_dict = {
    'feature_names': feature_names,
    'importances': importances,
}

df_features = pd.DataFrame(data_dict).sort_values(by='importances', ascending=False)

In [None]:
df_features

In [None]:
plt.figure(dpi=100)
plt.bar(x=df_features.feature_names, height=df_features.importances)
plt.xticks(rotation=90)
plt.title('Feature Importance')
plt.show()

# Conclusions:
### Decision tree was the model with the best performance. 
The most important features in this model were age, BMI and average glucose level. <br>
The train set was overfit.<br>
Hyperparameters were tuned to overcome this problem.<br>
Grid search found that 'entropy' was better than 'gini' but this was also causing overfitting in the train.<br> 
The best tuning was found using:
1. class_weight as 'balanced'
2. finding the best ccp_alpha by optimizing for recall

The recall was substantially improved. <br>This was the most important aspect of the classification, since the idea was to identify those at risk of stroke.<br>
The precision and the f1 score were low.
