## Context
According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.

## Attribute Information
1) id: unique identifier

2) gender: "Male", "Female" or "Other"

3) age: age of the patient

4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

6) ever_married: "No" or "Yes"

7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

8) Residence_type: "Rural" or "Urban"

9) avg_glucose_level: average glucose level in blood

10) bmi: body mass index

11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

12) stroke: 1 if the patient had a stroke or 0 if not

***Note: "Unknown" in smoking_status means that the information is unavailable for this patient***

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so prefer the new name and fall back to the
# legacy one on older installations.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

# Basic Data Wrangling

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory and preview
# the top of the table.
dataset = pd.read_csv(
    filepath_or_buffer = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
dataset.head()

In [None]:
# Summary statistics for every column; include = 'all' covers the non-numeric columns too.
dataset.describe(include = 'all')

In [None]:
# Print a concise per-column summary (dtype and non-null count) of the frame.
dataset.info()

In [None]:
# Number of missing values in each column of the raw data.
dataset.isna().sum()

# Exploratory Data Analysis

In [None]:
# Human-readable ('Yes'/'No') copy of the binary 'stroke' column, used by the
# plots in this section. Built in one vectorised step: the previous row-by-row
# chained assignment (dataset['is_Stroke'][i] = ...) is slow and, under pandas
# Copy-on-Write, writes to a temporary object and leaves the column unchanged.
dataset['is_Stroke'] = np.where(dataset['stroke'] == 1, 'Yes', 'No')

In [None]:
# Number of patients in each gender category.
plt.figure(figsize=(10, 7))
sns.countplot(data=dataset, x='gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Gender', fontsize=20)
plt.show()

In [None]:
# Number of patients with and without a stroke ('Yes'/'No' labels).
plt.figure(figsize=(10, 7))
sns.countplot(data=dataset, x='is_Stroke')
plt.xlabel('Stroke')
plt.ylabel('Count')
plt.title('Stroke', fontsize=20)
plt.show()

In [None]:
# Number of patients per heart_disease flag value.
plt.figure(figsize=(10, 7))
sns.countplot(data=dataset, x='heart_disease')
plt.xlabel('Heart Disease')
plt.ylabel('Count')
plt.title('Heart Disease', fontsize=20)
plt.show()

In [None]:
# Number of patients per hypertension flag value.
plt.figure(figsize=(10, 7))
sns.countplot(data=dataset, x='hypertension')
plt.xlabel('Hyper Tension')
plt.ylabel('Count')
plt.title('Hyper Tension', fontsize=20)
plt.show()

In [None]:
# Number of patients per marital-status value.
plt.figure(figsize=(10, 7))
sns.countplot(data=dataset, x='ever_married')
plt.xlabel('Married')
plt.ylabel('Count')
plt.title('Married or Not?', fontsize=20)
plt.show()

In [None]:
# Number of patients per work type.
plt.figure(figsize=(10, 7))
sns.countplot(data=dataset, x='work_type')
plt.xlabel('Work')
plt.ylabel('Count')
plt.title('Type of Work', fontsize=20)
plt.show()

In [None]:
# Number of patients per residence type.
plt.figure(figsize=(10, 7))
sns.countplot(data=dataset, x='Residence_type')
plt.xlabel('Area')
plt.ylabel('Count')
plt.title('Area of Residence', fontsize=20)
plt.show()

In [None]:
# Number of patients per smoking status.
plt.figure(figsize = (10, 7))
sns.countplot(x = dataset['smoking_status'])
plt.title('Smoking Status', fontsize = 20)
# Axis label previously read 'Somking' (typo).
plt.xlabel('Smoking')
plt.ylabel('Count')
plt.show()

In [None]:
# Mean heart_disease flag per gender, split by work type.
plt.figure(figsize=(10, 7))
sns.barplot(data=dataset, x='gender', y='heart_disease', hue='work_type')
plt.legend(loc='upper right')
plt.xlabel('Gender')
plt.ylabel('Heart Disease')
plt.title('Gender vs. Heart Disease based on Type of Job', fontsize=20)
plt.show()

In [None]:
# Mean heart_disease flag per gender, split by smoking status.
plt.figure(figsize=(10, 7))
sns.barplot(data=dataset, x='gender', y='heart_disease', hue='smoking_status')
plt.legend(loc='upper right')
plt.xlabel('Gender')
plt.ylabel('Heart Disease')
plt.title('Gender vs. Heart Disease based on Smoking', fontsize=20)
plt.show()

In [None]:
# Mean heart_disease flag per gender, split by stroke outcome.
plt.figure(figsize=(10, 7))
sns.barplot(data=dataset, x='gender', y='heart_disease', hue='is_Stroke')
plt.legend(loc='upper right')
plt.xlabel('Gender')
plt.ylabel('Heart Disease')
plt.title('Gender vs. Heart Disease based on Stroke', fontsize=20)
plt.show()

In [None]:
# Pairwise relationship grid over the columns of the frame.
sns.pairplot(data=dataset)
plt.show()

In [None]:
# Pie chart of the gender column; only labels are supplied to the trace.
colors = ['mediumturquoise', 'darkorange', 'lightgreen']
fig = go.Figure(data=[go.Pie(labels=dataset['gender'])])
fig.update_traces(
    textfont_size=20,
    marker={'colors': colors, 'line': {'color': '#000000', 'width': 2}},
)
fig.update_layout(title_text='Gender')
fig.show()

In [None]:
# Pie chart of the ever_married column; only labels are supplied to the trace.
colors = ['darkorange', 'lightgreen']
fig = go.Figure(data = [go.Pie(labels = dataset['ever_married'])])
fig.update_traces(textfont_size = 20, marker = dict(colors = colors, line = dict(color = '#000000', width = 2)))
# Title previously read 'Maritial Status' (typo).
fig.update_layout(title_text = 'Marital Status')
fig.show()

In [None]:
# Pie chart of the work_type column.
# `colors` is assigned here but not passed to the trace, so this chart keeps
# plotly's own colouring; only the slice outline is customised.
colors = ['darkorange', 'lightgreen']
fig = go.Figure(data=[go.Pie(labels=dataset['work_type'])])
fig.update_traces(
    textfont_size=20,
    marker={'line': {'color': '#000000', 'width': 2}},
)
fig.update_layout(title_text='Working Status')
fig.show()

In [None]:
# Pie chart of the smoking_status column; only labels are supplied to the trace.
colors = ['orange', 'seagreen', 'gold', 'red']
fig = go.Figure(data=[go.Pie(labels=dataset['smoking_status'])])
fig.update_traces(
    textfont_size=20,
    marker={'colors': colors, 'line': {'color': '#000000', 'width': 2}},
)
fig.update_layout(title_text='Smoking Status')
fig.show()

In [None]:
# Frequency of each smoking status as a one-column frame indexed by status.
# The column is named explicitly: since pandas 2.0 the value_counts() result is
# named 'count', so relying on the implicit name made `df.smoking_status` fail.
df = dataset['smoking_status'].value_counts().to_frame(name = 'smoking_status')
px.bar(x = df.index, y = df['smoking_status'], height = 400, text = df['smoking_status'], labels = {'x':'Status', 'y': 'Count'},
      title = 'Smoking Status vs. Count')

In [None]:
# Heart-disease flag per smoking status, coloured by stroke outcome.
px.bar(
    dataset,
    x='smoking_status',
    y='heart_disease',
    color='is_Stroke',
    height=400,
    labels={'smoking_status': 'Smoking Status', 'heart_disease': 'Heart Disease'},
    title='Smoking Status vs. Heart Disease based on Stroke',
)

In [None]:
# Histogram of BMI in 5-unit bins covering 0-100.
# range(0, 100, 5) ends at 95, so any BMI in [95, 100) fell outside the last
# edge and was silently dropped; range(0, 101, 5) includes the 95-100 bin.
# Missing BMI values are removed explicitly rather than relying on how
# np.histogram treats NaN.
counts, bins = np.histogram(dataset.bmi.dropna(), bins = range(0, 101, 5))
# Replace the bin edges with bin centres so each bar sits mid-interval.
bins = 0.5 * (bins[:-1] + bins[1:])

fig = px.bar(x = bins, y = counts, labels = {'x':'BMI', 'y':'Count'}, title = 'BMI Distribution')
fig.show()

In [None]:
# Histogram of age in 5-year bins (edges 0, 5, ..., 85).
counts, bins = np.histogram(dataset['age'], bins=np.arange(0, 90, 5))
# Swap the bin edges for bin centres so each bar sits mid-interval.
bins = (bins[:-1] + bins[1:]) * 0.5

fig = px.bar(
    x=bins,
    y=counts,
    labels={'x': 'Age', 'y': 'Count'},
    title='Age Distribution',
)
fig.show()

In [None]:
# Age histogram coloured by stroke outcome, with a box plot in the margin.
px.histogram(
    dataset,
    x='age',
    color='is_Stroke',
    height=400,
    marginal='box',
    labels={'age': 'Age', 'count': 'Count'},
    title='Age vs. Count based on Stroke',
)

In [None]:
# BMI spread for each work type, coloured by residence type.
px.box(
    x=dataset['bmi'],
    y=dataset['work_type'],
    color=dataset['Residence_type'],
    labels={'x': 'BMI', 'y': 'Work Type'},
    title='BMI vs. Work Type based on Residence Type',
)

In [None]:
# BMI spread for each work type, coloured by gender.
px.box(
    x=dataset['bmi'],
    y=dataset['work_type'],
    color=dataset['gender'],
    labels={'x': 'BMI', 'y': 'Work Type'},
    title='BMI vs. Work Type based on Gender',
)

In [None]:
# Density contour of BMI, coloured by the heart_disease flag.
px.density_contour(
    dataset,
    x='bmi',
    color='heart_disease',
    labels={'bmi': 'BMI', 'index': ''},
    title='Density Contour plot of BMI based on Heart Disease',
)

In [None]:
# Density contour of age, coloured by the heart_disease flag.
px.density_contour(
    dataset,
    x='age',
    color='heart_disease',
    labels={'age': 'Age', 'index': ''},
    title='Density Contour plot of Age based on Heart Disease',
)

# Feature Engineering (Data Preprocessing)

In [None]:
# Remove the plotting-only label column and the row identifier before modelling.
dataset.drop(columns=['is_Stroke', 'id'], inplace=True)
dataset.head()

### Encoding all the necessary columns

In [None]:
# One-hot encode each categorical column, dropping the first level of each,
# and place the indicator columns in front of the existing frame.
gender, married, work, residence, smoking = (
    pd.get_dummies(dataset[column], drop_first=True)
    for column in ('gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status')
)
dataset = pd.concat([gender, married, work, residence, smoking, dataset], axis=1)

### Dropping all the unnecessary columns

In [None]:
# The raw categorical columns are redundant once their indicator columns exist.
dataset.drop(
    columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'],
    inplace=True,
)
dataset.head()

### Filling the Missing values of BMI (Using KNN Imputation) 

In [None]:
from sklearn.impute import KNNImputer

# Fill missing feature values from the 5 nearest rows.
# The target column 'stroke' is held out of the imputer so the label cannot
# leak into the imputed features through the neighbour search. It is then
# re-attached as the LAST column, because the train/test split selects the
# target by position (iloc[:, -1]).
# NOTE(review): the imputer is still fitted on all rows before the train/test
# split; fitting it on the training rows only would also avoid test-set leakage.
imputer = KNNImputer(n_neighbors = 5)
features = dataset.drop(columns = 'stroke')
imputed_features = pd.DataFrame(imputer.fit_transform(features),
                                columns = features.columns, index = dataset.index)
dataset = pd.concat([imputed_features, dataset['stroke']], axis = 1)

In [None]:
# Re-check the per-column missing-value counts after imputation.
dataset.isna().sum()

In [None]:
# The ever_married indicator came out of get_dummies named 'Yes'; give it a
# descriptive name.
dataset.rename(columns={'Yes': 'ever_married'}, inplace=True)
dataset.head()

### Determining the Correlation between various Columns

In [None]:
# Annotated heatmap of the pairwise column correlations.
plt.figure(figsize=(12, 8))
sns.heatmap(
    dataset.corr(),
    annot=True,
    linewidths=1,
    linecolor='white',
)
plt.show()

# Classification Model

### Splitting the Dataset into Training and Test set

In [None]:
from sklearn.model_selection import train_test_split

# The target is the last column of the frame; every column before it is a feature.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Hold out 25% of the rows for testing. The 'stroke' target is unbalanced, so
# the split is stratified on y to keep the class proportions the same in the
# training and test sets; an unstratified split can leave the test set with a
# different share of positive cases.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0, stratify = y)

### Applying SMOTEENN Algorithm to handle the unbalanced 'stroke' column

In [None]:
# Class counts in the training split before resampling.
print(f"Before OverSampling, counts of label '1': {(y_train == 1).sum()}")
print(f"Before OverSampling, counts of label '0': {(y_train == 0).sum()} \n")

# SMOTEENN comes from imblearn's `combine` module; resample the training split only.
from imblearn.combine import SMOTEENN
sm = SMOTEENN(random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Shapes and class counts of the resampled training data.
print(f'After OverSampling, the shape of train_X: {X_train_res.shape}')
print(f'After OverSampling, the shape of train_y: {y_train_res.shape} \n')

print(f"After OverSampling, counts of label '1': {(y_train_res == 1).sum()}")
print(f"After OverSampling, counts of label '0': {(y_train_res == 0).sum()}")

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train on the resampled training data, then predict the untouched test split.
log = LogisticRegression(max_iter=300)
log.fit(X=X_train_res, y=y_train_res)
y_pred_log = log.predict(X_test)

In [None]:
# Training accuracy (on the resampled data) and test-set metrics for Logistic Regression.
print(f"The Training Score of Logistic Regression is: {log.score(X_train_res, y_train_res)*100}%")
print(f"The Accuracy Score of Logistic Regression is: {accuracy_score(y_test, y_pred_log)*100}%")
print(f"The Confusion Matrix for Logistic Regression is: \n{confusion_matrix(y_test, y_pred_log)}\n")
print('\n')
print(classification_report(y_test, y_pred_log))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train an entropy-criterion tree on the resampled data, then predict the test split.
dt = DecisionTreeClassifier(random_state=0, criterion='entropy')
dt.fit(X=X_train_res, y=y_train_res)
y_pred_dt = dt.predict(X_test)

In [None]:
# Training accuracy (on the resampled data) and test-set metrics for the Decision Tree.
print(f"The Training Score of Decision Tree is: {dt.score(X_train_res, y_train_res)*100}%")
print(f"The Accuracy Score of Decision Tree is: {accuracy_score(y_test, y_pred_dt)*100}%")
print(f"The Confusion Matrix for Decision Tree is: \n{confusion_matrix(y_test, y_pred_dt)}\n")
print('\n')
print(classification_report(y_test, y_pred_dt))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 300 entropy-criterion trees trained on the resampled data.
# random_state is fixed (as for the split, SMOTEENN and the decision tree) so
# the forest, and therefore the reported scores, are reproducible between runs.
rf = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 0)
rf.fit(X_train_res, y_train_res)

# Predict the untouched test split.
y_pred_rf = rf.predict(X_test)

In [None]:
# Training accuracy (on the resampled data) and test-set metrics for the Random Forest.
print(f"The Training Score of Random Forest is: {rf.score(X_train_res, y_train_res)*100}%")
print(f"The Accuracy Score of Random Forest is: {accuracy_score(y_test, y_pred_rf)*100}%")
print(f"The Confusion Matrix for Random Forest is: \n{confusion_matrix(y_test, y_pred_rf)}\n")
print('\n')
print(classification_report(y_test, y_pred_rf))

In [None]:
# Side-by-side F1 scores of the three classifiers on the test split.
print(f"F1 Score for Logistic Regression is: {f1_score(y_test, y_pred_log)}")
print(f"F1 Score for Decision Tree is: {f1_score(y_test, y_pred_dt)}")
print(f"F1 Score for Random Forest is: {f1_score(y_test, y_pred_rf)}")

# Thank You :)