In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Stroke Prediction Dataset**
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.

**Dataset Information:**

healthcare-dataset-stroke-data.csv: the CSV contains data on patients who may have had a stroke, together with the attributes used to predict it:

* id: unique identifier
* gender: "Male", "Female" or "Other"
* age: age of the patient
* hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
* heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
* ever_married: "No" or "Yes"
* work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
* Residence_type: "Rural" or "Urban"
* avg_glucose_level: average glucose level in blood
* bmi: body mass index
* smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
* stroke: 1 if the patient had a stroke or 0 if not
Note: "Unknown" in smoking_status means that the information is unavailable for this patient

**Objective**

Visualize the relationships between various healthy and unhealthy habits and strokes, and thereby predict the probability of a stroke using the best model with tuned hyperparameters.

**Assumptions**

1.Smoking can induce Stroke, is it true?

2.Heart with a Heart Disease is prone to Stroke, is it true?

3.Workload(work_type) results in high blood pressure and that could lead to Stroke, is it true?

4.Males are most susceptible to strokes due to high work related stress, is it true?

5.Being Married will increase the risk of having a stroke, is it true?

6.HyperTension, is it one of the reason for a stroke?

**Questions to be answered**

1.Does age have an impact on strokes, and how is this parameter distributed?

2.Is there a difference in the rate of heart stroke for smokers and non smokers?

3.Does the type of job, whether stressful or not, contribute to heart stroke?

In [None]:
# Location of the stroke dataset inside the Kaggle input directory.
STROKE_CSV_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(STROKE_CSV_PATH)

Dataset imported

In [None]:
# Data manipulation libraries
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Avoid Warnings
# NOTE(review): the blanket 'ignore' filter silences every warning category for
# the rest of the session, including deprecation warnings raised by the
# libraries used below.
import warnings
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

#Common model helpers
from sklearn.preprocessing import(LabelEncoder)
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, accuracy_score, 
                             precision_score,
                             recall_score,
                             f1_score, 
                             confusion_matrix)

# imbalance dataset handling


from imblearn.over_sampling import (SMOTE)
# model algorithams
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [None]:
df.shape  # (rows, columns); the original run reported 5110 rows and 12 columns

In [None]:
df.head()  # first 5 rows of the dataset

In [None]:
df.tail()  # last 5 rows of the dataset

In [None]:
# Data type of every column (separates numeric columns from string/object ones).
df.dtypes

The four columns id, hypertension, heart_disease and stroke are of integer type.

The columns age, avg_glucose_level and bmi are of float type.

The columns gender, smoking_status, ever_married, work_type and Residence_type are of string (object) type.

In [None]:
# Per-column non-null counts and dtypes; a non-null count below the total
# number of rows indicates missing values in that column.
df.info()

As per my observation, the non-null count of every column matches the number of rows in the dataframe, except for the bmi column.

So there are missing values in the bmi column.

In [None]:
#lets import missingno package
#Missingno is a Python library 
#that provides the ability to understand the distribution of missing values through informative visualizations.
import missingno as msno

In [None]:
# Visualize, as a bar plot, how complete each column of df is (missingno).
msno.bar(df)

So there are a total of 201 missing values in the bmi column.

# Treating Missing Values

In [None]:
# Percentage of missing entries in each column.
missing_counts = df.isnull().sum()
per = missing_counts / len(df) * 100
per

Only 3.9 percent of the bmi data is missing, so instead of removing the entire column we can replace the missing values with the column mean.

In [None]:
# Impute the missing BMI values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection df['bmi']: that chained in-place
# form operates on an intermediate object and can leave df unchanged when
# pandas Copy-on-Write is active (pandas 2.2 warns about it).
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
# Remaining missing values per column (should now be 0 for bmi).
df.isnull().sum()

missing values are replaced by the mean

In [None]:
# Count plots for all categorical features, laid out on a 4x2 grid.
categorical_features = [
    'gender', 'hypertension',
    'heart_disease', 'ever_married',
    'work_type', 'Residence_type',
    'smoking_status', 'stroke',
]

fig, axes = plt.subplots(4, 2, figsize=(16, 16))
sns.set_style('whitegrid')
fig.suptitle("Count plot for various categorical features")

# axes.flat walks the grid row by row, so features fill the panels left to
# right, top to bottom.
for ax, feature in zip(axes.flat, categorical_features):
    sns.countplot(ax=ax, data=df, x=feature)

plt.show()

In [None]:
# Number of rows per gender category.
df['gender'].value_counts()

In [None]:
# Remove the rows whose gender is 'Other' (df is modified in place).
other_gender_rows = df.index[df['gender'] == 'Other']
df.drop(index=other_gender_rows, inplace=True)

In [None]:
# Number of rows per gender category (re-checked after the row removal above).
df['gender'].value_counts()

In [None]:
# The 'id' column is only a unique row identifier, so drop it (in place).
df.drop('id', axis=1, inplace=True)

Let us check whether gender has a part to play in brain strokes

before that lets check the count of people who had a stroke

In [None]:
# Number of rows for each value of the target column 'stroke'.
sns.countplot(data=df,x='stroke')

As we can see, the number of people who had a stroke is negligible compared to those who did not. Because of this class imbalance, a model trained on the data may underfit the stroke class or overfit to the majority class.

In [None]:
# Stroke / no-stroke rate per gender, as whole-number percentages.
#
# Counts are looked up by label ('Female' / 'Male') rather than by position:
# value_counts() orders its result by frequency, so values[0] / values[1] only
# refer to the same gender in every subset by coincidence.
gender_counts = df['gender'].value_counts()
stroke_gen = df[df['stroke'] == 1]['gender'].value_counts()
healthy_gen = df[df['stroke'] == 0]['gender'].value_counts()

female = gender_counts.get('Female', 0)
male = gender_counts.get('Male', 0)


def _whole_percent(part, whole):
    """Return part / whole as a rounded integer percentage (0 if whole is 0)."""
    if not whole:
        return 0
    return int(round(part / whole * 100, 0))


stroke_male = _whole_percent(stroke_gen.get('Male', 0), male)
stroke_female = _whole_percent(stroke_gen.get('Female', 0), female)
healthy_male = _whole_percent(healthy_gen.get('Male', 0), male)
healthy_female = _whole_percent(healthy_gen.get('Female', 0), female)

In [None]:
!pip install pywaffle
from pywaffle import Waffle

In [None]:
# Share of each gender among all rows, as whole-number percentages.
total_gender = female + male
female_per = int(round(female / total_gender * 100, 0))
male_per = int(round(male / total_gender * 100, 0))

In [None]:
# Waffle chart with one panel per gender; tiles are coloured by stroke outcome.
# This plot is taken from https://www.kaggle.com/aditimulye/stroke-prediction-visualization-prediction
NO_STROKE_COLOR = '#512b58'
STROKE_COLOR = '#fe346e'
BACKGROUND_COLOR = '#f6f5f5'


def _waffle_panel(values, icon):
    """Return the pywaffle settings shared by both gender panels.

    values -- [no-stroke percentage, stroke percentage] for the panel
    icon   -- icon name used for the tiles ('male' or 'female')
    """
    return {
        'rows': 7,
        'columns': 7,
        'values': values,
        'colors': [NO_STROKE_COLOR, STROKE_COLOR],
        'vertical': True,
        'interval_ratio_y': 0.1,
        'interval_ratio_x': 0.1,
        'icons': icon,
        'icon_legend': False,
        'icon_size': 20,
        'plot_anchor': 'C',
        'alpha': 0.1,
    }


def _serif(size, color, weight='bold'):
    """Return a fontdict for serif text of the given size, colour and weight."""
    return {'font': 'Serif', 'size': size, 'weight': weight, 'color': color}


fig = plt.figure(
    FigureClass=Waffle,
    constrained_layout=True,
    figsize=(7, 7),
    facecolor=BACKGROUND_COLOR,
    dpi=100,
    plots={
        '121': _waffle_panel([healthy_male, stroke_male], 'male'),
        '122': _waffle_panel([healthy_female, stroke_female], 'female'),
    },
)

# Title and sub-title.
fig.text(0., 0.8, 'Gender Risk for Stroke - effect of gender on strokes?', _serif(20, 'black'))
fig.text(0., 0.73, 'Risk of stroke in both male and female are same,\nprove our initial assumption is wrong. ',
         _serif(13, 'black', weight='normal'), alpha=0.7)

# Text drawn in the background colour underneath each percentage label.
fig.text(0.24, 0.22, 'ooo', _serif(16, BACKGROUND_COLOR))
fig.text(0.65, 0.22, 'ooo', _serif(16, BACKGROUND_COLOR))

# No-stroke percentage printed on each panel.
fig.text(0.23, 0.28, '{}%'.format(healthy_male), _serif(20, NO_STROKE_COLOR), alpha=1)
fig.text(0.65, 0.28, '{}%'.format(healthy_female), _serif(20, NO_STROKE_COLOR), alpha=1)

# Panel captions with the share of each gender in the dataset.
fig.text(0.21, 0.67, 'Male ({}%)'.format(male_per), _serif(14, 'black'), alpha=0.5)
fig.text(0.61, 0.67, 'Female({}%)'.format(female_per), _serif(14, 'black'), alpha=0.5)

# Hand-made legend. The font size key must be lowercase 'size': current
# matplotlib rejects 'Size' as an unknown Text property.
legend_stroke = {'font': 'Serif', 'weight': 'bold', 'size': 16, 'style': 'normal', 'color': STROKE_COLOR}
legend_no_stroke = {'font': 'Serif', 'weight': 'bold', 'size': 16, 'style': 'normal', 'color': NO_STROKE_COLOR}
fig.text(0.9, 0.73, 'Stroke ', legend_stroke)
fig.text(1.02, 0.73, '|', {'color': 'black', 'size': 16, 'weight': 'bold'})
fig.text(1.035, 0.73, 'No Stroke', legend_no_stroke, alpha=1)

plt.show()

It is interesting to note that, although the numbers of males and females in the dataset are different, both are at equal risk of a stroke.
This shows that our assumption that males are more susceptible to stroke due to workload is wrong.

In [None]:
# Distribution plot of the BMI column (plotly figure factory).
bmi_values = list(df['bmi'].values)
fig = ff.create_distplot(
    [bmi_values],
    ['bmi'],
    show_hist=True,
    colors=['Red'],
)
fig.show()

Due to outliers, the histogram is right-skewed.

Either the outliers can be removed, or the distribution can be made less skewed by applying a log transform to the values; both options, however, lead to a loss of data points with Stroke = 1.

In [None]:
# Drop the rows whose BMI exceeds the outlier cut-off (in place) and report the
# shape of the frame before and after the removal.
BMI_OUTLIER_THRESHOLD = 47
# The first message used to say "after removing" even though nothing had been
# removed yet.
print("The shape before removing the BMI outliers : ",df.shape)
df.drop(df[df['bmi'] > BMI_OUTLIER_THRESHOLD].index, inplace = True)
print("The shape after removing the BMI outliers : ",df.shape)

In [None]:
# Distribution plot of the current BMI column (plotly figure factory).
bmi_values = list(df['bmi'].values)
distplot_kwargs = dict(show_hist=True, colors=['Red'])
fig = ff.create_distplot([bmi_values], ['bmi'], **distplot_kwargs)
fig.show()

In [None]:
# Stroke / no-stroke counts for every work type.
fig, ax = plt.subplots(figsize=[8, 6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
ax.set_title(" worktype vs stroke")
sns.countplot(data=df, x="work_type", hue="stroke", edgecolor="black", color="#b8c7e1", ax=ax)

Although the number of private employees is higher than that of the other work types, it is evident that
any kind of work exposes you to a higher risk of stroke.

In [None]:
# Stroke / no-stroke counts for every smoking status.
fig, ax = plt.subplots(figsize=[8, 6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
ax.set_title(" smoker vs non smoker")
sns.countplot(data=df, x="smoking_status", hue="stroke", edgecolor="black", color="#b8c7e1", ax=ax)

Smoking increases the risk

In [None]:
# Stroke / no-stroke counts split by the hypertension flag.
fig, ax = plt.subplots(figsize=[8, 6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
ax.set_title("  Stroke/Hypertension")
sns.countplot(data=df, x="hypertension", hue="stroke", edgecolor="black", color="#b8c7e1", ax=ax)

More than 25% of the people who had a stroke also had hypertension.

In [None]:
# Stroke / no-stroke counts split by the heart-disease flag.
fig, ax = plt.subplots(figsize=[8, 6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
ax.set_title("  Stroke/HeartDisease")
sns.countplot(data=df, x="heart_disease", hue="stroke", edgecolor="black", color="#b8c7e1", ax=ax)

Here we can see that people who are married have a higher number of strokes,
but let's not conclude anything yet;
first, let's look at the age of the people who are married.

In [None]:
# Age distribution for each marital status, as box plots without outlier markers.
fig, ax = plt.subplots(figsize=[8, 6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
ax.set_title("Age according the marital status classes.")
sns.boxplot(data=df, y='age', x='ever_married', width=0.4, showfliers=False, color="#b8c7e1", ax=ax)

Now we can see that the people who are married are mostly in the 50-70 age group.

In [None]:
# Age density per stroke outcome, overlaid in one wide facet.
# `fill=True` replaces the deprecated `shade=True` argument of sns.kdeplot.
age_grid = sns.FacetGrid(data=df, hue="stroke", aspect=4)
age_grid.map(sns.kdeplot, "age", fill=True)
age_grid.add_legend()
plt.savefig('stroke_age.png')

As we already know, an increase in age increases the risk of having a stroke, and the plot above supports this.

In [None]:
# BMI density per stroke outcome, overlaid in one wide facet.
# `fill=True` replaces the deprecated `shade=True` argument of sns.kdeplot.
bmi_grid = sns.FacetGrid(data=df, hue="stroke", aspect=4)
bmi_grid.map(sns.kdeplot, "bmi", fill=True)
bmi_grid.add_legend()
plt.savefig('bmi.png')

Mostly overweight people have a higher risk of having a stroke.

In [None]:
# Convert the categorical columns below into integer codes, one column at a
# time, re-fitting the same LabelEncoder for each of them.
labelencoder = LabelEncoder()
for column in ('gender', 'ever_married', 'Residence_type'):
    df[column] = labelencoder.fit_transform(df[column])
df.head()

In [None]:
# One-hot encode the columns that are still of object dtype (the variables with
# more than 2 classes); drop_first leaves out the first level of each variable.
object_columns = [column for column in df.columns if df[column].dtypes == 'object']
df = pd.get_dummies(df, columns=object_columns, drop_first=True)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlations = df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(correlations, annot=True)
plt.savefig('stroke_corr_heat.png')

The columns age, heart_disease, hypertension and avg_glucose_level are positively correlated with stroke.

# Splitting Test and Train data

In [None]:
# Feature matrix and target vector.
X = df.drop('stroke', axis=1)
# to_numpy() instead of Series.ravel(), which is deprecated in recent pandas.
y = df['stroke'].to_numpy()
from sklearn.preprocessing import MinMaxScaler

# Split first and fit the scaler on the training rows only, so that the
# min/max of the held-out rows never influence the transformation
# (fitting on the full data before the split leaks test information).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, shuffle=True, random_state=42
)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full matrix scaled with the training-set statistics; kept so that the name
# X_scale stays available.
X_scale = scaler.transform(X)

In [None]:
# Fit a logistic regression on the training split and predict the test split.
lr = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Accuracy of the logistic-regression predictions on the test split.
accuracy_score(y_test,y_pred_lr)


In [None]:
# Per-class precision / recall / F1 and the confusion matrix for the
# logistic-regression predictions (arguments are passed as y_true, y_pred).
print(classification_report(y_test,y_pred_lr))
print(confusion_matrix(y_test,y_pred_lr))

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred). Passing the predictions first
# transposes the confusion matrix and swaps precision with recall.
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

With imbalanced data, accuracy is not a metric we can rely on, because it is dominated by the larger class of the target. In other words, this model is very accurate at predicting when a person is not having a stroke, which is obviously not what we need...

The poor result in class 1 of the target is expected because of the imbalanced dataset as well as the limited correlation among the variables.

In [None]:
# Balancing our dataset
# Using over-sampling method (SMOTE)

from imblearn.over_sampling import SMOTE

# Hold out the test rows BEFORE oversampling. Resampling the full data and
# splitting afterwards puts synthetic points (interpolated from rows that end
# up in the test set) into the training data and makes the test set itself
# partly synthetic, which inflates every score.
# NOTE(review): X is the unscaled feature frame here, as in the original cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample the minority class of the training part only; the seed makes the
# synthetic samples reproducible.
sm = SMOTE(random_state=42)
X_oversampled, y_oversampled = sm.fit_resample(X_train, y_train)

# Class balance of the resampled training target.
sns.countplot(x=y_oversampled)
plt.savefig('stroke_oversampled.png')

# Train again with the new data: balanced training set, untouched test set.
X_train, y_train = X_oversampled, y_oversampled

In [None]:
# Logistic Regression: fit on the current training split, predict the test split.
lr = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Report for the logistic-regression predictions. Both metrics take
# (y_true, y_pred); the confusion matrix used to receive them in reverse
# order, which transposed it.
print(classification_report(y_test, y_pred_lr))
print(confusion_matrix(y_test, y_pred_lr))

In [None]:
# Decision Tree
# Seeded like the other models so that re-running the notebook gives the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred). Passing the predictions first
# transposes the confusion matrix and swaps precision with recall.
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

In [None]:
# KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred). Passing the predictions first
# transposes the confusion matrix and swaps precision with recall.
print(confusion_matrix(y_test, knn_pred))
print(classification_report(y_test, knn_pred))

In [None]:
# Random forest
rft = RandomForestClassifier(random_state=42)
rft.fit(X_train, y_train)
rft_pred = rft.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred). Passing the predictions first
# transposes the confusion matrix and swaps precision with recall.
print(confusion_matrix(y_test, rft_pred))
print(classification_report(y_test, rft_pred))

So here we can see that the confusion matrix before balancing is

[[1424   73]

[   0    1]]

and after balancing is 

[[889  38]

 [ 69 903]]


So, after checking some of the algorithms, I have found that Random Forest has the highest accuracy.

Special thanks to Aditi Mulye
https://www.kaggle.com/aditimulye/stroke-prediction-visualization-prediction
    I learnt a lot from your notebook.
    
And also thanks to many more people.
It's fun to learn new things.