<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:blue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">

Heart Stroke  EDA and Prediction
</h1>
</div>

![](https://viewmedica.com/images/thumbslarge/heartfailure_1280.jpg)

## Context
### According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:blue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">

Import the libraries and Load dataset
</h1>
</div>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
        
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.validators.scatter.marker import SymbolValidator
import plotly.offline as pyo
pyo.init_notebook_mode()

import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read the stroke dataset and remove the 'id' column in one expression.
df = pd.read_csv(
    "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
).drop(columns='id')

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:blue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">

Features
</h1>
</div>

* id - Unique id
* gender - Gender, "Male", "Female" or "Other"
* age - Age
* hypertension - Hypertension binary feature,  0 if the patient doesn't have hypertension, 1 if the patient has hypertension
* heart_disease - Heart disease binary feature, 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
* ever_married - Has the patient ever been married?, "No" or "Yes"
* work_type - Work type of the patient, "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
* Residence_type - Residence type of the patient, "Rural" or "Urban"
* avg_glucose_level - Average glucose level in blood
* bmi - Body Mass Index
* smoking_status - Smoking status of the patient, "formerly smoked", "never smoked", "smokes" or "Unknown"*
* stroke - Stroke event, 1 if the patient had a stroke or 0 if not

### Note: "Unknown" in smoking_status means that the information is unavailable for this patient

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:blue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">

Missing data
</h1>
</div>

In [None]:
# missingno bar chart of df, used here to see which columns have missing data.
msno.bar(df)
plt.show()

### Only bmi has missing data, and it affects only about 4% of the rows, so I will drop those rows.

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:blue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">

Visualizing Data
</h1>
</div>

In [None]:
def make_bar(feature, rows, cols):
    """Add stacked 'not stroke' / 'stroke' bars for one categorical feature.

    Reads the notebook-level ``df`` and draws into the notebook-level ``fig``;
    both must exist when this function is called.

    Parameters
    ----------
    feature : str
        Column of ``df`` whose categories become the bars.
    rows, cols : int
        Subplot position forwarded to ``fig.add_trace``.
    """
    # Derive category labels, counts and stroke rates from ONE groupby so they
    # share the same ordering. value_counts() orders by frequency whereas
    # groupby() orders by key, so mixing the two attaches rates to the wrong
    # categories whenever those orders differ.
    stats = (
        df.groupby(feature)['stroke']
        .agg(['size', 'mean'])
        .sort_values('size', ascending=False)  # keep most frequent category first
    )
    categories = stats.index
    counts = stats['size'].values
    stroke_rate = stats['mean'].values
    no_stroke_rate = 1 - stroke_rate

    # Bar heights are absolute row counts; the text labels show the rates.
    fig.add_trace(go.Bar(x=categories, y=counts * no_stroke_rate,
                         text=no_stroke_rate, texttemplate='Not_Stroke : %{text:.2f}',
                         textposition='auto'), rows, cols)
    fig.add_trace(go.Bar(x=categories, y=counts * stroke_rate,
                         text=stroke_rate, texttemplate='Stroke : %{text:.2f}',
                         textposition='auto'), rows, cols)

# 4x2 grid with one panel per categorical feature (the eighth cell stays empty).
panel_titles = ("Gender", "Hypertension",
                "Heart_disease", "Ever_married",
                "Work_type", "Residence_type",
                "Smoking_status")
fig = make_subplots(rows=4, cols=2, shared_yaxes=True,
                    subplot_titles=panel_titles, vertical_spacing=0.1)

# (feature, row, col) for each panel, listed in the same order as the titles.
for feature, row, col in [
    ('gender', 1, 1),
    ('hypertension', 1, 2),
    ('heart_disease', 2, 1),
    ('ever_married', 2, 2),
    ('work_type', 3, 1),
    ('Residence_type', 3, 2),
    ('smoking_status', 4, 1),
]:
    make_bar(feature, row, col)

# All layout settings applied in a single call.
fig.update_layout(
    barmode='stack',
    font_family="Rockwell",
    height=450*4,
    showlegend=False,
    paper_bgcolor=px.colors.qualitative.Pastel2[2],
    bargap=0.2,
)
fig.show()

In [None]:
# Overlaid histograms of each continuous feature, one trace per stroke outcome.
fig = make_subplots(rows=1, cols=3, shared_yaxes=True,
                    subplot_titles=("Age", "avg_glucose_level", "bmi"))

# (column, number of bins) for each panel, left to right.
numeric_panels = [('age', 100), ('avg_glucose_level', 400), ('bmi', 460)]
for col_idx, (column, n_bins) in enumerate(numeric_panels, start=1):
    # Add the stroke == 0 trace first, then stroke == 1, in every panel.
    for outcome in (0, 1):
        fig.add_trace(
            go.Histogram(x=df.loc[df['stroke'] == outcome, column], nbinsx=n_bins),
            1, col_idx,
        )

fig.update_layout(font_family="Rockwell", showlegend=False)
fig.update_layout(barmode="overlay")
fig.update_layout(paper_bgcolor=px.colors.qualitative.Pastel2[2])
fig.show()

In [None]:
# Parallel-categories diagram of df, coloured by the stroke label.
fig = px.parallel_categories(data_frame=df, color='stroke')
fig.show()

In [None]:
# Parallel-coordinates plot of df coloured by the stroke label; the figure is
# shown because it is the last expression of the cell.
px.parallel_coordinates(df,color='stroke')

In [None]:
# One-hot encode the string-valued columns (dropping the first level of each)
# so they can take part in the correlation matrix and the models that follow.
str_list = ['gender','ever_married','work_type','Residence_type','smoking_status']

df = pd.get_dummies(df, columns=str_list, drop_first=True)

# Compute the correlation matrix once and reuse it for the mask and the heatmap.
corr = df.corr()

fig, ax = plt.subplots(1, 1, figsize=(14, 10))
# Hide the upper triangle (including the diagonal): it mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax = sns.heatmap(corr, cmap='coolwarm', annot=True, mask=mask, ax=ax)

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:blue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">

Prediction by DecisionTreeClassifier
</h1>
</div>

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows for testing.
y = df['stroke']
X = df.drop(columns='stroke')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

In [None]:
# Sweep max_depth from 2 to 50, recording test accuracy and the number of
# stroke cases predicted as stroke for each depth.
accur = []
pred_as_1 = []
for depth in range(2, 51):
    # Fixed seed: tree fitting involves randomness, so without it the curves
    # differ from run to run.
    model = DecisionTreeClassifier(max_depth=depth, random_state=100).fit(X_train, y_train)
    pred = model.predict(X_test)
    accur.append(accuracy_score(y_test, pred))
    # labels=[0, 1] keeps the matrix 2x2 even if a class is absent, so [1][1]
    # is always the count of true stroke cases predicted as stroke.
    pred_as_1.append(confusion_matrix(y_test, pred, labels=[0, 1])[1][1])

In [None]:
# Left: test accuracy per depth. Right: stroke cases predicted as stroke per depth.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,4))
x = list(range(2,51))  # the depths tried in the sweep
axes[0].plot(x, accur, 'r')
axes[0].set(title='Test accuracy', xlabel='max_depth', ylabel='accuracy')
axes[1].plot(x, pred_as_1, 'b')
axes[1].set(title='Stroke cases predicted as stroke', xlabel='max_depth', ylabel='count')
plt.show()

In [None]:
# Fit a single tree at depth 16 and show its confusion matrix on the test set.
# random_state is fixed so the matrix (and the report printed from `pred`)
# is the same on every run.
model = DecisionTreeClassifier(max_depth=16, random_state=100).fit(X_train, y_train)
pred = model.predict(X_test)

confusion_matrix(y_test, pred)

In [None]:
# Per-class precision / recall / F1 for the predictions in `pred`.
print(classification_report(y_true=y_test, y_pred=pred))

### I think this is a bad model: its accuracy is high, but it can't predict stroke (1).

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:blue;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">

Try control dataset
</h1>
</div>

In [None]:
# Undersample the non-stroke rows: 500 randomly chosen non-stroke rows plus
# every stroke row. random_state makes the sample, and everything trained on
# it, repeatable across runs.
df_control = pd.concat([
    df[df['stroke'] == 0].sample(500, random_state=100),
    df[df['stroke'] == 1],
])

In [None]:
# Donut charts comparing the stroke share before (df) and after (df_control)
# the undersampling step.
labels = ['Not Stroke','Stroke']
specs = [[{'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(rows=1, cols=2, specs=specs)

for col_idx, frame in enumerate((df, df_control), start=1):
    # value_counts() orders by frequency, not by class value. Reindexing to
    # [0, 1] guarantees the counts line up with `labels` whichever class is
    # larger.
    class_counts = frame['stroke'].value_counts().reindex([0, 1], fill_value=0)
    fig.add_trace(go.Pie(labels=labels, values=list(class_counts), hole=.8), 1, col_idx)

fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(
    title_text="Control the dataset's stroke rate",
    annotations=[dict(text='Previous', x=0.2, y=0.5, font_size=20, showarrow=False),
                dict(text='Now', x=0.8, y=0.5, font_size=20, showarrow=False),
                dict(text='->', x=0.5, y=0.5, font_size=20, showarrow=False)],
    height=500)
fig.update(layout_showlegend=False)
fig.show()

In [None]:
# Rebuild predictors / target from the undersampled frame and re-split 80/20.
y = df_control['stroke']
X = df_control.drop(columns='stroke')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Sweep max_depth from 2 to 50 for a random forest, recording test accuracy
# and the number of stroke cases predicted as stroke for each depth.
accur2 = []
pred_as_2 = []
for depth in range(2, 51):
    # Fixed seed: forests are randomised, so unseeded runs give a different
    # curve (and a different "best" depth) every time.
    model = RandomForestClassifier(max_depth=depth, random_state=100).fit(X_train, y_train)
    pred = model.predict(X_test)
    accur2.append(accuracy_score(y_test, pred))
    # labels=[0, 1] keeps the matrix 2x2 even if a class is absent, so [1][1]
    # is always the count of true stroke cases predicted as stroke.
    pred_as_2.append(confusion_matrix(y_test, pred, labels=[0, 1])[1][1])

In [None]:
# Left: test accuracy per depth. Right: stroke cases predicted as stroke per depth.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,4))
x = list(range(2,51))  # the depths tried in the sweep
axes[0].plot(x, accur2, 'r')
axes[0].set(title='Test accuracy', xlabel='max_depth', ylabel='accuracy')
axes[1].plot(x, pred_as_2, 'b')
axes[1].set(title='Stroke cases predicted as stroke', xlabel='max_depth', ylabel='count')
plt.show()

In [None]:
# accur2[k] is the accuracy obtained with max_depth = k + 2, because the sweep
# iterated over range(2, 51). Shift the winning list index back to a depth;
# using the raw index picks a depth two levels too shallow, and an invalid
# max_depth of 0 when the first entry wins.
best_depth = accur2.index(max(accur2)) + 2
# Fixed seed so the refit is repeatable.
model = RandomForestClassifier(max_depth=best_depth, random_state=100).fit(X_train, y_train)
pred = model.predict(X_test)
confusion_matrix(y_test, pred)

In [None]:
# Per-class precision / recall / F1 for the predictions in `pred`.
print(classification_report(y_true=y_test, y_pred=pred))

### The accuracy is lower than the previous decision tree model's, but I think this model is better than the previous one, because the scores for stroke (1) have improved!