In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q -U git+https://github.com/mljar/mljar-supervised.git@master

# Introduction

<center><img src="https://storage.googleapis.com/kaggle-datasets-images/1120859/1882037/04da2fb7763e553bdf251d5adf6f88d9/data-original.jpg?t=2021-01-26-19-57-05" width="700px"></center>

A stroke is a medical condition in which poor blood flow to the brain causes cell death. In 2018, 1 in every 6 deaths from cardiovascular disease was due to stroke.

In this kernel, I perform EDA on the stroke prediction dataset using Plotly and seaborn, and then use AutoML to predict stroke.


# Table of contents

* [<font size=4>Importing required libraries </font>](#1)
* [<font size=4>EDA</font>](#2)
    * [How does BMI affect stroke?](#2.1)
    * [How does average glucose level affect stroke?](#2.2)
    * [How does age affect stroke?](#2.3)
    * [Let's check how numerical variables correlate with each other](#2.4)
* [<font size=4>Modeling</font>](#3)

* [<font size=4>Conclusion</font>](#4)

# Importing libraries <a id='1'></a>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import pandas_profiling as pp
from supervised.automl import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# EDA <a id='2'></a>

In [None]:
# Load the stroke dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Column dtypes and non-null counts (used below to spot missing values).
df.info()

<html><font size=4 color='blue'>bmi contains some null values</font></html>

In [None]:
# Summary statistics of the DataFrame's columns.
df.describe()

In [None]:
# Show the DataFrame itself as the cell output.
df

In [None]:
# 4x2 grid of count plots: the overall stroke balance first, then each
# categorical feature split by stroke outcome.
sns.set_style('whitegrid')
fig, ax = plt.subplots(4, 2, figsize=(20, 20))
ax = ax.flatten()

# Panel 0: class balance of the target itself (no hue).
sns.countplot(data=df, x='stroke', ax=ax[0], palette='cool')

# Panels 1-7: one categorical feature per panel, coloured by stroke.
categorical_columns = [
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'gender',
    'Residence_type',
    'smoking_status',
]
for panel, column in zip(ax[1:], categorical_columns):
    sns.countplot(data=df, x=column, ax=panel, hue='stroke', palette='cool')

plt.show()

<html><font size=4 color='blue'>The dataset is imbalanced: it contains very few people who have suffered a stroke or who have other health problems.</font></html>

# How does BMI affect stroke? <a id='2.1'></a>

In [None]:
# Histogram of BMI, coloured by stroke outcome. The column is referenced by
# name (as the avg_glucose_level histogram does) instead of passing the Series.
fig = px.histogram(df, x='bmi', color='stroke', template='plotly_dark')
fig.show()

# How does average glucose level affect stroke? <a id='2.2'></a>

In [None]:
# Histogram of average glucose level, coloured by stroke outcome.
fig = px.histogram(df,x='avg_glucose_level',color='stroke',template='plotly_dark')
fig.show()

# How does age affect stroke? <a id='2.3'></a>

In [None]:
# Histogram of age, coloured by stroke outcome. The column is referenced by
# name (as the avg_glucose_level histogram does) instead of passing the Series.
fig = px.histogram(df, x='age', color='stroke', template='plotly_dark')
fig.show()

# Let's check how numerical variables correlate with each other <a id='2.4'></a>

In [None]:
# One semi-transparent histogram per numeric feature, overlaid on a shared axis
# so their value distributions can be compared at a glance.
trace1 = go.Histogram(
    x=df.bmi,
    opacity=0.75,
    name = "bmi",
    marker=dict(color='rgba(171, 50, 196, 0.6)'))
trace2 = go.Histogram(
    x=df.avg_glucose_level,
    opacity=0.75,
    name = "average glucose level",
    marker=dict(color='rgba(12, 50, 196, 0.6)'))
trace3 = go.Histogram(
    x=df.age,
    opacity=0.75,
    name = "age",
    marker=dict(color='rgba(20, 200, 20, 0.6)'))

data = [trace1, trace2,trace3]
# The chart shows overlaid distributions (not correlations), so the title says so.
layout = go.Layout(barmode='overlay',
                   title='Distribution of numerical variables',
                   xaxis=dict( title='Value'),
                   yaxis=dict( title='Count'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

We can see a direct relationship between age and stroke: older people are more prone to experiencing a stroke. We can't easily see such a relationship for BMI or average glucose level, so let's plot a correlation heatmap.

In [None]:

# Annotated heatmap of pairwise correlations between the numeric columns.
# The frame also holds string columns (gender, work_type, ...); on pandas >= 2.0
# DataFrame.corr() no longer drops them silently and raises instead, so restrict
# to numeric dtypes explicitly before correlating.
plt.figure(figsize=(10,10))
sns.heatmap(df.select_dtypes(include='number').corr(),cmap='Greens',annot=True)
plt.show()

In [None]:
# Pairwise plots of the DataFrame's variables, coloured by stroke outcome.
sns.pairplot(data=df,hue='stroke')
plt.show()

In [None]:
# Build a pandas-profiling report for the whole DataFrame.
pp.ProfileReport(df)

# Modeling <a id = '3'></a>

In [None]:
# Separate the target from the features: `y` holds the stroke label, and `df`
# keeps every remaining column except the `id` identifier.
y = df['stroke']
df = df.drop(columns=['id', 'stroke'])
df.head()

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The stroke label is heavily imbalanced, so stratify on `y` to keep the same
# share of positive cases in the train and test sets.
train_x, test_x, train_y, test_y = train_test_split(
    df, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Configure AutoML in 'Perform' mode with F1 as the evaluation metric, then fit
# it on the training split.
model = AutoML(
    eval_metric='f1',
    mode='Perform',
)
model.fit(train_x, train_y)

In [None]:
# Predict on the held-out test features.
preds = model.predict(test_x)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order.
score = accuracy_score(test_y, preds)
print(score)

# Accuracy alone is misleading on an imbalanced target (predicting "no stroke"
# for everyone already scores high), so also report F1 - the metric the AutoML
# model was asked to optimise.
f1 = f1_score(test_y, preds)
print('F1:', f1)

# Conclusion <a id='4'></a>

<html><font size= 4 color = "blue">This concludes my kernel. Please upvote if you liked it.</font></html>

![](https://media.giphy.com/media/kfvEdnjjcAoea0naii/giphy.gif)