In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading Data

In [None]:
# Location of the stroke-prediction CSV under the Kaggle input directory.
stroke_csv_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(stroke_csv_path)

## Basic info

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the frame's columns.
data.describe()

## Missing Data

In [None]:
# Percentage of missing values per column; only columns with at least one gap are printed.
row_count = data.shape[0]
missing_df = data.isnull().sum() * 100 / row_count
print(missing_df.loc[missing_df > 0])

# Univariate Analysis

* Visualize Distribution of features

## Target Variable - stroke

In [None]:
# Class balance of the target: bar chart of counts plus the share of each class.
total_rows = len(data)
no_stroke_rows = len(data.loc[data['stroke'] == 0])
stroke_rows = len(data.loc[data['stroke'] == 1])

sns.countplot(data=data, x='stroke');
print("% of People not having suffered a stroke:{:.2f}".format(no_stroke_rows * 100 / total_rows))
print("% of People having suffered a stroke:{:.2f}".format(stroke_rows * 100 / total_rows))

So we have a heavily skewed dataset with over 95% of observations for people who have never suffered a stroke.

## gender

In [None]:
# Distribution of the `gender` column: bar chart of counts plus the share of each category.
sns.countplot(data=data, x='gender');

# Each printed label must report the category it names: the "Females" line filters on
# "Female" and the "Males" line on "Male" (pairing them the other way round silently
# swaps the two percentages).
total_rows = data.shape[0]
print("% of Females: {:.2f}".format((data[data['gender']=="Female"].shape[0]*100)/total_rows))
print("% of Males: {:.2f}".format((data[data['gender']=="Male"].shape[0]*100)/total_rows))
print("% of Other Genders: {:.2f}".format((data[data['gender']=="Other"].shape[0]*100)/total_rows))

## age

In [None]:
# Histogram of age (50 bins) followed by its mean and median.
age_column = data['age']
plt.hist(age_column.values, bins=50);
print("Mean Age: {}".format(age_column.mean()))
print("Median Age: {}".format(age_column.median()))

## hypertension

In [None]:
# Bar chart of the hypertension flag plus the share of people without / with it.
total_rows = len(data)
without_hypertension = len(data.loc[data['hypertension'] == 0])
with_hypertension = len(data.loc[data['hypertension'] == 1])

sns.countplot(data=data, x='hypertension');
print("% of People without HyperTension: {:.2f}".format(without_hypertension * 100 / total_rows))
print("% of People with HyperTension: {:.2f}".format(with_hypertension * 100 / total_rows))


## heart_disease

In [None]:
# Bar chart of the heart_disease flag plus the share of people without / with it.
total_rows = len(data)
without_heart_disease = len(data.loc[data['heart_disease'] == 0])
with_heart_disease = len(data.loc[data['heart_disease'] == 1])

sns.countplot(data=data, x='heart_disease');
print("% of People without Heart disease: {:.2f}".format(without_heart_disease * 100 / total_rows))
print("% of People with Heart disease: {:.2f}".format(with_heart_disease * 100 / total_rows))

## ever_married

In [None]:
# Bar chart of ever_married plus the share of "Yes" and "No" answers.
total_rows = len(data)
married_rows = len(data.loc[data['ever_married'] == "Yes"])
unmarried_rows = len(data.loc[data['ever_married'] == "No"])

sns.countplot(data=data, x='ever_married');
print("% of Married/Previously Married People : {:.2f}".format(married_rows * 100 / total_rows))
print("% of Un-Married People : {:.2f}".format(unmarried_rows * 100 / total_rows))

## work_type

In [None]:
# Bar chart of the number of rows per work_type category
# (the trailing semicolon hides the returned Axes repr in the cell output).
sns.countplot(data=data, x='work_type');

## Residence_type

In [None]:
# Bar chart of Residence_type plus the share of "Urban" and "Rural" rows.
total_rows = len(data)
urban_rows = len(data.loc[data['Residence_type'] == "Urban"])
rural_rows = len(data.loc[data['Residence_type'] == "Rural"])

sns.countplot(data=data, x='Residence_type');
print("% of Urban Residency : {:.2f}".format(urban_rows * 100 / total_rows))
print("% of Rural Residency : {:.2f}".format(rural_rows * 100 / total_rows))

## avg_glucose_level

In [None]:
# Distribution plot of avg_glucose_level; the raw value array is passed, so the
# x-axis carries no column label.
sns.displot(data['avg_glucose_level'].values);

## bmi

In [None]:
# Distribution plot of bmi; the raw value array is passed, so the x-axis carries
# no column label.
sns.displot(data['bmi'].values);

## smoking_status

In [None]:
# Bar chart of the number of rows per smoking_status category.
sns.countplot(data=data, x='smoking_status');

# Bi/Multivariate Analysis

* Attempt to discover any correlation/causal relationship among features and target variable.
* We can look for answers to many interesting questions like 
    1. How do attributes like gender/residency etc. affect the probability of suffering a stroke?
    2. Is there some specific combination of attributes which can be greatly attributed to a person suffering from a stroke?
    

In [None]:
# Stroke counts split by gender, followed by the stroke rate within each gender group.
sns.countplot(data=data, x='gender', hue='stroke');
plt.title("Gender vs Stroke");

# Boolean row masks reused by the three rate computations below.
had_stroke = data['stroke'] == 1
is_female = data['gender'] == 'Female'
is_male = data['gender'] == 'Male'
is_other = data['gender'] == 'Other'

print("% of Females who suffered a stroke: {:.3f}".format(
    len(data[is_female & had_stroke]) * 100 / len(data[is_female])))
print("% of Males who suffered a stroke: {:.3f}".format(
    len(data[is_male & had_stroke]) * 100 / len(data[is_male])))
print("% of Other gender who suffered a stroke: {:.3f}".format(
    len(data[is_other & had_stroke]) * 100 / len(data[is_other])))

In [None]:
# First figure: stroke counts for the three yes/no style features, one panel each.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for panel, feature in zip(ax, ('hypertension', 'heart_disease', 'ever_married')):
    sns.countplot(data=data, x=feature, hue='stroke', ax=panel)

# Second figure: stroke counts by work type and by residence type.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for panel, feature in zip(ax, ('work_type', 'Residence_type')):
    sns.countplot(data=data, x=feature, hue='stroke', ax=panel)


In [None]:
# Three stacked KDE panels of age, each coloured by a different binary column.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10, 17))

age_panels = (
    ('stroke', "Age distribution w.r.t Stroke"),
    ('hypertension', "Age distribution w.r.t Hypertension"),
    ('heart_disease', "Age distribution w.r.t Heart Disease"),
)
for panel, (hue_column, panel_title) in zip(ax, age_panels):
    sns.kdeplot(data=data, x='age', hue=hue_column, ax=panel)
    panel.set_title(panel_title)

# Figure-level plot (draws its own figure): age KDE by stroke, one column per heart_disease value.
sns.displot(data=data, x='age', hue='stroke', kind='kde', col="heart_disease");


We can see from the above plots that the risk of suffering a stroke increases for people over the age of 40.

Also the Age distribution w.r.t Stroke is very similar to the Age distribution w.r.t Heart Disease.

In [None]:
# Counts of hypertension and heart_disease values, broken down by work type.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for panel, condition in zip(ax, ('hypertension', 'heart_disease')):
    sns.countplot(data=data, x=condition, hue='work_type', ax=panel)

From the above visualizations we can see that a significant number of people having heart disease or suffering from hypertension work in the private sector.

Exploring feature - "bmi"

In [None]:
# KDE of bmi coloured by several columns: two two-panel figures, then one single-panel figure.
bmi_hue_pairs = (
    ('stroke', 'smoking_status'),
    ('work_type', 'heart_disease'),
)
for hue_pair in bmi_hue_pairs:
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    for panel, hue_column in zip(ax, hue_pair):
        sns.kdeplot(data=data, x='bmi', hue=hue_column, ax=panel)

# With a 1x1 grid, plt.subplots returns a single Axes rather than an array.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 6))
sns.kdeplot(data=data, x='bmi', hue='hypertension', ax=ax);

Exploring feature - "avg_glucose_level"

In [None]:
# KDE of avg_glucose_level coloured by several columns, two panels per figure.
glucose_hue_pairs = (
    ('stroke', 'smoking_status'),
    ('work_type', 'heart_disease'),
)
for hue_pair in glucose_hue_pairs:
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    for panel, hue_column in zip(ax, hue_pair):
        sns.kdeplot(data=data, x='avg_glucose_level', hue=hue_column, ax=panel)

# Last figure: KDE by hypertension on the left, age vs glucose scatter by stroke on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
kde_panel, scatter_panel = ax
sns.kdeplot(data=data, x='avg_glucose_level', hue='hypertension', ax=kde_panel);
sns.scatterplot(data=data, y='avg_glucose_level', x='age', hue='stroke', ax=scatter_panel);

Correlation plot

In [None]:
# Pairwise correlation of the numeric features, drawn as a lower-triangle heatmap.
#
# The first column is skipped by position (presumably a row identifier — confirm
# against data.head()). Non-numeric columns are excluded explicitly: the frame holds
# string columns (gender, ever_married, ...), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently dropping them as older versions did.
corr = data[list(data.columns)[1:]].select_dtypes(include="number").corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so it is redundant.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio. Passing ax=ax ties the
# plot to the Axes titled below instead of relying on pyplot's "current axes" state.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax);

ax.set_title("Correlation Heatmap");

In [None]:

# NOTE(review): scratch cell. The triple-quoted string below is a disabled copy of the
# correlation-heatmap code, and every remaining line is a commented-out experiment.
# Nothing here plots or prints; the string literal is the only statement in the cell.
# Candidate for deletion once the experiments are no longer needed.
'''
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
'''
#sns.kdeplot(data=data)
#print(data.columns)
#fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
#sns.kdeplot(data=data, x='bmi', hue='stroke', ax=ax[0]);
#sns.kdeplot(data=data, x='avg_glucose_level', hue='stroke', ax=ax[1]);
#fig2, ax2 = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
#sns.countplot(data=data, x='smoking_status', hue='stroke', ax=ax2[0]);
#sns.kdeplot(data=data, hue='smoking_status', x='bmi', ax=ax2[1]);

#sns.countplot(data=data, x='heart_disease', hue='work_type', ax=ax[1]);
#sns.countplot(data=data, x='ever_married', hue='stroke', ax=ax[2]);

#sns.kdeplot(data=data, x='bmi', hue='work_type');
#sns.countplot(data=data, x='hypertension', hue='Residence_type');
#sns.displot(data=data, x='age', hue='stroke', kind='kde', col="heart_disease");
#sns.displot(data=data, x='age', hue='heart_disease', kind='kde');
#sns.displot(data=data, x='age', hue='heart_disease', kind='kde');
#plt.title("heart_disease vs gender");
#print("% of People with hypertension who suffered a stroke: {:.3f}".format(((data[(data['hypertension']==1)&(data['stroke']==1)]).shape[0])*100/data[data['hypertension']==1].shape[0]))
#print("% of People with hypertension who suffered a stroke: {:.3f}".format(((data[(data['hypertension']==1)&(data['stroke']==0)]).shape[0])*100/data[data['hypertension']==0].shape[0]))
#print("% of Males who suffered a stroke: {:.3f}".format(((data[(data['hypertension']=='Male')&(data['stroke']==1)]).shape[0])*100/data[data['gender']=='Male'].shape[0]))
#print("% of Other gender who suffered a stroke: {:.3f}".format(((data[(data['hypertension']=='Other')&(data['stroke']==1)]).shape[0])*100/data[data['gender']=='Other'].shape[0]))