In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found,
# in the same order os.walk yields them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Library imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix, f1_score, accuracy_score

import warnings
# Install a global "ignore" filter so no warnings are shown for the rest of the session.
# NOTE(review): this also hides library deprecation warnings raised by the plotting
# cells further down; consider narrowing the filter once those calls are cleaned up.
warnings.simplefilter("ignore")

## Load the dataset and verify the load

In [None]:
# Location of the stroke CSV relative to the notebook's working directory
stroke_csv_path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
strokedf = pd.read_csv(stroke_csv_path)

# Preview the first rows to confirm the file was parsed
strokedf.head()

In [None]:
# Inspect the structure of the frame: column names, dtypes and non-null counts
strokedf.info()

In [None]:
# Count the missing entries in each column
strokedf.isnull().sum()

### Missing value treatment for 'bmi' attribute

The approach is to substitute each missing bmi value with the mean bmi of the rows that share the same value of the target variable, stroke.

In [None]:
# Mean bmi of each row's stroke group, aligned row-by-row with strokedf
bmi_group_mean = strokedf.groupby('stroke')['bmi'].transform('mean')

# Fill only the missing bmi entries with their group mean
# NOTE(review): the fill value is derived from the target column, and it is computed
# before the train/test split, so target information leaks into a feature - confirm
# this is acceptable for the modelling section.
strokedf['bmi'] = strokedf['bmi'].fillna(bmi_group_mean)

# Confirm that no missing values remain
strokedf.isna().sum()

# Equivalent alternative (note the ['bmi'] selection before transform):
#
# strokedf["bmi"] = strokedf.groupby("stroke")["bmi"].transform(lambda s: s.fillna(s.mean()))

## Exploratory Data Analysis

In [None]:
# Explore the target variable: bar chart of the class counts.
# The column is passed by keyword (x=..., data=...) like every other countplot in this
# notebook; a Series passed positionally is interpreted as `data` by current seaborn
# releases rather than as the variable to count.
sns.countplot(x='stroke', data=strokedf)
plt.show()

# Exact number of rows per target class
strokedf['stroke'].value_counts()

#### exploration  - gender, hypertension and heart_disease attributes

In [None]:
# Column -> heading printed above its per-class value counts
binary_headings = {
    'gender': "Gender by the target variable",
    'hypertension': "hypertension by the target variable",
    'heart_disease': "heart_disease by the target variable",
}

for position, (column, heading) in enumerate(binary_headings.items()):
    if position > 0:
        # blank separator between consecutive summaries
        print("\n")
    print(heading)
    print(strokedf.groupby('stroke')[column].value_counts())


# One count plot per attribute, side by side, split by the target
fig, axes = plt.subplots(1, 3, figsize=(16, 6))
for position, (axis, column) in enumerate(zip(axes, binary_headings)):
    sns.countplot(x=column, hue='stroke', data=strokedf, ax=axis)
    if position > 0:
        # only the left-most panel keeps its y-axis label
        axis.set_ylabel("")
plt.show()

#### exploration Age attribute

In [None]:
# Mean age for each value of the target variable
print(strokedf.groupby('stroke')['age'].mean())

# Box plot of age for each value of the target variable
sns.catplot(x="stroke", y="age", kind="box", data=strokedf);

# Age distribution, one panel per target value.
# sns.distplot is deprecated (its warning is hidden by the global warnings filter);
# histplot with a density-normalised histogram and a KDE overlay is the replacement
# suggested by seaborn. cut=3 lets the KDE extend past the data limits as distplot did.
g = sns.FacetGrid(data=strokedf, col='stroke', height=5)
g.map(sns.histplot, 'age', kde=True, stat='density', kde_kws={'cut': 3})
plt.show()

In [None]:
# Box plot of age per gender, with one box per target value inside each gender
sns.catplot(
    data=strokedf,
    kind="box",
    x="gender",
    y="age",
    hue='stroke',
);

#### exploration - ever_married, work_type, Residence_type, smoking_status attributes

In [None]:
# Attributes summarised in this cell, in display order
categorical_columns = ['ever_married', 'work_type', 'Residence_type', 'smoking_status']

for position, column in enumerate(categorical_columns):
    if position > 0:
        # blank separator between consecutive summaries
        print("\n")
    print(column + " by the target variable")
    print(strokedf.groupby('stroke')[column].value_counts())


# One count plot per attribute, side by side, split by the target
fig, axes = plt.subplots(1, 4, figsize=(20, 6))
for position, (axis, column) in enumerate(zip(axes, categorical_columns)):
    sns.countplot(x=column, hue='stroke', data=strokedf, ax=axis)
    if position > 0:
        # only the left-most panel keeps its y-axis label
        axis.set_ylabel("")
plt.show()

#### exploration avg_glucose_level attribute

In [None]:

# Mean avg_glucose_level for each value of the target variable
print(strokedf.groupby('stroke')['avg_glucose_level'].mean())

# Box plot of avg_glucose_level for each value of the target variable
sns.catplot(x="stroke", y="avg_glucose_level", kind="box", data=strokedf);

# Distribution of avg_glucose_level, one panel per target value.
# sns.distplot is deprecated (its warning is hidden by the global warnings filter);
# histplot with a density-normalised histogram and a KDE overlay is the replacement
# suggested by seaborn. cut=3 lets the KDE extend past the data limits as distplot did.
g = sns.FacetGrid(data=strokedf, col='stroke', height=5)
g.map(sns.histplot, 'avg_glucose_level', kde=True, stat='density', kde_kws={'cut': 3})
plt.show()

#### exploration bmi attribute

In [None]:
# Mean bmi for each value of the target variable
print(strokedf.groupby('stroke')['bmi'].mean())

# Box plot of bmi for each value of the target variable
sns.catplot(x="stroke", y="bmi", kind="box", data=strokedf);

# Distribution of bmi, one panel per target value.
# sns.distplot is deprecated (its warning is hidden by the global warnings filter);
# histplot with a density-normalised histogram and a KDE overlay is the replacement
# suggested by seaborn. cut=3 lets the KDE extend past the data limits as distplot did.
g = sns.FacetGrid(data=strokedf, col='stroke', height=5)
g.map(sns.histplot, 'bmi', kde=True, stat='density', kde_kws={'cut': 3})
plt.show()

## Modelling - Random Forest

### Perform Label Encoder Transformations

In [None]:
# A single encoder instance is re-fitted for every column in turn, so after this cell
# it only remembers the classes of the last column it was fitted on.
label_encoder = LabelEncoder()

# Columns replaced in place by their integer label codes
columns_to_encode = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
for column in columns_to_encode:
    strokedf[column] = label_encoder.fit_transform(strokedf[column])

### Perform Train/Test Split of the data

Since stroke cases make up only a small proportion of the data, we will perform a stratified split so that both sets keep the same class ratio.

In [None]:
# Target: the stroke label
target = strokedf['stroke']

# Predictors: everything except the target.
# NOTE(review): the published stroke dataset also carries an `id` column, presumably a
# row identifier with no predictive meaning - confirm with strokedf.info(). It is
# dropped here so the model cannot memorise identifiers; errors='ignore' keeps the cell
# working if the column is absent.
features = strokedf.drop(columns=['stroke', 'id'], errors='ignore')

# 70/30 split with a fixed seed; stratifying on the target keeps the class ratio the
# same in the train and test sets
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=101,
    stratify=target,
)

### Building the Model

In [None]:
# Training the Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees using the entropy split criterion.
# random_state is fixed (same seed as the train/test split) so that re-running the
# notebook reproduces the same forest and therefore the same metrics.
rfc = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=101)
rfc.fit(features_train, target_train)

### Predictions and Evaluations

In [None]:
# Predict the target for the held-out test features
predictions = rfc.predict(features_test)

# The heading names the entropy criterion, which is what the model was trained with
# (it previously said "Gini Index")
print("Confusion Matrix - Random Forest Using Entropy\n")
print(confusion_matrix(target_test,predictions))
print("\n")
print("Classification Report \n")
print(classification_report(target_test,predictions))
print("\n")
print("Accuracy Score \n")
print(accuracy_score(target_test, predictions))
print("\n")
# F1 with scikit-learn's default settings for f1_score
print("F1 Score \n")
print(f1_score(target_test, predictions))