In [None]:
### import necessary libraries 

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import imblearn

import warnings                    
warnings.filterwarnings('ignore')   ### this will ignore the warnings and we can look at much greener notebook 
                                    ### but I would recommend you to implement it at last because sometime warnings are good

# Exploratory Data Analysis

In [None]:
df = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
## df.bmi.describe()

In [None]:
# min_threshold = df['bmi'].quantile(0.001)
# max_threshold = df['bmi'].quantile(0.999)
# df[df['bmi']>max_threshold]

In [None]:
### Remove every row that contains at least one missing value, then re-check the size
df.dropna(axis=0, how='any', inplace=True)
df.shape

In [None]:
### Show the distinct values of each categorical column
categorical_labels = [
    ("Gender : ", 'gender'),
    ("ever_married : ", 'ever_married'),
    ("Work Type : ", 'work_type'),
    ("Residence : ", 'Residence_type'),
    ("Smoking : ", 'smoking_status'),
]

for label, column in categorical_labels:
    print(label, df[column].unique())

In [None]:
df.describe()

In [None]:
sns.distplot(df['age'], bins=10)  #rwidth=0.8
plt.show

In [None]:
df.gender.value_counts()

In [None]:
sns.barplot(df['gender'], df['stroke'])
plt.show()

In [None]:
plt.subplot(1, 2, 1)

sns.countplot(x='gender', hue='stroke', data=df)
sns.despine()

plt.subplot(1, 2, 2)
sns.countplot(x='stroke', hue='gender', data=df)
sns.despine()

plt.show()

In [None]:
df.ever_married.value_counts()

In [None]:
sns.barplot(df['ever_married'], df['stroke'])
plt.show()

In [None]:
plt.subplot(1, 2, 1)

sns.countplot(x='ever_married', hue='stroke', data=df)
sns.despine()

plt.subplot(1, 2, 2)
sns.countplot(x='stroke', hue='ever_married', data=df)
sns.despine()

plt.show()

In [None]:
df.work_type.value_counts()

In [None]:
sns.barplot(df['work_type'], df['stroke'])
plt.show()

In [None]:
plt.subplot(1, 2, 1)

sns.countplot(x='work_type', hue='stroke', data=df)
plt.xticks(rotation=30)
sns.despine()

plt.subplot(1, 2, 2)
sns.countplot(x='stroke', hue='work_type', data=df)
sns.despine()

plt.show()

In [None]:
df.Residence_type.value_counts()

In [None]:
sns.barplot(df['Residence_type'], df['stroke'])
plt.show()

In [None]:
plt.subplot(1, 2, 1)

sns.countplot(x='Residence_type', hue='stroke', data=df)
sns.despine()

plt.subplot(1, 2, 2)
sns.countplot(x='stroke', hue='Residence_type', data=df)
sns.despine()

plt.show()

In [None]:
df.smoking_status.value_counts()

In [None]:
sns.barplot(df['smoking_status'], df['stroke'])
plt.show()

In [None]:
plt.subplot(1, 2, 1)

sns.countplot(x='smoking_status', hue='stroke', data=df)
plt.xticks(rotation=30)
sns.despine()

plt.subplot(1, 2, 2)
sns.countplot(x='stroke', hue='smoking_status', data=df)
sns.despine()

plt.show()

In [None]:
df.stroke.value_counts()

In [None]:
sns.countplot(df['stroke'], data=df)
plt.show()

## It's clear that the data is imbalanced, so we'll handle it in the Feature Engineering section

# Feature Engineering

### Handle Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()  ### one encoder instance; it is refit on every column it is applied to in the cells below

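### LabelEncoder maps each category of a feature to an integer code (assigned in sorted order of the labels); the codes are identifiers and do not express a meaningful ranking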

In [None]:
dummies = pd.get_dummies(df['gender'])   ### one indicator column per distinct value of 'gender'

### get_dummies converts the attribute into numeric indicator columns (one column per category)

In [None]:
### Replace the categories of each listed column with integer codes;
### the encoder is refit for every column, so the codes are per-column.
for column in ('ever_married', 'work_type', 'Residence_type', 'smoking_status'):
    df[column] = le.fit_transform(df[column])

In [None]:
data = pd.concat([df, dummies], axis=1)
data.head()

In [None]:
dataset = data.drop(['id', 'gender', 'Other'], axis=1)

In [None]:
dataset.head()

In [None]:
dataset.shape

### Handling Imbalanced Data

In [None]:
## Splitting data into independent and dependent variables

X = dataset.drop(['stroke'], axis=1)
Y = dataset.stroke

In [None]:
X.head()

In [None]:
Y.head()

In [None]:
print(X.shape)
print(Y.shape)

In [None]:
Y.value_counts()   ### here we can clearly see that the data is highly imbalanced 
                   ### we have to make it balanced so that our model is not biased to any output

In [None]:
from imblearn.combine import SMOTETomek   ## this library will help us to "over sample" the data 

In [None]:
stk = SMOTETomek(random_state=42)

X_res,y_res = stk.fit_resample(X,Y)

In [None]:
print(X_res.shape)

print(y_res.shape)

In [None]:
from collections import Counter

print('Original dataset shape {}'.format(Counter(Y)))

print('Resampled dataset shape {}'.format(Counter(y_res)))

# Model Creation

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3)

In [None]:
### First rows of the training features
X_train.head()

In [None]:
y_train.value_counts()  ### class counts of the training target; per the author the classes are balanced at this point

In [None]:
### Look at the pairwise correlations between the independent features.
### Features that are highly correlated with each other carry redundant information
### and could be dropped to keep the number of dimensions down.

# correlation matrix of the training features
corr = X_train.corr()

# annotated heat map of that matrix on a 15 x 10 inch figure
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

In [None]:
### None of the independent features are highly correlated with each other

In [None]:
### Cross Validation 

from sklearn.model_selection import RandomizedSearchCV

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
### Base estimator for the hyper-parameter search; the fixed seed makes the forests repeatable between runs
rfc_model = RandomForestClassifier(random_state=42)

In [None]:
### Candidate values for the randomized hyper-parameter search

# number of trees: 100, 200, ..., 1200
n_estimators = list(range(100, 1201, 100))

# number of features considered at each split.
# 'auto' is not used: it was removed for forests in scikit-learn 1.3 and, for classifiers,
# meant exactly the same as 'sqrt'.
max_features = ['sqrt', 'log2']

# maximum tree depth: 5, 10, ..., 30
max_depth = list(range(5, 31, 5))

# minimum number of samples required to split an internal node
min_samples_split = [2, 5, 10, 15, 100]

# minimum number of samples required at a leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
### Search space handed to RandomizedSearchCV: parameter name -> list of candidate values
rfc_random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
### Randomized search: 10 sampled parameter combinations, each scored with 5-fold cross validation.
### random_state fixes which combinations are sampled so the search is repeatable.
rfc_random = RandomizedSearchCV(
    estimator = rfc_model,
    param_distributions = rfc_random_grid,
    n_iter = 10,
    cv = 5,
    return_train_score=False,
    random_state=42,
)

In [None]:
### Run the randomized search on the training data
rfc_random.fit(X_train, y_train)

In [None]:
### Predict the held-out test features with the fitted search object
rfc_prediction = rfc_random.predict(X_test)

In [None]:
 from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
### Report accuracy, the per-class report and the confusion matrix for the test predictions
divider = '*' * 50

print("Accuracy Score", accuracy_score(y_test, rfc_prediction), sep="\n")
print(divider)

print("Classification Report\n", classification_report(y_test, rfc_prediction), sep="\n")
print(divider)

print("Confusion Matrix\n", confusion_matrix(y_test, rfc_prediction), sep="\n")

# **In conclusion, the model achieved an accuracy of 95%.**

### I have tried to implement the basic life cycle of a Data Science Project.

### Feel free to give any comments about my notebook!

### Also, if my notebook was helpful, please give me an upvote !!!!!