# Overview
#### According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
#### This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.

#### Attribute Information
1) id: unique identifier

2) gender: "Male", "Female" or "Other"

3) age: age of the patient

4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

6) ever_married: "No" or "Yes"

7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

8) Residence_type: "Rural" or "Urban"

9) avg_glucose_level: average glucose level in blood

10) bmi: body mass index

11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown" ("Unknown" means the information is unavailable for this patient)

12) stroke: 1 if the patient had a stroke or 0 if not


# 1. Import libraries and read files

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Load the stroke dataset from the Kaggle input directory and preview it.
DATA_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

# 2. Basic insights of data

In [None]:
# (rows, columns) of the raw dataset.
data.shape

In [None]:
# Total number of cells (rows * columns).
data.size

In [None]:
# Number of axes of the frame.
data.ndim

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

# 3. Missing Values

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Replace missing BMI readings with the mean of the observed BMI values.
bmi_mean = data['bmi'].mean()
data['bmi'] = data['bmi'].fillna(bmi_mean)

In [None]:
# Re-count missing values per column after filling bmi.
data.isnull().sum()

## Label Encoding the Data

In [None]:
# Peek at the first two rows before encoding.
data.head(2)

In [None]:
# Names of the string-valued columns that are label-encoded below.
cat_cols = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

In [None]:
from sklearn.preprocessing import LabelEncoder


def func_labelencoder(list1, features):
    """Label-encode one column of ``features`` in place.

    Parameters
    ----------
    list1 : str
        Name of the single column to encode (despite the name, not a list).
    features : pandas.DataFrame
        Frame holding the column; the column is overwritten with integer codes.

    Returns
    -------
    LabelEncoder
        The fitted encoder, so the category <-> integer mapping can be
        inspected via ``classes_`` or reversed via ``inverse_transform``.
    """
    encode = LabelEncoder()
    # Values are cast to str first so every entry is fitted as a string label.
    features[list1] = encode.fit_transform(features[list1].astype(str))
    return encode


# Keep each fitted encoder: without them the integer codes used later
# (e.g. when hand-building a row to predict on) cannot be mapped back.
encoders = {}
for col in cat_cols:
    encoders[col] = func_labelencoder(col, data)

In [None]:
# Preview the frame after label encoding.
data.head()

In [None]:
# Confirm the column dtypes after label encoding.
data.info()

In [None]:
# Remove the 'id' column from the frame in place.
data.drop(columns='id', inplace=True)

In [None]:
# Shape after dropping 'id'.
data.shape

## Checking For Outliers

In [None]:
# Remaining column names.
data.columns

In [None]:
# Draw one box per column on a shared axis to eyeball outliers.
fig, axis = plt.subplots(1, figsize=(16, 8))
plt.boxplot(data)
# Take tick positions and labels from the frame itself so they cannot drift
# out of sync with its columns (boxes are placed at positions 1..n).
plt.xticks(np.arange(1, data.shape[1] + 1), list(data.columns))
plt.show()

# 4. Data Visualization

In [None]:
# One row per feature: box plot on the left, density estimate on the right.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
panels = [
    ('avg_glucose_level',
     'avg_glucose_level Box Plot Before',
     'avg_glucose_level Distribution Plot Before'),
    ('bmi',
     'bmi Box Plot Before',
     'bmi Distribution Plot Before'),
]
for row, (column, box_title, kde_title) in enumerate(panels):
    box_ax = sns.boxplot(data[column], ax=axes[row, 0])
    box_ax.set_title(box_title, fontsize=18)
    kde_ax = sns.kdeplot(data[column], ax=axes[row, 1])
    kde_ax.set_title(kde_title, fontsize=18)
plt.show()

In [None]:
# Pairwise relationship plots across the columns of the frame.
sns.pairplot(data)

In [None]:
# Heatmap of the pairwise column correlations.
sns.heatmap(data.corr())

# 5. Modeling


In [None]:
# Preview the first rows of the prepared frame before modelling.
data.head()

In [None]:
# Preview the last rows of the prepared frame before modelling.
data.tail()

## Splitting into X, y

In [None]:
# Target is the 'stroke' column; every other column is a predictor.
y = data['stroke']
X = data.drop(columns='stroke')

In [None]:
# Preview the predictor frame.
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the train and test splits.
Xtrain.shape,ytrain.shape,Xtest.shape,ytest.shape


## Standardizing Features



In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to both splits.
std = StandardScaler()
std.fit(Xtrain)

Xtrain_std = std.transform(Xtrain)
Xtest_std = std.transform(Xtest)

## 1. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the fitted model and its reported accuracy are repeatable,
# matching the seeded train/test split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(Xtrain_std, ytrain)

# Relative importance of each feature, in the column order of X.
dt.feature_importances_

In [None]:
from sklearn.metrics import accuracy_score

# Score the decision tree on the held-out split.
Y_pred = dt.predict(Xtest_std)
ac_dt = accuracy_score(ytest, Y_pred)
ac_dt

## 2. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported accuracy is repeatable, matching the
# seeded train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(Xtrain_std, ytrain)

# Score the forest on the held-out split.
Y_pred = rf.predict(Xtest_std)
ac_rf = accuracy_score(ytest, Y_pred)
ac_rf

## 3. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the standardized training features.
lr = LogisticRegression().fit(Xtrain_std, ytrain)

# Score it on the held-out split.
Y_pred_lr = lr.predict(Xtest_std)
ac_lr = accuracy_score(ytest, Y_pred_lr)
ac_lr

In [None]:
# One hand-entered patient, with values given in the column order of X.
sample = pd.DataFrame(
    [[1, 67.0, 0, 1, 1, 2, 1, 228.69, 36.600000, 1]],
    columns=X.columns,
)
# The model was fitted on standardized features, so the raw row must go
# through the same fitted scaler before prediction.
lr.predict(std.transform(sample))

In [None]:
import pickle
import os

# Persist both fitted objects needed to score new data. Previously only the
# scaler was saved, under the model's name ('lr.pkl'), and the logistic
# regression itself was never written out.
# NOTE(review): 'lr.pkl' now holds the model rather than the scaler -- confirm
# no consumer loads 'lr.pkl' expecting a StandardScaler.
scaler_path = os.path.join('./', 'scaler.pkl')
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(std, scaler_file)

model_path = os.path.join('./', 'lr.pkl')
with open(model_path, 'wb') as model_file:
    pickle.dump(lr, model_file)

## 4. K-nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit k-nearest neighbours on the standardized training features.
knn = KNeighborsClassifier()
knn.fit(Xtrain_std, ytrain)

# Score it on the held-out split.
Y_pred = knn.predict(Xtest_std)
ac_knn = accuracy_score(ytest, Y_pred)
ac_knn

In [None]:
# Collect each model's test accuracy into one comparison table.
model_scores = [
    ('Decision Tree', ac_dt),
    ('Random Forest Classifier', ac_rf),
    ('Logistic Regression', ac_lr),
    ('K-nearest Neighbours', ac_knn),
]
dic = {
    'Models': [label for label, _score in model_scores],
    'Accuracy': [score for _label, score in model_scores],
}
acc = pd.DataFrame(dic)
acc

### Thanks for your time. If you enjoyed reading this notebook, an upvote would be appreciated :)