In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the stroke-prediction CSV into a DataFrame and preview its first rows.
csv_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(csv_path)
data.head()

# Check for null values

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Impute missing bmi values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the `data['bmi']` selection: with pandas
# copy-on-write that call only changes a temporary object and leaves `data`
# untouched (pandas 2.x already emits a FutureWarning for it).
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [None]:
# Re-count missing values per column to confirm the bmi imputation left none.
data.isna().sum()

# Check for Outliers

In [None]:
# Summary statistics, transposed so that each feature is shown as one row.
data.describe().T

In [None]:
# Draw one box plot per column on a 2x3 grid to look for outliers.
fig, axs = plt.subplots(2, 3, figsize=(15, 12))
boxplot_columns = ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
# axs.ravel() walks the grid row by row, matching the column order above.
for panel, column in zip(axs.ravel(), boxplot_columns):
    panel.boxplot(data[column])
    panel.set(xlabel=column)

# EDA
## Uni-variate Analysis
## Bi-variate Analysis

In [None]:
# Pairwise relationships between the columns; corner=True draws only the lower triangle of the grid.
sns.pairplot(data,corner=True)

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(15,10))
# numeric_only=True is required here: the categorical columns (gender,
# work_type, ...) are not encoded until later in the notebook, and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping them.
sns.heatmap(data.corr(numeric_only=True),annot=True,cmap='Blues')

In [None]:
# The 'id' column is uninformative compared with the other features, so we drop it.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because 'id' is already gone.
data = data.drop(columns='id', errors='ignore')

# Encoding the categorical features

In [None]:
# Show each column's dtype and non-null count.
data.info()

In [None]:
# Columns treated as categorical; each one is label-encoded in the next cell.
encode=['gender','ever_married','work_type','Residence_type','smoking_status']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes.
# A fresh encoder is fitted for each column.
for column in encode:
    data[column] = LabelEncoder().fit_transform(data[column])

In [None]:
# Preview the first 20 rows to inspect the encoded columns.
data.head(20)

# Feature split

In [None]:
# Target vector: the 'stroke' column. Feature matrix: every other column.
y = data['stroke']
x = data.drop('stroke', axis=1)

# train-test split

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# stratify=y keeps the stroke / no-stroke ratio identical in both splits; with
# a heavily imbalanced target an unstratified split can leave the test set with
# too few positive cases for the metrics below to be meaningful.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

# Scaling data

In [None]:
# Fit the min-max scaler on the training split only, then apply the same
# training-derived scaling to both splits.
s = MinMaxScaler()
x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)

# Check for imbalanced data

In [None]:
# Histogram of the training labels to visualise the class balance.
plt.hist(y_train);

In [None]:
# The training data is highly imbalanced, so we use SMOTE to oversample the minority class and balance it.

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched.
# random_state pins the synthetic samples so that every model below is trained
# on the same data after a kernel restart.
mod=SMOTE(random_state=42)
x_train1,y_train1=mod.fit_resample(x_train,y_train)
# Histogram of the resampled labels to confirm the classes are now balanced.
plt.hist(y_train1);

# Modelling

# Logistic Regression

### We must minimise false negatives: predicting "no stroke" for a patient who actually has a stroke is the most dangerous error.

In [None]:
# L1-regularised logistic regression trained on the SMOTE-balanced data.
# random_state is set because the liblinear solver shuffles the data, so the
# fit would otherwise vary between runs.
lg=LogisticRegression(max_iter=500,penalty='l1',solver='liblinear',C=3.0,random_state=42)
lg.fit(x_train1,y_train1)
# Confusion matrix on the untouched test set: rows are the true class and
# columns the predicted class. fmt='d' shows the counts as integers.
ax=sns.heatmap(confusion_matrix(y_test,lg.predict(x_test)),annot=True,fmt='d',xticklabels=['Not Stroke','Stroke'],yticklabels=['Not Stroke','Stroke'])
ax.set(xlabel='Predicted',ylabel='Actual',title='Logistic Regression: test-set confusion matrix');

In [None]:
# Per-class precision, recall and F1 of logistic regression on the test set.
print(classification_report(y_test,lg.predict(x_test)))

# Decision Tree

In [None]:
# Depth-limited decision tree trained on the SMOTE-balanced data.
# random_state makes the tree reproducible between runs.
d=DecisionTreeClassifier(criterion='gini',max_depth=11,random_state=42)
d.fit(x_train1,y_train1)
# Confusion matrix on the untouched test set: rows are the true class and
# columns the predicted class. fmt='d' shows the counts as integers.
ax=sns.heatmap(confusion_matrix(y_test,d.predict(x_test)),annot=True,fmt='d',xticklabels=['Not Stroke','Stroke'],yticklabels=['Not Stroke','Stroke'])
ax.set(xlabel='Predicted',ylabel='Actual',title='Decision Tree: test-set confusion matrix');

In [None]:
# Per-class precision, recall and F1 of the decision tree on the test set.
print(classification_report(y_test,d.predict(x_test)))

# Random Forest

In [None]:
# Random forest (95 trees, entropy criterion) trained on the SMOTE-balanced data.
# random_state fixes the bootstrap samples and feature draws so the reported
# metrics are reproducible.
r=RandomForestClassifier(criterion='entropy',n_estimators=95,random_state=42)
r.fit(x_train1,y_train1)
# Confusion matrix on the untouched test set: rows are the true class and
# columns the predicted class. fmt='d' shows the counts as integers.
ax=sns.heatmap(confusion_matrix(y_test,r.predict(x_test)),annot=True,fmt='d',xticklabels=['Not Stroke','Stroke'],yticklabels=['Not Stroke','Stroke'])
ax.set(xlabel='Predicted',ylabel='Actual',title='Random Forest: test-set confusion matrix');

In [None]:
# Per-class precision, recall and F1 of the random forest on the test set.
print(classification_report(y_test,r.predict(x_test)))

# KNN

In [None]:
# 1-nearest-neighbour classifier trained on the SMOTE-balanced data.
k=KNeighborsClassifier(n_neighbors= 1)
k.fit(x_train1,y_train1)
# Confusion matrix on the untouched test set: rows are the true class and
# columns the predicted class. fmt='d' shows the counts as integers.
ax=sns.heatmap(confusion_matrix(y_test,k.predict(x_test)),annot=True,fmt='d',xticklabels=['Not Stroke','Stroke'],yticklabels=['Not Stroke','Stroke'])
ax.set(xlabel='Predicted',ylabel='Actual',title='KNN: test-set confusion matrix');

In [None]:
# Per-class precision, recall and F1 of KNN on the test set.
print(classification_report(y_test,k.predict(x_test)))

# Naive Bayes

In [None]:
# Gaussian naive Bayes trained on the SMOTE-balanced data.
n=GaussianNB()
n.fit(x_train1,y_train1)
# Confusion matrix on the untouched test set: rows are the true class and
# columns the predicted class. fmt='d' shows the counts as integers.
ax=sns.heatmap(confusion_matrix(y_test,n.predict(x_test)),annot=True,fmt='d',xticklabels=['Not Stroke','Stroke'],yticklabels=['Not Stroke','Stroke'])
ax.set(xlabel='Predicted',ylabel='Actual',title='Naive Bayes: test-set confusion matrix');

In [None]:
# Per-class precision, recall and F1 of naive Bayes on the test set.
print(classification_report(y_test,n.predict(x_test)))

In [None]:
from sklearn.metrics import roc_curve

# ROC points (false-positive rate, true-positive rate, thresholds) for each
# fitted model, computed on the test set from column 1 of predict_proba.
# Model order: logistic regression, decision tree, random forest, KNN, naive Bayes.
(
    (fpr, tpr, thresh),
    (fpr1, tpr1, thresh1),
    (fpr2, tpr2, thresh2),
    (fpr3, tpr3, thresh3),
    (fpr4, tpr4, thresh4),
) = [
    roc_curve(y_test, model.predict_proba(x_test)[:, 1])
    for model in (lg, d, r, k, n)
]

In [None]:
# Overlay the test-set ROC curves of the five classifiers.
plt.plot(fpr,tpr,color='blue',label='logistic')
plt.plot(fpr1,tpr1,color='green',label='decision tree')
plt.plot(fpr2,tpr2,color='red',label='random forest')
plt.plot(fpr3,tpr3,color='yellow',label='knn')
plt.plot(fpr4,tpr4,color='black',label='naive')
# Diagonal reference: the curve a random classifier would produce.
plt.plot([0,1],[0,1],color='grey',linestyle='--',label='chance')
# Label the figure so it can be read on its own.
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curves on the test set')
plt.legend()
plt.show()

In [None]:
# ROC AUC of logistic regression on the test set.
# The score is computed from the predicted probability of class 1, not from
# hard 0/1 predictions: thresholded labels collapse the ROC curve to a single
# operating point and do not measure ranking quality.
roc_auc_score(y_test,lg.predict_proba(x_test)[:,1])

In [None]:
# ROC AUC of the decision tree on the test set.
# The score is computed from the predicted probability of class 1, not from
# hard 0/1 predictions: thresholded labels collapse the ROC curve to a single
# operating point and do not measure ranking quality.
roc_auc_score(y_test,d.predict_proba(x_test)[:,1])

In [None]:
# ROC AUC of the random forest on the test set.
# The score is computed from the predicted probability of class 1, not from
# hard 0/1 predictions: thresholded labels collapse the ROC curve to a single
# operating point and do not measure ranking quality.
roc_auc_score(y_test,r.predict_proba(x_test)[:,1])

In [None]:
# ROC AUC of KNN on the test set.
# The score is computed from predict_proba (column 1 = class 1) rather than
# hard 0/1 predictions, so it is calculated the same way for every model.
roc_auc_score(y_test,k.predict_proba(x_test)[:,1])

In [None]:
# ROC AUC of naive Bayes on the test set.
# The score is computed from the predicted probability of class 1, not from
# hard 0/1 predictions: thresholded labels collapse the ROC curve to a single
# operating point and do not measure ranking quality.
roc_auc_score(y_test,n.predict_proba(x_test)[:,1])

In [None]:
# Test-set accuracy of logistic regression.
accuracy_score(y_test,lg.predict(x_test))

In [None]:
# Test-set accuracy of the decision tree.
accuracy_score(y_test,d.predict(x_test))

In [None]:
# Test-set accuracy of the random forest.
accuracy_score(y_test,r.predict(x_test))

In [None]:
# Test-set accuracy of KNN.
accuracy_score(y_test,k.predict(x_test))

In [None]:
# Test-set accuracy of naive Bayes.
# This cell closes the accuracy series for the five models; it previously
# repeated the naive Bayes roc_auc_score call, so naive Bayes had no accuracy.
accuracy_score(y_test,n.predict(x_test))

In [None]:
# Logistic Regression gives the best predictions, judged by the highest ROC AUC score.