In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Read the stroke-prediction CSV into `data` and preview its first rows.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Preview the last rows of the frame (tail() default row count).
data.tail()

In [None]:
# Summary statistics for the frame, using describe()'s default column selection.
data.describe()

In [None]:
# Print column names, non-null counts and dtypes.
data.info()

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Replace every missing value in the frame with one fixed constant, then re-count
# the nulls per column to confirm none remain.
#
# fillna() is used instead of `from numpy import NaN` + replace(): the `NaN` alias
# was removed in NumPy 2.0, so that import fails on current environments.
# NOTE(review): 28.89 is presumably the mean of the `bmi` column — confirm, and
# consider computing it from the data and filling only that column.
MISSING_FILL_VALUE = 28.89
data = data.fillna(MISSING_FILL_VALUE)
data.isna().sum()

In [None]:
# (rows, columns) of the frame after the missing-value fill.
data.shape

In [None]:

# Plotting libraries used by the cells below (seaborn/matplotlib and plotly).
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
import plotly.express as px
import plotly.io as pio
# No category or module filter is passed, so this suppresses every warning raised
# from here on — including deprecation and convergence warnings from later cells.
warnings.filterwarnings("ignore")
import plotly.graph_objects as go

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
#
# The string columns have not been label-encoded yet at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the frame is
# restricted to numeric dtypes first (older pandas dropped such columns silently).
cor = data.select_dtypes(include='number').corr()
sns.set_style('darkgrid')
plt.figure(figsize=(15, 6))
sns.heatmap(cor, annot=True)
plt.show()

In [None]:
# Histogram of `age`, with bars colored by the `stroke` column.
ax = px.histogram(
    data,
    x='age',
    color='stroke',
    title='Age distribution',
    template='plotly_dark',
)
ax.show()

In [None]:
# Histogram of `avg_glucose_level`, with bars colored by the `stroke` column.
ax = px.histogram(
    data,
    x='avg_glucose_level',
    color='stroke',
    title='Average glucose level',
    template='plotly_dark',
)
ax.show()

In [None]:
# Donut chart of the share of rows per `stroke` value.
ax = px.pie(
    data,
    names='stroke',
    hole=0.7,
    template='plotly_dark',
)
ax.show()

In [None]:
# Sunburst of work types nested under the stroke outcome.
#
# NOTE: this cell used to start with `!pip install plotly==4.5.2`. plotly is already
# imported by an earlier cell, so installing another version mid-session does not
# change the module in use, and it leaves the environment downgraded for the next
# kernel start. Pin plotly in the environment before launching the notebook instead
# (the `path` argument presumably needs plotly >= 4.5 — confirm in the changelog).
ax = px.sunburst(
    data,
    names='work_type',
    path=['stroke', 'work_type'],
    template='plotly_dark',
    title='Work types based on stroke chances',
)
ax.show()

In [None]:
# Donut chart of the share of rows per `work_type` value.
ax = px.pie(
    data,
    names='work_type',
    hole=0.7,
    title='Work types',
    template='plotly_dark',
)
ax.show()

In [None]:
# Donut chart of the share of rows per `Residence_type` value.
ax = px.pie(
    data,
    names='Residence_type',
    hole=0.7,
    template='plotly_dark',
)
ax.show()

In [None]:
# Parallel-coordinates plot of age, glucose level and bmi, colored by stroke.
# (The mid-cell `data2.head()` was removed: only a cell's last expression is
# displayed, so that call produced no output.)
data2 = data[['age', 'avg_glucose_level', 'bmi', 'stroke']]
axis_labels = {
    'stroke': 'Stroke',
    'avg_glucose_level': 'Average Glucose Level',
    'bmi': 'Bmi',
    'age': 'Age',
}
ax = px.parallel_coordinates(
    data2,
    color='stroke',
    labels=axis_labels,
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=1,
)
ax.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column in place. One encoder object is re-fitted
# per column, so after the loop `le` only holds the mapping of the last column.
le = LabelEncoder()
for column in ("gender", "ever_married", "work_type", "Residence_type", "smoking_status"):
    data[column] = le.fit_transform(data[column])
data.head()

In [None]:
# Feature matrix: every column except the first and the last; target: the last column.
feature_frame = data.iloc[:, 1:-1]
target_column = data.iloc[:, -1]
X = feature_frame.values
y = target_column.values
print('X shape', X.shape)
print('Y shape', y.shape)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Hold out 20% of the rows, then sweep k = 1..49 for k-nearest-neighbours and plot
# test accuracy against k.
target = data["stroke"]
# Drop the first column as well as the target, matching the feature slice used for
# `X` (iloc[:, 1:-1]); the first column is presumably a record identifier — confirm.
train = data.drop(columns=[data.columns[0], "stroke"])
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.2, random_state=4
)
print(y_test.shape)
y_test = y_test.to_frame()  # kept as a one-column frame; later cells use it as-is
print(y_test.shape)

# NOTE(review): k is being compared on the test split itself; a validation split or
# cross-validation would avoid tuning on the data used for the final score.
scores = []
for i in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    scores.append(accuracy_score(y_test, knn.predict(X_test)))

plt.figure(figsize=(15, 6))
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.lineplot(x=np.arange(1, 50), y=scores)
plt.xlabel('n_neighbors')
plt.ylabel('test accuracy')
plt.show()

In [None]:
# Fit a 5-nearest-neighbours classifier on the training split and report test accuracy.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
pred = knn.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
# Confusion matrix and per-class metrics for the k=5 KNN predictions.
cm = confusion_matrix(y_test, pred)
# fmt='d' shows the counts as integers rather than in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')
cr_report = classification_report(y_test, pred)
# print() renders the report as a table; a bare expression shows the repr with "\n".
print(cr_report)

In [None]:
# Logistic-regression baseline on the original (un-resampled) training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lrpred = lr.predict(X_test)
# The accuracy used to be a bare mid-cell expression, which a notebook never displays.
print(accuracy_score(y_test, lrpred))
cm = confusion_matrix(y_test, lrpred)
sns.heatmap(cm, annot=True, fmt='d')
# print() renders the report as a table instead of a repr with escaped newlines.
print(classification_report(y_test, lrpred))

In [None]:
!pip install imblearn
from imblearn.over_sampling import SMOTE
print('Before OverSampling, counts of label 1: {}'.format(sum(y_train==1)))
print('Before OverSampling, counts of label 0: {} \n'.format(sum(y_train==0)))

In [None]:
# Oversample the training split with SMOTE and report the resulting shapes and class counts.
sm = SMOTE(random_state=3)
# to_numpy() replaces Series.ravel(), which is deprecated in pandas 2.2; both hand
# SMOTE the labels as a 1-D array.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

print('After OverSampling, the shape of train_x: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {}'.format(y_train_res.shape))

print('After OverSampling, counts of label 1: {}'.format((y_train_res == 1).sum()))
print('After OverSampling, counts of label 0: {}'.format((y_train_res == 0).sum()))

In [None]:
# 8-nearest-neighbours classifier trained on the SMOTE-resampled split; test accuracy.
knn2 = KNeighborsClassifier(n_neighbors=8).fit(X_train_res, y_train_res)
knnpred2 = knn2.predict(X_test)
smknn_score = accuracy_score(y_test, knnpred2)
smknn_score

In [None]:
# Logistic regression trained on the SMOTE-resampled split; test accuracy.
lr2 = LogisticRegression().fit(X_train_res, y_train_res)
lr2pred = lr2.predict(X_test)
smlr_score = accuracy_score(y_test, lr2pred)
smlr_score

In [None]:
# Random forest on the original training split; test accuracy.
# Seeded (same value as the train/test split) so the score is reproducible on re-run.
rf = RandomForestClassifier(random_state=4)
rf.fit(X_train, y_train)
rfpred = rf.predict(X_test)
accuracy_score(y_test, rfpred)

In [None]:
# Random forest on the SMOTE-resampled training split; test accuracy.
# Seeded (same value as the train/test split) so the score is reproducible on re-run.
rf2 = RandomForestClassifier(random_state=4)
rf2.fit(X_train_res, y_train_res)
rf2pred = rf2.predict(X_test)
accuracy_score(y_test, rf2pred)

In [None]:
# Decision tree on the original training split; test accuracy.
# Seeded (same value as the train/test split) so the score is reproducible on re-run.
dc = DecisionTreeClassifier(random_state=4)
dc.fit(X_train, y_train)
preddc = dc.predict(X_test)
accuracy_score(y_test, preddc)

In [None]:
# Decision tree on the SMOTE-resampled training split; test accuracy.
# NOTE: this rebinds `dc`, so the tree fitted on the original split is no longer
# reachable after this cell runs.
# Seeded (same value as the train/test split) so the score is reproducible on re-run.
dc = DecisionTreeClassifier(random_state=4)
dc.fit(X_train_res, y_train_res)
preddc2 = dc.predict(X_test)
accuracy_score(y_test, preddc2)