In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,plot_confusion_matrix,classification_report,recall_score,precision_score,f1_score

In [None]:
# Read the stroke dataset from the Kaggle input directory and preview the first rows.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# List the distinct values of each categorical column. unique() also returns
# NaN when the column has missing entries, so this doubles as a null check.
df['gender'].unique()

In [None]:
df['ever_married'].unique()

In [None]:
df['work_type'].unique()

In [None]:
df['Residence_type'].unique()

In [None]:
# The number of distinct values decides the encoding used later: columns with
# more than two categories are one-hot encoded with get_dummies, two-valued
# columns are mapped to 0/1.
df['smoking_status'].unique()

In [None]:
# Number of null values per column.
df.isnull().sum()

In [None]:
# Percentage of rows whose bmi is missing (approximately 4% according to the original run).
bmi_null_count = df['bmi'].isnull().sum()
bmi_null_count / len(df) * 100

In [None]:
# Missing-value map: every null cell shows up as a light stripe in its column.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cbar=False)

In [None]:
# Correlate the numeric/boolean columns only. At this point the frame still
# holds string columns (gender, work_type, ...); DataFrame.corr() raises on
# those in pandas >= 2.0 instead of silently dropping them, so select the
# usable columns explicitly (works on old and new pandas alike).
cor_mat=df.select_dtypes(include=['number','bool']).corr()

In [None]:
# Lower-triangle heatmap of the pairwise correlations, colour scale centred on 0.
# The mask is an explicit boolean upper triangle (diagonal included): using
# np.triu(cor_mat) directly relies on the truthiness of the coefficients, so a
# coefficient of exactly 0 in the upper triangle would not be hidden.
upper_triangle = np.triu(np.ones_like(cor_mat, dtype=bool))
sns.heatmap(cor_mat, mask=upper_triangle, linewidths=0.5, square=True, center=0)

In [None]:
# Compare the bmi distribution across candidate grouping columns to pick the
# one used for imputing missing bmi values. First candidate: smoking_status.
sns.catplot(data=df, x='smoking_status', y='bmi', kind='box')

In [None]:
# bmi distribution per work_type.
sns.catplot(data=df, x='work_type', y='bmi', kind='box')

In [None]:
# bmi distribution per gender.
sns.catplot(data=df, x='gender', y='bmi', kind='box')

In [None]:
# bmi distribution per ever_married. Author's reading of the four plots:
# work_type shows comparatively less variability, so it is used for imputation.
sns.catplot(data=df, x='ever_married', y='bmi', kind='box')

In [None]:
def bmi_filler(col, frame=None):
    """Return the BMI for one row, imputing it when it is missing.

    Parameters
    ----------
    col : sequence of (work_type, bmi)
        One row holding the work type first and the BMI second, e.g. a row of
        ``df[['work_type', 'bmi']]`` as passed by ``DataFrame.apply(axis=1)``.
    frame : pandas.DataFrame, optional
        Frame with 'work_type' and 'bmi' columns from which the group median
        is computed. Defaults to the notebook-level ``df``.

    Returns
    -------
    The row's own BMI when it is present; otherwise the median BMI of all rows
    in ``frame`` with the same work type (NaN if that group has no BMI values).
    """
    # Unpack by position rather than col[0] / col[1]: integer keys on a
    # label-indexed Series are deprecated in pandas 2.x.
    work_type, t_bmi = col
    if not pd.isnull(t_bmi):
        return t_bmi
    source = df if frame is None else frame
    # median() skips NaN, so rows with a missing BMI do not affect the fill value.
    return source.loc[source['work_type'] == work_type, 'bmi'].median()

In [None]:
# Impute missing bmi with the median bmi of the row's work_type. A single
# grouped, vectorised pass yields the same values as applying bmi_filler to
# every row, without re-filtering the frame for each missing entry, and it is
# safe to re-run.
work_type_median_bmi = df.groupby('work_type')['bmi'].transform('median')
df['bmi'] = df['bmi'].fillna(work_type_median_bmi)

In [None]:
# Re-draw the missing-value map: with no nulls left it should be uniformly dark.
remaining_nulls = df.isnull()
sns.heatmap(remaining_nulls, cbar=False)

In [None]:
# Integer codes for the two binary categorical columns.
R_type = dict(Urban=1, Rural=0)
E_married = dict(Yes=1, No=0)

In [None]:
# Replace the string labels of the binary columns with their 0/1 codes.
# Values that are not keys of the mapping become NaN (Series.map semantics).
for column_name, code_map in (('Residence_type', R_type), ('ever_married', E_married)):
    df[column_name] = df[column_name].map(code_map)

In [None]:
# One-hot encode the categorical columns that are treated as non-binary.
multi_level_columns = ['work_type', 'gender', 'smoking_status']
df = pd.get_dummies(df, columns=multi_level_columns)

In [None]:
# Separate the target column ('stroke') from the predictors; both are copies of df.
y = df['stroke'].copy()
X = df.drop(columns=['stroke']).copy()

In [None]:
# Build the feature matrix without the row identifier AND without the target.
# Rebuilding X from df while dropping only 'id' would put 'stroke' back among
# the features, letting the model read the answer directly (target leakage).
X=df.drop(['stroke','id'],axis=1).copy()

In [None]:
# Preview the feature matrix that will be fed to the model.
X.head()

In [None]:
# 70/30 hold-out split into training and testing data. stratify=y keeps the
# (rare) stroke class at the same rate in both parts, and random_state makes
# the split, and therefore every score below, reproducible across runs.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,stratify=y,random_state=42)

In [None]:
# Baseline logistic-regression classifier; max_iter caps the solver at 400 iterations.
log_model = LogisticRegression(max_iter=400)

In [None]:
# Fit the baseline on the training split.
log_model.fit(X_train,y_train)

In [None]:
# Predicted class labels for the held-out test split.
y_predict=log_model.predict(X_test)

In [None]:
# Share of correct test predictions. Accuracy alone is misleading here: stroke
# cases are rare, so always predicting "no stroke" already scores high, and a
# score of exactly 100% should be treated as a sign of target leakage in X
# rather than as a perfect model.
accuracy_score(y_test,y_predict)

In [None]:
# Confusion matrix of the baseline on the test split; check the 'Stroke' row in
# particular, since overall accuracy hides errors on the rare class.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; newer versions offer ConfusionMatrixDisplay.from_estimator instead.
class_names = ['No_Stroke', 'Stroke']
plot_confusion_matrix(
    log_model,
    X_test,
    y_test,
    display_labels=class_names,
)

In [None]:
# Class balance of the target: rows with stroke are far fewer than rows without.
sns.countplot(data=df, x="stroke")

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
smk = SMOTETomek()                    #So we are oversampling the dataset,
X_res,y_res = smk.fit_resample(X,y)

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X_res,y_res,test_size=0.3)

In [None]:
# Fresh logistic-regression model for the rebalanced training data
# (replaces the baseline bound to the same name); solver capped at 500 iterations.
log_model = LogisticRegression(max_iter=500)

In [None]:
# Fit on the current training split.
log_model.fit(X_train,y_train)

In [None]:
# Predicted class labels for the current test split.
y_predict=log_model.predict(X_test)

In [None]:
# Share of all test predictions that are correct.
accuracy_score(y_test,y_predict)

In [None]:
# Recall for the positive class (stroke = 1): share of actual stroke cases that were predicted as stroke.
recall_score(y_test,y_predict)

In [None]:
# Precision for the positive class: share of predicted stroke cases that are actual strokes.
precision_score(y_test,y_predict)

In [None]:
# F1: harmonic mean of precision and recall for the positive class.
f1_score(y_test,y_predict)

In [None]:
# Confusion matrix of the re-trained model on the test split.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; newer versions offer ConfusionMatrixDisplay.from_estimator instead.
class_names = ['No_Stroke', 'Stroke']
plot_confusion_matrix(
    log_model,
    X_test,
    y_test,
    display_labels=class_names,
)