In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
#import the required Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## Loading the Data

We will create a pandas DataFrame from the CSV file that contains the healthcare stroke data.

In [None]:
# Load the stroke dataset from the Kaggle input directory into a DataFrame
# and preview its first rows.
health_df = pd.read_csv(
    '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
health_df.head()

## Exploring the data 

Let's explore the dataframe by looking at some summary statistics.

In [None]:
# Check which dtype pandas inferred for each column.
health_df.dtypes

In [None]:
# Drop the id column since it is not needed for the analysis.
# errors='ignore' keeps this cell safe to re-run: once 'id' is gone a second
# execution is a no-op instead of raising KeyError.
health_df = health_df.drop(columns=['id'], errors='ignore')
health_df.head()

**Column Descriptions**
* **id**: unique identifier
* **gender**: "Male", "Female" or "Other"
* **age**: age of the patient
* **hypertension**: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
* **heart_disease**: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
* **ever_married**: "No" or "Yes"
* **work_type**: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
* **Residence_type**: "Rural" or "Urban"
* **avg_glucose_level**: average glucose level in blood
* **bmi**: body mass index
* **smoking_status**: "formerly smoked", "never smoked", "smokes" or "Unknown"\*
* **stroke**: 1 if the patient had a stroke or 0 if not  
\* *Note: "Unknown" in smoking_status means that the information is unavailable for this patient.*

In [None]:
health_df.shape

In [None]:
health_df.isnull().sum()

In [None]:
health_df[health_df.isna().any(axis=1)]

In [None]:
health_df['bmi'].median()

In [None]:
# Replace null values in the BMI column with the mode (the most frequent value).
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on health_df['bmi']: that chained in-place form
# emits a FutureWarning in pandas 2.x and leaves health_df untouched when
# Copy-on-Write is enabled.
health_df['bmi'] = health_df['bmi'].fillna(health_df['bmi'].mode()[0])
# Confirm that no missing values remain.
health_df.isna().sum()

In [None]:
health_df['smoking_status'].unique()

In [None]:
health_df.groupby(["smoking_status"])["smoking_status"].count()

We have 1544 samples with a smoking_status of "Unknown". We will impute this field by replacing the unknown values with the most frequent category, which is 'never smoked'.

In [None]:

# Fold the 'Unknown' category into 'never smoked', then recount the categories.
health_df['smoking_status'] = health_df['smoking_status'].replace(
    'Unknown', 'never smoked'
)
health_df.groupby("smoking_status")["smoking_status"].count()

In [None]:
# Encode gender as integers. Entries that are not keys of the mapping come
# back from .map() as NaN and are then refilled with their original value.
d = {
    'Male': 0,
    'Female': 1,
    'Other': 2,
}
gender_codes = health_df['gender'].map(d)
health_df['gender'] = gender_codes.fillna(health_df['gender'])
health_df.head()

In [None]:
# Encode ever_married as 0 ('No') / 1 ('Yes').
marital_codes = {'No': 0, 'Yes': 1}
health_df['ever_married'] = health_df['ever_married'].replace(marital_codes)
health_df.head()

In [None]:
health_df['work_type'].unique()

In [None]:
health_df=pd.concat([health_df,pd.get_dummies(health_df['work_type'], prefix='work_type')],axis=1)
health_df.drop(['work_type'],axis=1, inplace=True)

In [None]:
health_df['Residence_type'].unique()

In [None]:
health_df=pd.concat([health_df,pd.get_dummies(health_df['Residence_type'], prefix='Residence_type')],axis=1)
health_df.drop(['Residence_type'],axis=1, inplace=True)

In [None]:
health_df.head()

In [None]:
health_df=pd.concat([health_df,pd.get_dummies(health_df['smoking_status'], prefix='smoking_status')],axis=1)
health_df.drop(['smoking_status'],axis=1, inplace=True)

In [None]:
health_df.head()

In [None]:
health_df['gender'].unique()

## Data Visualization

In [None]:
# Histogram of patient ages.
plt.hist(health_df['age'])

In [None]:
# Histogram of the gender column, which holds integer codes at this point
# (Male=0, Female=1, Other=2 from the earlier mapping cell).
plt.hist(health_df['gender'])

In [None]:
# Distribution of the target column: stroke (1) versus no stroke (0).
sns.histplot(health_df['stroke'])

In [None]:
# Age distribution, coloured by the integer-coded gender.
sns.histplot(data=health_df, x="age", hue="gender")

In [None]:
# Pairwise plots of the dataframe's columns, coloured by the stroke outcome.
sns.pairplot(data=health_df, hue="stroke")

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = health_df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='gist_heat', linewidths=2)
plt.show()

---

# **Data Preparation**

In [None]:
health_df['stroke']=health_df['stroke'].astype('float')

In [None]:
X=health_df.loc[:, health_df.columns != 'stroke']
Y=health_df.loc[:, health_df.columns == 'stroke']
print(X.shape,Y.shape)

In [None]:
# Train/test split (70% train, 30% test).
# stratify=Y keeps the stroke / no-stroke ratio the same in both partitions,
# which matters because the positive class is rare. A fixed random_state makes
# the split, and every metric computed from it, repeatable across
# Restart & Run All.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.3, random_state=0, stratify=Y
)

# **Building the Model**

In [None]:

# Baseline logistic regression on the original (non-resampled) training split.
model = LogisticRegression(solver='liblinear', random_state=0)
# ytrain is a single-column DataFrame; np.ravel hands scikit-learn the 1-D
# target it expects and avoids the column-vector DataConversionWarning.
model.fit(xtrain, np.ravel(ytrain))


In [None]:

# Confusion matrix of the logistic regression on the test split
# (rows: true class, columns: predicted class).
cf_matrix = confusion_matrix(ytest, model.predict(xtest))
# fmt='d' prints the integer counts in full; the default '.2g' format would
# render large cells in scientific notation (e.g. 1.5e+03).
sns.heatmap(cf_matrix, annot=True, fmt='d')

In [None]:
# Baseline random forest on the original (non-resampled) training split.
rndfrst = RandomForestClassifier(max_depth=100, random_state=0)
# ytrain is a single-column DataFrame; np.ravel hands scikit-learn the 1-D
# target it expects and avoids the column-vector DataConversionWarning.
rndfrst.fit(xtrain, np.ravel(ytrain))

In [None]:
# Confusion matrix of the random forest on the test split
# (rows: true class, columns: predicted class).
cf_matrix = confusion_matrix(ytest, rndfrst.predict(xtest))
# fmt='d' prints the integer counts in full; the default '.2g' format would
# render large cells in scientific notation (e.g. 1.5e+03).
sns.heatmap(cf_matrix, annot=True, fmt='d')

In [None]:
# Test-split score of the logistic regression (scikit-learn classifiers
# report mean accuracy from .score()).
model.score(xtest,ytest)

In [None]:
# Test-split score of the random forest on the same split.
rndfrst.score(xtest,ytest)

In [None]:
# Predicted class probabilities from the logistic regression; column 1 is
# the probability of the second class (stroke == 1).
probs = model.predict_proba(xtest)
preds = probs[:, 1]
# ROC operating points and the area under the resulting curve.
fpr, tpr, threshold = roc_curve(ytest, preds)
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# ROC curve of the logistic regression, drawn against the chance diagonal.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level reference line
plt.title('Logistic Regression ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Predicted class probabilities from the random forest; column 1 is the
# probability of the second class (stroke == 1).
probs = rndfrst.predict_proba(xtest)
preds = probs[:, 1]
# ROC operating points and the area under the resulting curve.
fpr, tpr, threshold = roc_curve(ytest, preds)
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# ROC curve of the random forest, drawn against the chance diagonal.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level reference line
plt.title('Random Forest ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.show()

In [None]:

# Per-class precision / recall / F1 on the test split: logistic regression
# first, then random forest.
for fitted in (model, rndfrst):
    print(classification_report(ytest, fitted.predict(xtest)))


---

In [None]:
#Importing SMOTE
from imblearn.over_sampling import SMOTE

In [None]:
#Oversampling the data
smote = SMOTE(random_state = 101)
X, y = smote.fit_resample(X, Y)

In [None]:
sns.histplot(y)

In [None]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

In [None]:
smot_model=LogisticRegression(solver='liblinear', random_state=0)
smot_model.fit(x_train,y_train)

In [None]:
smot_model.score(x_test,y_test)

In [None]:
smot_rndfrst=RandomForestClassifier(max_depth=100, random_state=0)
smot_rndfrst.fit(x_train, y_train)

In [None]:
smot_rndfrst.score(x_test,y_test)

In [None]:
# Confusion matrix of the SMOTE-trained logistic regression
# (rows: true class, columns: predicted class).
smot_model_cf = confusion_matrix(y_test, smot_model.predict(x_test))
# Plot the matrix computed on the line above. The previous code passed
# `cf_matrix`, i.e. the stale matrix of the non-SMOTE random forest.
# fmt='d' prints integer counts in full instead of scientific notation.
sns.heatmap(smot_model_cf, annot=True, fmt='d')

In [None]:
# Confusion matrix of the SMOTE-trained random forest
# (rows: true class, columns: predicted class).
smot_rndfrst_cf = confusion_matrix(y_test, smot_rndfrst.predict(x_test))
# Plot the matrix computed on the line above. The previous code passed
# `cf_matrix`, i.e. the stale matrix of the non-SMOTE random forest.
# fmt='d' prints integer counts in full instead of scientific notation.
sns.heatmap(smot_rndfrst_cf, annot=True, fmt='d')

In [None]:

# Per-class precision / recall / F1 for the SMOTE-trained models: logistic
# regression first, then random forest.
for fitted in (smot_model, smot_rndfrst):
    print(classification_report(y_test, fitted.predict(x_test)))


In [None]:
# Predicted class probabilities from the SMOTE-trained logistic regression;
# column 1 is the probability of the second class (stroke == 1).
probs = smot_model.predict_proba(x_test)
preds = probs[:, 1]
# ROC operating points and the area under the resulting curve.
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve drawn against the chance diagonal.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level reference line
plt.title('Logistic Regression ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Predicted class probabilities from the SMOTE-trained random forest;
# column 1 is the probability of the second class (stroke == 1).
probs = smot_rndfrst.predict_proba(x_test)
preds = probs[:, 1]
# ROC operating points and the area under the resulting curve.
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve drawn against the chance diagonal.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level reference line
plt.title('Random Forest ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.show()