In [None]:
import numpy as np # linear algebra

import os
# Walk the /kaggle/input tree and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# **Import**

In [None]:
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, ConfusionMatrixDisplay, precision_score, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

# **Loading dataset**

In [None]:
# Read the stroke-prediction CSV into a DataFrame (path is relative to the working directory).
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)

# **Checking dataset**

In [None]:
# Column dtypes and non-null counts.
data.info()

# Show the distinct raw labels of every categorical column in a single pass
# instead of one copy-pasted cell per column.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for column in categorical_columns:
    print(f"{column}: {data[column].unique()}")

# **Data cleaning**

In [None]:
# Turn each categorical column into a 0/1 flag, fill missing BMI values with a
# sentinel, and drop the `id` column. Comparisons are vectorised; the resulting
# values and integer dtype match an element-wise "1 if ... else 0" mapping.
data = (
    data
    .assign(
        # 1 for "Male", 0 for every other label
        gender=lambda df: df["gender"].eq("Male").astype("int64"),
        # 1 for "Yes", 0 otherwise
        ever_married=lambda df: df["ever_married"].eq("Yes").astype("int64"),
        # 1 for "Urban", 0 otherwise
        Residence_type=lambda df: df["Residence_type"].eq("Urban").astype("int64"),
        # 0 for "children", 1 for every other work type
        work_type=lambda df: df["work_type"].ne("children").astype("int64"),
        # 0 for "never smoked", 1 for every other label
        smoking_status=lambda df: df["smoking_status"].ne("never smoked").astype("int64"),
        # missing BMI is marked with the sentinel -9999 rather than imputed
        bmi=lambda df: df["bmi"].fillna(-9999),
    )
    .drop(columns="id")
)

data.head()

# **Split dataset into training set and test set**

# **70% training and 30% testing**

In [None]:
# Feature matrix: every column except the target; target vector: the `stroke` column.
x = data.drop(columns='stroke').to_numpy()
y = data['stroke'].to_numpy()

# 70/30 split with a fixed seed. `stratify=y` keeps the class proportions of the
# target identical in both partitions, so the test-set precision/recall/AUC are
# not distorted by an unlucky draw of the positive class.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

# **Building XGBoost Classifier model**

In [None]:

from xgboost.sklearn import XGBClassifier

# Gradient-boosted tree classifier for the binary `stroke` target.
# - `binary:logistic` is the classification objective (the regression objective
#   `reg:logistic` does not belong on a classifier).
# - `eval_metric` is set on the estimator: recent XGBoost releases no longer
#   accept it as a `fit()` argument.
# - `reg_alpha` / `random_state` are the scikit-learn-API names for the
#   `alpha` / `seed` booster parameters.
clf = XGBClassifier(
    booster="gbtree",
    objective="binary:logistic",
    eval_metric="auc",
    n_estimators=50,
    learning_rate=0.2,
    max_depth=8,
    subsample=1,
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
    max_delta_step=0,
    colsample_bytree=1,
    min_child_weight=1,
    random_state=1000,
)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Probability of the positive class; ROC AUC needs a continuous score, not 0/1 labels.
XG_proba = clf.predict_proba(x_test)[:, 1]
y_true = y_test

XG_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
ROC = roc_auc_score(y_true, XG_proba)
# zero_division=0 returns 0 (without a warning) when no positive is predicted.
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

# Confusion matrix: rows are the true classes, columns the predicted classes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    cmap='Oranges',
    annot=True,
    fmt='d',
    linewidths=3,
    cbar=False,
    yticklabels=['No Stroke', 'Stroke'],
    xticklabels=['Predicted No Stroke', 'Predicted Stroke'],
    ax=ax,
)
ax.set_title('XGBoost confusion matrix')
plt.show()

print(
    "XGB_Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred), "    ",
    "XGB_F1-score : %.4g" % XG_f1, "\n",
    "XGB_ROC_AUC_score : %.4g" % ROC, "    ",
    "XGB_precision : %.4g" % precision, "    ",
    "XGB_recall : %.4g" % recall,
)


# **Building Random Forest Classifier model**

In [None]:
from sklearn import ensemble

# Random forest with 100 trees; the fixed random_state makes the bootstrap
# samples and feature draws (and therefore every metric below) reproducible.
forest = ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
forest_fit = forest.fit(x_train, y_train)  # fit() returns the estimator itself

y_pred = forest.predict(x_test)
y_test_predicted = y_pred  # same predictions; name kept without a second predict() call
# Probability of the positive class; ROC AUC needs a continuous score, not 0/1 labels.
RF_proba = forest.predict_proba(x_test)[:, 1]
y_true = y_test

RF_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
RF_ROC = roc_auc_score(y_true, RF_proba)
# zero_division=0 returns 0 (without a warning) when no positive is predicted.
RF_precision = precision_score(y_true, y_pred, zero_division=0)
RF_recall = recall_score(y_true, y_pred)
RF_cm = confusion_matrix(y_true, y_pred)

# Confusion matrix: rows are the true classes, columns the predicted classes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    RF_cm,
    cmap='Oranges',
    annot=True,
    fmt='d',
    linewidths=3,
    cbar=False,
    yticklabels=['No Stroke', 'Stroke'],
    xticklabels=['Predicted No Stroke', 'Predicted Stroke'],
    ax=ax,
)
ax.set_title('Random Forest confusion matrix')
plt.show()

print(
    "RF_Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred), "    ",
    "RF_F1-score : %.4g" % RF_f1, "\n",
    "RF_ROC_AUC_score : %.4g" % RF_ROC, "    ",
    "RF_precision : %.4g" % RF_precision, "    ",
    "RF_recall : %.4g" % RF_recall,
)