In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score, confusion_matrix, classification_report


# Read the stroke-prediction CSV from the Kaggle input directory into the
# working frame, then show its first rows as the cell output.
df = pd.read_csv(
    '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
df.head()

# Missing data

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Impute missing BMI values: fit a decision-tree regressor of bmi on
# (age, gender) using the rows where bmi is present, then write its
# predictions into the rows of `df` where bmi is NaN.
DT_bmi_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    # The step keeps its original 'lr' label even though the estimator is a
    # decision tree, so lookups by step name keep working.
    ('lr', DecisionTreeRegressor(random_state=42)),
])

X = df[['age', 'gender', 'bmi']].copy()
# 'Other' is coded as -1, so the dtype has to be signed: casting to uint8
# silently wraps -1 around to 255.
X['gender'] = X['gender'].replace({'Male': 0, 'Female': 1, 'Other': -1}).astype(np.int8)

bmi_is_missing = X['bmi'].isna()
Missing = X[bmi_is_missing]
X = X[~bmi_is_missing]
Y = X.pop('bmi')  # X now holds only the predictors (age, gender)
DT_bmi_pipe.fit(X, Y)

if Missing.empty:
    # Nothing to impute (e.g. the cell is re-run after bmi was already
    # filled); calling predict on zero rows would raise.
    predicted_bmi = pd.Series(dtype=float, index=Missing.index)
else:
    predicted_bmi = pd.Series(
        DT_bmi_pipe.predict(Missing[['age', 'gender']]),
        index=Missing.index,
    )
    df.loc[Missing.index, 'bmi'] = predicted_bmi

# Encoding categorical values

In [None]:
# Integer-encode the categorical columns of `df` in place.
# gender and work_type use negative codes (-1, -2), so they are stored as
# signed int8: casting them to uint8 would silently wrap -1 -> 255 and
# -2 -> 254. The remaining columns only use non-negative codes and keep uint8.
df['gender'] = df['gender'].replace({'Male':0,'Female':1,'Other':-1}).astype(np.int8)
df['Residence_type'] = df['Residence_type'].replace({'Rural':0,'Urban':1}).astype(np.uint8)
df['work_type'] = df['work_type'].replace({'Private':0,'Self-employed':1,'Govt_job':2,'children':-1,'Never_worked':-2}).astype(np.int8)
df['ever_married'] = df['ever_married'].replace({'No':0,'Yes':1}).astype(np.uint8)
df['smoking_status'] = df['smoking_status'].replace({'never smoked':0,'formerly smoked':1, 'smokes':2, 'Unknown':99}).astype(np.uint8)
df.head()

# Sampling

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

# Feature columns used for modelling; the target is the 'stroke' column.
highly_corr_features =  ['gender','age','hypertension','heart_disease','ever_married', 'work_type','avg_glucose_level','bmi']
X = df[highly_corr_features]
y = df['stroke']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# SMOTE: oversample the training split only, so the test split stays untouched.
# The labels are passed as a 1-D NumPy array via to_numpy(); Series.ravel()
# is deprecated in recent pandas and returns the same array.
X_train_resh, y_train_resh = SMOTE(random_state=42).fit_resample(X_train, y_train.to_numpy())

# Class counts after resampling.
print("After OverSampling, counts of label '1': {}".format((y_train_resh == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_resh == 0).sum()))

# XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier 

# XGBoost classifier trained on the SMOTE-resampled training split.
# eval_metric is given to the constructor: passing it to fit() is deprecated
# in newer XGBoost releases and later removed, where it raises a TypeError.
clf = XGBClassifier(n_estimators=800,
                    learning_rate=0.3,
                    max_depth=3,
                    min_child_weight=3,
                    gamma=0.4,
                    subsample=0.7,
                    colsample_bytree=0.8,
                    scale_pos_weight=1,
                    seed=65,
                    use_label_encoder=False,
                    eval_metric='auc')
clf.fit(X_train_resh, y_train_resh)

# Evaluate on the held-out (non-resampled) test split.
y_pred = clf.predict(X_test)

print("Accuracy: %.4g" % accuracy_score(y_test, y_pred))
print("F1-Score: %.4g \n" % f1_score(y_test, y_pred))
print("Confusion matrix: \n {}".format(confusion_matrix(y_test, y_pred)))
print("\n {}".format(classification_report(y_test, y_pred)))