In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing, metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost.sklearn import XGBClassifier

Data preprocessing
----

In [None]:
"""
1) id: unique identifier
2) gender: "Male", "Female" or "Other"
3) age: age of the patient
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6) ever_married: "No" or "Yes"
7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
8) Residence_type: "Rural" or "Urban"
9) avg_glucose_level: average glucose level in blood
10) bmi: body mass index (have NaN)
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
12) stroke: 1 if the patient had a stroke or 0 if not
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient
"""
# Load the stroke-prediction CSV into `data` and preview the first rows.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Count how many rows fall into each gender category.
data.loc[:, "gender"].value_counts()

In [None]:
# Histogram of patient age using 50 bins.
data["age"].plot.hist(bins=50)

In [None]:
# Count rows per hypertension flag value.
data.loc[:, "hypertension"].value_counts()

In [None]:
# Count rows per heart_disease flag value.
data.loc[:, "heart_disease"].value_counts()

In [None]:
# Count rows per ever_married value.
data.loc[:, "ever_married"].value_counts()

In [None]:
# Count rows per work_type category.
data.loc[:, "work_type"].value_counts()

In [None]:
# Count rows per Residence_type category.
data.loc[:, "Residence_type"].value_counts()

In [None]:
# Histogram of average glucose level using 50 bins.
data["avg_glucose_level"].plot.hist(bins=50)

In [None]:
# Histogram of body mass index using 50 bins.
data["bmi"].plot.hist(bins=50)

In [None]:
# Count rows per smoking_status category.
data.loc[:, "smoking_status"].value_counts()

In [None]:
# deal with category features
# NOTE: this cell rebinds `data`, so run it once on the freshly loaded frame:
# a second run finds no "id" column and would map already-encoded values to NaN.
data = data.drop(columns=["id"])  # rebinding instead of inplace=True

# Binary yes/no -> 1/0.
data["ever_married"] = data["ever_married"].map({"Yes": 1, "No": 0})

# Keep a separate indicator for "Unknown", because the ordinal encoding below
# folds "Unknown" into the same code (0) as "never smoked".
data["smoking_status_Unknown"] = (data["smoking_status"] == "Unknown").astype(int)
data["smoking_status"] = data["smoking_status"].map({"never smoked": 0,
                                                     "formerly smoked": 1,
                                                     "smokes": 2,
                                                     "Unknown": 0})

# One-hot encode the remaining string columns. dtype=int keeps the dummies
# numeric on every pandas version (pandas >= 2 defaults to bool, which turns
# `.values` on the mixed frame into an object array further down).
data = pd.get_dummies(data, dtype=int)
data.head()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Stroke label counts among the rows whose bmi is missing.
data.loc[data["bmi"].isna(), "stroke"].value_counts()

In [None]:
# Stroke label counts among the rows whose bmi is present.
data.loc[data["bmi"].notna(), "stroke"].value_counts()

In [None]:
# solution 3: fill missing bmi with the bmi median
# The fill is restricted to the "bmi" column: a bare `data.fillna(value)` would
# write the bmi median into NaNs of *every* column, silently corrupting any
# other feature that happens to contain missing values.
data_fill = data.fillna({"bmi": data["bmi"].median()})
data_fill["stroke"].value_counts()

In [None]:
# Compare the bmi distribution after and before the median fill on one set of
# axes. Labels, transparency and a legend make the two layers distinguishable.
ax = data_fill["bmi"].plot(kind="hist", bins=50, alpha=0.6, label="bmi (median-filled)")
data["bmi"].plot(kind="hist", bins=50, alpha=0.6, ax=ax, label="bmi (original)")
ax.set_xlabel("bmi")
ax.legend();

XGBoost
--

In [None]:
# 5-fold stratified cross-validation of a class-weighted XGBoost classifier.
# Reports mean +- std of accuracy and F1 over the folds.
X = data_fill.drop(columns=["stroke"])
y = data_fill["stroke"]

fold = 5
result_acc = np.zeros(fold)
result_f1 = np.zeros(fold)

sfolder = StratifiedKFold(n_splits=fold, random_state=0, shuffle=True)
for i, (train, test) in enumerate(sfolder.split(X, y)):
    X_train, X_test = X.iloc[train, :], X.iloc[test, :]
    y_train, y_test = y.iloc[train], y.iloc[test]

    # Negative/positive ratio of the training fold, used to up-weight the
    # positive (stroke == 1) class.
    pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    # 'binary:logistic' rather than 'binary:logitraw': XGBClassifier.predict
    # labels a sample positive when the model output exceeds 0.5. With logitraw
    # that output is the raw margin, so the cut-off would sit at logit 0.5
    # (probability ~0.62) instead of probability 0.5.
    xgb = XGBClassifier(objective='binary:logistic',
                        n_estimators=200,
                        max_depth=3,
                        min_child_weight=7,
                        scale_pos_weight=pos_weight, seed=0, use_label_encoder=False)
    xgb.fit(X_train, y_train)

    y_test_result = xgb.predict(X_test)
    result_acc[i] = metrics.accuracy_score(y_test, y_test_result)
    result_f1[i] = metrics.f1_score(y_test, y_test_result)

print(f"accuracy_score: {round(result_acc.mean(), 3)}+-{round(result_acc.std(), 3)}")
print(f"f1_score: {round(result_f1.mean(), 3)}+-{round(result_f1.std(), 3)}")

XGBoost + SMOTE
--

In [None]:
# 5-fold stratified cross-validation of XGBoost trained on SMOTE-oversampled
# folds. Only the training part of each fold is resampled; the test part is
# left untouched. Reports mean +- std of accuracy and F1 over the folds.
X = data_fill.drop(columns=["stroke"])
y = data_fill["stroke"]

feature_names = X.columns
# Every column except the continuous ones and the ordinal smoking_status is a
# 0/1 indicator that has to be snapped back to 0/1 after resampling.
binary_features = feature_names.drop(["age", "avg_glucose_level", "bmi", "smoking_status"])

fold = 5
result_acc = np.zeros(fold)
result_f1 = np.zeros(fold)
sfolder = StratifiedKFold(n_splits=fold, random_state=0, shuffle=True)

for i, (train, test) in enumerate(sfolder.split(X, y)):
    # Explicit float cast: the frame mixes integer/bool indicators with float
    # columns, and `.values` alone may yield an object array.
    X_train = X.iloc[train, :].values.astype(float)
    y_train = y.iloc[train].values

    sm = SMOTE(random_state=0)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    X_train = pd.DataFrame(X_train, columns=feature_names)

    # Resampled rows can hold fractional values in discrete columns; round
    # them back to the valid codes.
    X_train[binary_features] = (X_train[binary_features] > 0.5).astype(int)
    X_train["smoking_status"] = X_train["smoking_status"].apply(
        lambda x: 2 if x > 1.5 else 1 if x > 0.5 else 0)

    X_test = X.iloc[test, :]
    y_test = y.iloc[test].values

    # Fit the scaler on the training fold only, then apply it to the test fold.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Negative/positive ratio of the (resampled) training fold.
    # NOTE(review): with SMOTE's default sampling strategy this is expected to
    # come out as 1 - confirm if the strategy is ever changed.
    pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    # 'binary:logistic' rather than 'binary:logitraw': XGBClassifier.predict
    # labels a sample positive when the model output exceeds 0.5. With logitraw
    # that output is the raw margin, so the cut-off would sit at logit 0.5
    # (probability ~0.62) instead of probability 0.5.
    xgb = XGBClassifier(objective='binary:logistic',
                        n_estimators=200,
                        max_depth=2,
                        min_child_weight=7,
                        gamma=1,
                        subsample=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        learning_rate=0.3,
                        scale_pos_weight=pos_weight, seed=0, use_label_encoder=False)
    xgb.fit(X_train, y_train)

    y_test_result = xgb.predict(X_test)
    result_acc[i] = metrics.accuracy_score(y_test, y_test_result)
    result_f1[i] = metrics.f1_score(y_test, y_test_result)

print(f"accuracy_score: {round(result_acc.mean(), 3)}+-{round(result_acc.std(), 3)}")
print(f"f1_score: {round(result_f1.mean(), 3)}+-{round(result_f1.std(), 3)}")