In [12]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
import lightgbm as lgb 

from sklearn.metrics import accuracy_score, roc_auc_score 

from sklearn.model_selection import (
    train_test_split,
    TimeSeriesSplit,
    KFold,
    StratifiedKFold,
    GroupKFold,
    StratifiedGroupKFold,
)


In [9]:
# Location of the stroke dataset CSV (a raw file served from GitHub over HTTPS).
url = 'https://raw.githubusercontent.com/sitsawek/dataset/main/healcare_stroke_data/healthcare-dataset-stroke-data.csv'

In [10]:
# Download and parse the CSV at `url` into a DataFrame; this cell needs network access.
df = pd.read_csv(url)

In [20]:
def get_prep_data(df, n_holdout=500, seed=529):
    """Prepare the stroke dataset and split it into train and holdout frames.

    The input frame is copied first, so calling this function (or re-running
    the cell that calls it) never modifies ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns ``ever_married``, ``gender``,
        ``smoking_status``, ``Residence_type`` and ``work_type``.
    n_holdout : int, default 500
        Number of rows sampled into the holdout frame. pandas raises
        ``ValueError`` if this exceeds the number of rows in ``df``.
    seed : int, default 529
        Seed used for the holdout sampling, the shuffles and the synthetic
        group column, so repeated calls return identical frames.

    Returns
    -------
    train, holdout : tuple of pandas.DataFrame
        Disjoint row subsets of the prepared data. Each one is shuffled,
        sorted by the synthetic ``docter`` column and given a fresh
        ``0..n-1`` index.
    """
    # Copy so the caller's frame stays untouched and the call is idempotent.
    prepared = df.copy()

    # Turn 'Yes'/'No' into real booleans. When the column holds nothing else
    # the result has bool dtype without relying on pandas' deprecated
    # object-to-bool downcasting in `replace`; any other values are left as-is.
    married = prepared['ever_married']
    is_yes_no = married.isin(['Yes', 'No'])
    if is_yes_no.all():
        prepared['ever_married'] = married.eq('Yes')
    elif is_yes_no.any():
        prepared['ever_married'] = married.mask(is_yes_no, married.eq('Yes'))

    for column in ('gender', 'smoking_status', 'Residence_type', 'work_type'):
        prepared[column] = prepared[column].astype('category')

    # Synthetic group label in [0, 8) used to demonstrate group-aware splits.
    # The column keeps its original 'docter' spelling because other cells may
    # read it by that name. A seeded generator makes it reproducible.
    rng = np.random.default_rng(seed)
    prepared['docter'] = rng.integers(0, 8, size=len(prepared))

    holdout_ids = prepared.sample(n=n_holdout, random_state=seed).index
    in_holdout = prepared.index.isin(holdout_ids)

    def _shuffle_then_order(part):
        # Shuffle first so the order within each group is not the file order.
        return (
            part.sample(frac=1, random_state=seed)
            .sort_values('docter')
            .reset_index(drop=True)
        )

    train = _shuffle_then_order(prepared.loc[~in_holdout])
    holdout = _shuffle_then_order(prepared.loc[in_holdout])
    return train, holdout

In [22]:
# Build the modelling frames: `holdout` is set aside for the final check, `train` is used for fitting.
train, holdout = get_prep_data(df)

In [30]:
def get_X_y(train):
    """Split a prepared frame into features, target and per-row group labels.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the ten feature columns listed below, the ``stroke``
        target, and a group column named ``doctor`` (the ``docter`` spelling
        is accepted as well).

    Returns
    -------
    X : pandas.DataFrame
        The feature columns, in the order of ``FEATURES``.
    y : pandas.Series
        The ``stroke`` target.
    groups : pandas.Series
        One group label per row, aligned with ``X`` and ``y``, suitable for
        the ``groups`` argument of group-aware splitters.

    Raises
    ------
    KeyError
        If a feature, the target, or every accepted group column is missing.
    """
    FEATURES = [
        'gender',
        'age',
        'hypertension',
        'heart_disease',
        'ever_married',
        'work_type',
        'Residence_type',
        'avg_glucose_level',
        'bmi',
        'smoking_status',
    ]

    # The group column is looked up under both spellings, preferred one first.
    GROUPS = ('doctor', 'docter')

    TARGET = 'stroke'

    group_column = next((name for name in GROUPS if name in train.columns), None)
    if group_column is None:
        raise KeyError(f'none of the group columns {GROUPS} found in frame')

    X = train[FEATURES]
    y = train[TARGET]
    # Return the per-row labels, not the column name: splitters need one
    # group value for every sample.
    groups = train[group_column]
    return X, y, groups


In [31]:
# Feature matrix, target and group value extracted from the training frame.
X, y, groups = get_X_y(train)

In [33]:
# Fit a 100-tree LightGBM classifier on the whole training frame (no validation split yet).
clf = lgb.LGBMClassifier(n_estimators=100)
clf.fit(X, y)

In [37]:
# In-sample predictions: the model scores the same rows it was fitted on.
# Column 1 of predict_proba is the probability of the second class.
pred_proba = clf.predict_proba(X)[:, 1]
pred = clf.predict(X)

In [43]:
# Scores measured on the training rows themselves, so they are optimistic.
auc_score = roc_auc_score(y, pred_proba)
acc_score = accuracy_score(y, pred)
print(f'Accuracy : {acc_score:0.4f} AUC : {auc_score:0.4f}')

Accuracy : 0.9911 AUC : 0.9997


In [46]:
# Evaluate the fitted model on the holdout rows, which were not used in fitting.
X_holdouts, y_holdouts, groups_holdouts = get_X_y(holdout)

pred_proba_holdouts = clf.predict_proba(X_holdouts)[:, 1]
pred_holdouts = clf.predict(X_holdouts)

auc_score_holdouts = roc_auc_score(y_holdouts, pred_proba_holdouts)
acc_score_holdouts = accuracy_score(y_holdouts, pred_holdouts)
print(f'Accuracy : {acc_score_holdouts:0.4f} AUC : {auc_score_holdouts:0.4f}')


Accuracy : 0.9380 AUC : 0.7802


In [56]:
# Baseline: predict 0 for every training row and report (accuracy, AUC).
# A constant score cannot rank samples, so its AUC is 0.5; the accuracy is
# simply the share of rows whose target is 0.
accuracy_score(y, np.zeros_like(y)), roc_auc_score(y, np.zeros_like(y))

(0.9516268980477224, 0.5)

# Train Test Split

In [66]:
# Plain random 90/10 split of the training frame. The split is seeded (with
# the same 529 used for sampling elsewhere in this notebook) so the
# validation rows, and every metric computed on them, are identical after
# Restart & Run All.
X, y, groups = get_X_y(train)
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.1, random_state=529
)

(4149, 10)