In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install dabl

In [None]:
# Load the stroke-prediction CSV into the working DataFrame `data`.
stroke_csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(stroke_csv_path)

In [None]:
import dabl

In [None]:
# Exploratory overview with dabl: every column except the last is passed as
# features, the last column as the target.
plot_features = data.iloc[:, :-1]
plot_target = data.iloc[:, -1]
dabl.plot(X=plot_features, y=plot_target)

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
data.info()

In [None]:
# Remove the `id` column. `data` is rebound instead of mutated in place, and
# errors='ignore' makes the cell idempotent: re-running it once `id` is
# already gone no longer raises KeyError.
data = data.drop(columns='id', errors='ignore')

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Discard every row that contains a missing value. `data` is rebound rather
# than mutated in place, and the index is reset so the remaining rows are
# labelled 0..n-1 again; dropping rows otherwise leaves gaps in the index,
# which is unsafe wherever rows are later addressed by position.
data = data.dropna().reset_index(drop=True)

In [None]:
# Relative frequency of each class in the `stroke` column.
data['stroke'].value_counts(normalize=True)

In [None]:
# Split the columns into numeric and non-numeric ones.
#   cols         - all column labels of `data`
#   num_cols     - labels of the numeric (and boolean) columns
#   obj_cols     - the remaining, non-numeric columns, in column order
#   obj_cols_idx - positional index of each non-numeric column in `data`
# The public select_dtypes API replaces the private _get_numeric_data(), and
# the list comprehension replaces a set difference whose ordering was
# arbitrary, so obj_cols / obj_cols_idx are now deterministic across runs.
cols = data.columns
num_cols = data.select_dtypes(include=['number', 'bool']).columns
numeric_names = set(num_cols)
obj_cols = [name for name in cols if name not in numeric_names]
obj_cols_idx = [data.columns.get_loc(name) for name in obj_cols]

In [None]:
# Show the positional indices of the non-numeric columns.
obj_cols_idx

In [None]:
# Label-encode each non-numeric column: convert it to a pandas categorical
# and overwrite the column with the integer category codes.
for col_name in obj_cols:
    as_category = data[col_name].astype('category')
    data[col_name] = as_category.cat.codes

In [None]:
# Summary statistics, transposed so each column of `data` becomes one row.
data.describe().transpose()

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import lightgbm as lgbm

import warnings
# Install a blanket "ignore" filter so no warnings are shown from here on.
warnings.simplefilter('ignore')

In [None]:
# Random seed passed to the cross-validation splitter and to the LightGBM model.
SEED = 42

In [None]:
# Positional split: the last column is the target, every column before it is a feature.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Robust-scale only the columns that are NOT label-encoded categoricals.
# The columns listed in obj_cols_idx are later declared to LightGBM as
# categorical features, which must remain non-negative integer codes. Scaling
# them turned the codes into negative / fractional numbers, which LightGBM
# treats as missing or truncates, merging distinct categories.
#   scaler - RobustScaler fitted on the non-categorical feature columns only
#   X_new  - float array with the same column order as X; categorical code
#            columns are passed through unchanged
scaled_col_idx = [i for i in range(X.shape[1]) if i not in obj_cols_idx]
scaler = RobustScaler()
X_new = X.to_numpy(dtype=float, copy=True)
if scaled_col_idx:  # nothing to fit when every feature is categorical
    X_new[:, scaled_col_idx] = scaler.fit_transform(X.iloc[:, scaled_col_idx])

In [None]:
# Wrap the scaled array back into a DataFrame carrying the original feature
# names, attach the target values as `stroke`, restore the original column
# order, then split again into features (all but the last column) and
# target (last column). Both results carry a fresh default index.
rebuilt = (
    pd.DataFrame(X_new, columns=cols[:-1])
    .assign(stroke=y.values)
    .reindex(columns=cols)
)
X, y = rebuilt.iloc[:, :-1], rebuilt.iloc[:, -1]

In [None]:
# Stratified 10-fold splitter; shuffling is seeded with SEED.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=SEED,
)

In [None]:
# Cross-validate a LightGBM classifier over the folds produced by `skf`,
# collecting accuracy, F1 and ROC AUC per fold, then average each metric
# into the one-row summary frame `df_result`.
acc_score = []
f1s_score = []
auc_score = []
for train_idx, val_idx in skf.split(X, y):
    # skf.split yields positional row numbers, so rows are selected with
    # .iloc. The previous label-based .loc lookup was only correct while
    # X and y happened to carry a default 0..n-1 index.
    Xt, Xv = X.iloc[train_idx], X.iloc[val_idx]
    yt, yv = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh model is built for every fold so no state carries over.
    # NOTE(review): device_type='gpu' presumably needs a GPU-enabled LightGBM
    # build and an attached accelerator - confirm for the target runtime.
    model = lgbm.LGBMClassifier(
        is_unbalance=True,
        categorical_feature=obj_cols_idx,
        seed=SEED,
        boosting_type='goss',
        device_type='gpu'
    )

    model.fit(Xt, yt)
    y_pred = model.predict(Xv)
    # Probability of the positive class (second column) for ROC AUC.
    y_prob = model.predict_proba(Xv)[:,1]
    acc_score.append(accuracy_score(yv, y_pred))
    f1s_score.append(f1_score(yv, y_pred))
    auc_score.append(roc_auc_score(yv, y_prob))

# Average each metric across folds.
acc_mean = np.mean(acc_score)
f1s_mean = np.mean(f1s_score)
auc_mean = np.mean(auc_score)
df_result = pd.DataFrame({
    'Accuracy': [acc_mean],
    'F1 Score': [f1s_mean],
    'ROC AUC' : [auc_mean]
})

In [None]:
# Display the one-row frame of mean accuracy, F1 score and ROC AUC.
df_result