# Libraries
---

In [None]:
import pandas as pd
import numpy as np


from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a blanket 'ignore' filter: no warning category is displayed after
# this point, which also hides deprecation notices from the libraries above.
warnings.simplefilter('ignore')

# Parameters
---

In [None]:
# Tunable settings shared by the cells below.
SEED = 42  # random_state for both the fold splitter and the classifier
N_SPLITS = 5  # number of stratified folds; also the divisor when averaging test predictions
N_ESTIMATORS = 100  # passed as max_iter to HistGradientBoostingClassifier

# Datasets
---

In [None]:
# Directory holding the competition CSV files.
INPUT = "../input/tabular-playground-series-nov-2021/"
# Name of the label column predicted by the model.
TARGET = 'target'

# Read the three CSV files in a fixed order: train, test, sample submission.
train, test, submission = (
    pd.read_csv(INPUT + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Feature columns are taken from the test frame: every column whose name
# contains the letter 'f'.
# NOTE(review): this is a substring test, not a prefix test - confirm that no
# non-feature column name contains an 'f'.
features = list(filter(lambda col: 'f' in col, test.columns))

# Standardization
---

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to the feature columns of both frames (train first, then test).
scaler = StandardScaler()
scaler.fit(train[features])

for frame in (train, test):
    frame[features] = scaler.transform(frame[features])

# Training and prediction
---

In [None]:
# Stratified K-fold training: one model per fold, out-of-fold (OOF)
# probabilities for scoring, and fold-averaged probabilities for the test set.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Select the feature matrices and the target once, outside the loop.
# Selecting columns with a list builds a new DataFrame each time it is
# evaluated, and none of these selections change between folds.
X_full, y_full = train[features], train[TARGET]
X_test = test[features]

oof = np.zeros(train.shape[0])   # OOF probability for every training row
pred = np.zeros(test.shape[0])   # test probability, averaged over the folds

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=X_full, y=y_full)):
    X_train, y_train = X_full.iloc[trn_idx], y_full.iloc[trn_idx]
    X_valid, y_valid = X_full.iloc[val_idx], y_full.iloc[val_idx]

    model = HistGradientBoostingClassifier(
        max_iter=N_ESTIMATORS,
        verbose=0,
        random_state=SEED
    )
    model.fit(X_train, y_train)

    # The last predict_proba column is the probability of the last class.
    oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    # Each fold contributes 1/N_SPLITS, so `pred` ends up as the fold mean.
    pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    val_score = roc_auc_score(y_valid, oof[val_idx])
    print(f"fold {fold}: {val_score}")

# Overall score computed on the complete OOF vector.
val_score = roc_auc_score(y_full, oof)
print(f"total: {val_score}")

# Submission
---

In [None]:
# Put the test predictions into the target column of the sample-submission
# frame, write it to disk without the index, and display the result.
submission = submission.assign(**{TARGET: pred})
submission.to_csv("submission.csv", index=False)
submission