In [None]:
import json

import numpy as np
import optuna
import pandas
import psutil
import xgboost

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

## Helper functions

In [None]:
# from: https://www.kaggle.com/bextuychiev/how-to-work-w-million-row-datasets-like-a-pro
def reduce_memory_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Signed-integer columns are cast to the first of int8/int16/int32/int64
    whose limits contain the column's min and max (limits inclusive). Float
    columns are cast to float16, then float32, by the same range test, and to
    float64 when neither fits. Columns whose dtype is not one of the listed
    signed-int / float dtypes are left untouched.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    verbose : bool, default True
        If true, print the final memory footprint and the relative saving.
    allow_float16 : bool, default True
        If false, float columns are never narrowed below float32. The range
        test says nothing about precision, and float16 keeps only about three
        significant decimal digits, so disable it when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its columns downcast.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
def get_eval_metric(model, X, y_true):
    """Return the ROC AUC of ``model`` on ``X`` measured against ``y_true``.

    The score handed to ``roc_auc_score`` is column 1 of
    ``model.predict_proba(X)``.
    """
    class_probabilities = model.predict_proba(X)
    return roc_auc_score(y_true, class_probabilities[:, 1])

## Configs

* `DEBUG_MODE`: reduce the number of rows to 500, for faster testing and debugging
* `RANDOM_STATE`: fixed random seed used for the model and for the train/validation split
* `N_TRIALS`: number of Optuna trials


In [None]:
# When True, the training data is truncated to its first 500 rows after loading.
DEBUG_MODE = False

# Seed passed to the model constructor and to train_test_split.
RANDOM_STATE = 123
# Number of trials handed to study.optimize.
N_TRIALS = 10

## Read data

In [None]:
def read_train_data(
    reduce_memory=True,
    data_path="/kaggle/input/tabular-playground-series-oct-2021/train.csv",
):
    """Load the training CSV and split it into features and target.

    Parameters
    ----------
    reduce_memory : bool, default True
        If true, pass the loaded frame through ``reduce_memory_usage`` before
        the target column is split off.
    data_path : str, optional
        Location of the CSV file. It must contain an ``id`` column (used as
        the index) and a ``target`` column. Defaults to the Kaggle input path
        this notebook was written for.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The feature frame without ``target``, and the target cast to int8.
    """
    train = pandas.read_csv(data_path, index_col="id")
    if reduce_memory:
        train = reduce_memory_usage(train, verbose=True)
    # pop() removes the column from the frame, so ``train`` holds features only.
    target = train.pop("target").astype("int8")
    return train, target

In [None]:
# Load the training set: X holds the feature columns, y the int8 target.
X, y = read_train_data()

In [None]:
# Debug runs keep only the first 500 rows (selected by position).
if DEBUG_MODE:
    X = X.iloc[:500]
    y = y.iloc[:500]

## Model

In [None]:
def train_model(
    model_class,
    params_dict,
    X_train,
    y_train,
    X_val,
    y_val,
    early_stopping_rounds=150,
):
    """Fit a model with early stopping and report its train/validation AUC.

    Parameters
    ----------
    model_class : type
        Estimator class. Its ``fit`` must accept ``early_stopping_rounds``,
        ``eval_set`` and ``verbose`` (this notebook passes
        ``xgboost.XGBClassifier``).
    params_dict : dict
        Keyword arguments for the ``model_class`` constructor. A
        ``"random_state"`` entry overrides the notebook-wide ``RANDOM_STATE``.
    X_train, y_train
        Training features and labels.
    X_val, y_val
        Validation features and labels, used as the early-stopping eval set.
    early_stopping_rounds : int, default 150
        Value forwarded to ``fit(early_stopping_rounds=...)``.

    Returns
    -------
    tuple
        ``(fitted_model, auc_val)``.

    Notes
    -----
    The validation set drives early stopping and is also the set ``auc_val``
    is computed on, so ``auc_val`` is an optimistic estimate.
    """
    # RANDOM_STATE is only a default: merging it into the kwargs lets an explicit
    # "random_state" in params_dict win instead of raising a duplicate-keyword
    # TypeError.
    model = model_class(**{"random_state": RANDOM_STATE, **params_dict})
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor rather than on fit() — confirm against the installed version.
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=early_stopping_rounds,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )
    auc_train = get_eval_metric(model, X=X_train, y_true=y_train)
    auc_val = get_eval_metric(model, X=X_val, y_true=y_val)
    print(f"AUC train={auc_train}")
    print(f"AUC val={auc_val}")
    return model, auc_val

### Hyperparameter Optimisation



In [None]:
# Constructor settings shared by every trial and merged into the final
# parameter file; none of them is tuned by Optuna.
# NOTE(review): "use_label_encoder", "gpu_hist", "gpu_id" and "predictor" look
# like options of older XGBoost releases — confirm the installed version still
# accepts them.
base_params = {
    "use_label_encoder": False,
    "n_jobs": 2,
    "tree_method": "gpu_hist",
    "gpu_id": 0,
    "predictor": "gpu_predictor",
    "eval_metric" : "auc",
}

def objective(trial):
    """Optuna objective: train one XGBClassifier and return its negated validation AUC.

    The data is split 80/20 with a fixed seed, so every trial is scored on the
    same validation rows. The sampled hyperparameters are merged with
    ``base_params``; on a key collision ``base_params`` wins.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ``-auc_val``, so that a study created with ``direction="minimize"``
        maximizes the validation AUC.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

    trial_params = {
        # ``step`` is passed by keyword: recent Optuna releases deprecate
        # positional use and make the argument keyword-only.
        "n_estimators": trial.suggest_int("n_estimators", 400, 5000, step=200),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.2, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
    }
    model_params = {**trial_params, **base_params}

    _, auc_val = train_model(
        xgboost.XGBClassifier, model_params, X_train, y_train, X_val, y_val
    )

    return -auc_val

In [None]:
# The objective returns -AUC, hence direction="minimize".
# The sampler is seeded with RANDOM_STATE so the hyperparameter proposals do
# not change between runs; an unseeded sampler makes the search unrepeatable
# even though the split and the model are seeded.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=N_TRIALS, gc_after_trial=True)

In [None]:
# Start from the fixed settings, then overlay the best trial's hyperparameters.
best_model_params = dict(base_params)
best_model_params.update(study.best_params)

In [None]:
# Display the merged parameter dictionary as the cell output.
best_model_params

## Create best params file

In [None]:
# Persist the merged parameters as pretty-printed JSON in the Kaggle working
# directory. ``json`` is imported in the notebook's import cell.
with open("/kaggle/working/best_model_params.json", "w") as json_file:
    json.dump(best_model_params, json_file, indent=2)

In [None]:
# Print the saved file to check what was written.
!cat /kaggle/working/best_model_params.json