## TPS Oct. 2021 - Baseline LGBM/XGB/CB

Please see [TPS Oct. 2021 - Baseline AdaBoost/HistGB](https://www.kaggle.com/stevenrferrer/tps-oct-2021-baseline-adaboost-histgb) for the sklearn boosting models. I separated those models because they are very slow to run, and I would quickly run out of GPU quota if I included them here.

## Import libraries

In [None]:
%%time

import os
import logging
import sys
import time
from datetime import timedelta

import warnings
warnings.simplefilter("ignore")

import gc
gc.enable()

import numpy as np
import pandas as pd

import plotly.figure_factory as ff

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## Load datasets

In [None]:
%%time

# Refer to https://www.kaggle.com/bextuychiev/how-to-work-w-million-row-datasets-like-a-pro

def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Signed-integer columns are narrowed to the first of int8 / int16 / int32 /
    int64 whose range contains the column's minimum and maximum. Float columns
    are narrowed to float16 or float32 when their range fits, otherwise they
    are stored as float64. Columns of any other dtype are left untouched.

    NOTE: the float target is chosen from the value range only, not from the
    precision the values need, so a downcast to float16 / float32 can round
    the stored values.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Bounds are inclusive: a column spanning exactly -128..127 fits
            # int8. If min/max are NaN (no values) nothing matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Mem. usage decreased to {end_mem:.2f} Mb ({reduction:.1f}% reduction)")
    return df

In [None]:
%%time

data_dir = "../input/tabular-playground-series-oct-2021/"

# Read each competition file and immediately pass the resulting frame through
# reduce_memory_usage, keeping the frame it returns.
train = pd.read_csv(data_dir + "train.csv").pipe(reduce_memory_usage)
test = pd.read_csv(data_dir + "test.csv").pipe(reduce_memory_usage)
submission = pd.read_csv(data_dir + "sample_submission.csv").pipe(reduce_memory_usage)

In [None]:
%%time

# Report the shape of both frames, followed by one empty line.
print("Train shape: ", train.shape)
print("Test shape: ", test.shape)
print()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

## Features

In [None]:
%%time

TARGET = "target"

# Model inputs: every training column except the row id and the label.
features = [col for col in train.columns if col not in ["id", TARGET]]

# Split the inputs by dtype: float columns are continuous, all remaining
# columns are grouped as categorical.
cont_features = [feat for feat in features if "float" in str(train[feat].dtype)]
cat_features = [feat for feat in features if "float" not in str(train[feat].dtype)]

# Print each group as a header line followed by its comma-separated names.
for header, names in (
    (f"All features {len(features)}:", features),
    (f"Continuous features {len(cont_features)}", cont_features),
    (f"Categorical (binary) features {len(cat_features)}", cat_features),
):
    print(header)
    for feat in names:
        print(feat, end=", ")
    print("\n\n")

In [None]:
%%time

# Split the label off the training frame; ``pop`` removes the column from
# ``train`` in place and returns it.
y = train.pop(TARGET)

# Rebind instead of copying: the name ``train`` is deleted just below, so a
# deep copy would only duplicate the whole frame in memory for no benefit.
# ``X`` still contains the "id" column.
X = train

# ``drop`` already returns a new frame, so no additional ``.copy()`` is needed.
X_test = test.drop("id", axis=1)

# Remove the old names so the data is reachable only through X / y / X_test.
del train
del test

## Predict

In [None]:
%%time

# LightGBM settings.
lgb1_params = dict(
    random_state=42,
    n_estimators=1000,
    objective="binary",
    metric="auc",
)

# XGBoost settings, configured for GPU 0.
# For a CPU run, replace the three GPU entries (gpu_id, tree_method,
# predictor) with: tree_method="hist", n_jobs=-1.
xgb1_params = dict(
    random_state=42,
    n_estimators=1000,
    eval_metric="auc",
    objective="binary:logistic",
    booster="gbtree",
    gpu_id=0,
    tree_method="gpu_hist",
    predictor="gpu_predictor",
)

# CatBoost settings, configured for GPU device "0".
cb1_params = dict(
    random_seed=42,
    iterations=1000,
    eval_metric="AUC",
    verbose=0,
    task_type="GPU",
    devices="0",
)

# (name, estimator) pairs. Each name must be unique.
lgb1_model = LGBMClassifier(**lgb1_params)
xgb1_model = XGBClassifier(**xgb1_params)
cb1_model = CatBoostClassifier(**cb1_params)
models = [
    ("lgb1", lgb1_model),
    ("xgb1", xgb1_model),
    ("cb1", cb1_model),
]

In [None]:
%%time

def predict_with_model(model, simple_fit=False, splits=5):
    """Cross-validate ``model`` and collect out-of-fold and test predictions.

    Reads the module-level ``X``, ``y``, ``X_test`` and ``features``. For every
    stratified fold the model is fitted on the training part, used to predict
    class-1 probabilities for the held-out part and for ``X_test``, and scored
    with ROC AUC on the held-out part. Per-fold score and duration are printed.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``.
        simple_fit: If true, call ``fit`` with the training data only.
            Otherwise the held-out fold is also passed as ``eval_set``
            together with ``early_stopping_rounds`` and ``verbose``.
        splits: Number of stratified folds.

    Returns:
        A tuple ``(test_preds, valid_preds, scores)``:
        the row-wise mean of the folds' test probabilities, a DataFrame
        (columns ``index`` and ``0``) mapping each training id to its
        out-of-fold probability, and the list of per-fold AUC scores.
    """
    fold_test_preds = []
    oof_by_id = {}
    scores = []

    folds = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    for fold, (idx_train, idx_valid) in enumerate(folds.split(X, y)):
        start_time = time.monotonic()

        train_part = X.iloc[idx_train]
        valid_part = X.iloc[idx_valid]
        y_train = y.iloc[idx_train]
        y_valid = y.iloc[idx_valid]

        # Capture the ids before narrowing the frames to the feature columns.
        valid_ids = valid_part["id"].values.tolist()
        X_train = train_part[features]
        X_valid = valid_part[features]

        # Extra fit arguments are only supplied in the early-stopping mode.
        fit_kwargs = {}
        if not simple_fit:
            fit_kwargs = {
                "eval_set": [(X_valid, y_valid)],
                "early_stopping_rounds": 180,
                "verbose": 1000,
            }
        model.fit(X_train, y_train, **fit_kwargs)

        # Column 1 of predict_proba is the positive-class probability.
        valid_pred = model.predict_proba(X_valid)[:, 1]
        fold_test_preds.append(model.predict_proba(X_test)[:, 1])
        oof_by_id.update(zip(valid_ids, valid_pred))

        score = roc_auc_score(y_valid, valid_pred)
        scores.append(score)

        dur = timedelta(seconds=time.monotonic() - start_time)
        print(f"Fold {fold} | AUC: {score} | Took: {dur}")

    # One column per fold; average across folds for each test row.
    mean_test_preds = np.column_stack(fold_test_preds).mean(axis=1)
    oof_frame = pd.DataFrame.from_dict(oof_by_id, orient="index").reset_index()

    return mean_test_preds, oof_frame, scores

In [None]:
%%time

def predict_with_models(models):
    """Run cross-validated prediction for each model and save the results.

    For every ``(model_name, model)`` pair, calls ``predict_with_model`` in
    early-stopping mode, prints the mean and standard deviation of the fold
    scores, and writes three CSV files to the working directory:

    * ``<model_name>_train.csv``: out-of-fold predictions, columns ``id`` and
      ``<model_name>``.
    * ``<model_name>_test.csv``: averaged test predictions, columns ``id``
      and ``<model_name>``.
    * ``<model_name>_submission.csv``: the same test predictions under the
      ``TARGET`` column.

    Ids for the test files come from the module-level ``submission`` frame.

    Args:
        models: Sized collection of ``(model_name, model)`` pairs. The name is
            used as file prefix and column name.
    """
    print(f"Predicting with {len(models)} models...", end="\n\n")
    separator = "-" * 50

    for model_name, model in models:
        start_time = time.monotonic()

        print(separator)
        print(f"Using {model_name} model...")
        test_preds, valid_preds, scores = predict_with_model(model, simple_fit=False)
        print(f"Score: {np.mean(scores)}, Std: {np.std(scores)}", end="\n\n")

        print("Saving predictions...")
        # Out-of-fold predictions, labelled with the model's name.
        valid_preds.columns = ["id", model_name]
        valid_preds.to_csv(f"{model_name}_train.csv", index=False)

        # Test predictions, once under the model's name and once under TARGET.
        pd.DataFrame({"id": submission.id, model_name: test_preds}).to_csv(
            f"{model_name}_test.csv", index=False
        )
        pd.DataFrame({"id": submission.id, TARGET: test_preds}).to_csv(
            f"{model_name}_submission.csv", index=False
        )

        dur = timedelta(seconds=time.monotonic() - start_time)
        print(f"Took: {dur}")

In [None]:
%%time

# Cross-validate every configured model and write its prediction CSV files.
predict_with_models(models)

In [None]:
# Free memory: drop the references to the training and test data.
del X, y, X_test

## Visualize

In [None]:
def load_viz_data(submission_files):
    """Read the prediction column from each submission CSV.

    Args:
        submission_files: Iterable of CSV paths. Each file must contain the
            column named by the module-level ``TARGET``.

    Returns:
        A list with one pandas Series (the ``TARGET`` column) per file, in the
        same order as ``submission_files``.
    """
    return [pd.read_csv(path)[TARGET] for path in submission_files]

In [None]:
%%time

# One submission file per configured model; the file names double as the
# legend labels of the plot.
submission_files = [f"{model_name}_submission.csv" for model_name, _ in models]
viz_data = load_viz_data(submission_files)

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data=viz_data,
    group_labels=submission_files,
    show_hist=False,
    show_rug=False,
)
fig.show()