## TPS Dec. 2021 - Baseline XGBoost/LightGBM/CatBoost with GPU

## LightGBM with GPU support

In [None]:
%%time

# Refer to https://www.kaggle.com/vinhnguyen/gpu-acceleration-for-lightgbm and
# https://medium.com/@vipulgote4/how-to-build-and-install-lightgbm-for-gpu-acceleration-2b53f0066c02

# Delete the lightgbm package directory under site-packages (presumably the
# preinstalled CPU build), then clone the LightGBM sources with their submodules.
# NOTE(review): the site-packages path is pinned to python3.6 — confirm it matches
# the interpreter this kernel actually runs, otherwise the rm only prints an error.
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
%%time

# Refresh the apt package index, then install the Boost development libraries
# (-y answers prompts automatically, -qq keeps apt output to a minimum).
# presumably a build dependency of the GPU-enabled LightGBM — confirm against its install guide
!apt-get update && apt-get install -y -qq libboost-all-dev

In [None]:
%%time

%%bash
# Start from a clean build directory inside the cloned LightGBM checkout.
cd LightGBM && rm -rf build
mkdir build && cd build
# Configure with USE_GPU enabled, pointing CMake at the OpenCL library and
# headers located under /usr/local/cuda.
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# Compile with one make job per available processor.
make -j$(nproc)

In [None]:
%%time

# Install the Python package from the cloned sources.
# presumably --precompile makes setup.py reuse the library built in the previous
# cell instead of compiling again — confirm for the LightGBM version being cloned
!cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
%%time

# Create the OpenCL vendors directory if needed and write an ICD file naming the
# NVIDIA OpenCL library (presumably so the OpenCL loader can locate the GPU driver).
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Remove the cloned LightGBM source tree.
!rm -r LightGBM

## Import libraries

In [None]:
%%time

import os
import logging
import sys
import time
from datetime import timedelta

import warnings
warnings.simplefilter("ignore")

import gc
gc.enable()

import numpy as np
import pandas as pd

import plotly.figure_factory as ff

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
import xgboost as xgb
xgb.set_config(verbosity=0)

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## Load datasets

In [None]:
%%time

TARGET = "Cover_Type"
ID = "Id"

data_dir = "../input/tabular-playground-series-dec-2021/"
cov_dir = "../input/forest-cover-type-dataset/"

# Competition files: training set, test set and the sample submission template.
train, test, submission = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# External covtype data used later for augmentation.
# Refer to https://www.kaggle.com/lucamassaron/baseline-lightgbm-with-covtype-augmentation/notebook
# Its rows get ids numbered from len(train) upward; the frame is then restricted
# to the training columns and indexed by id.
first_new_id = len(train)
covtype = pd.read_csv(cov_dir + "covtype.csv")
covtype[ID] = range(first_new_id, first_new_id + len(covtype))
covtype = covtype[train.columns].set_index(ID)

In [None]:
%%time

# Report the (rows, columns) shape of both competition frames.
train_shape, test_shape = train.shape, test.shape
print("Train shape: ", train_shape)
print("Test shape: ", test_shape, end="\n\n")

In [None]:
# Preview the first rows of the training frame (target column still included here).
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

## Features

In [None]:
# Every training column except the row id and the target is a model feature.
non_feature_cols = (ID, TARGET)
features = [name for name in train.columns if name not in non_feature_cols]

print(f"Features ({len(features)}):")
# All names go on one line, each followed by ", " (the last one included).
print("".join(f"{name}, " for name in features), end="")

## Reduce memory usage

In [None]:
%%time

# Refer to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns (int16/int32/int64) are cast to the narrowest of
    int8/int16/int32/int64 whose range contains the column's minimum and
    maximum, limits included. Float columns (float16/float32/float64) are cast
    to float32 when their values fit float32's finite range (which may lose
    precision), otherwise to float64. Columns of any other dtype are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value sitting exactly on a dtype limit
                # (e.g. -128 or 127 for int8) still fits that dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # Comparisons against NaN (e.g. an all-NaN column) are false, so
            # such columns fall through to float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes shows 0% instead of
        # dividing by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print("Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(end_mem, reduction))
    return df

In [None]:
%%time

# Split the label off the training frame, then index both competition frames by
# id and downcast their numeric columns.
y = train.pop(TARGET)
X = reduce_mem_usage(train.set_index(ID))
X_test = reduce_mem_usage(test.set_index(ID))

# Augmentation rows: covtype samples whose target is class 4 or 5.
# Refer to https://www.kaggle.com/lucamassaron/baseline-lightgbm-with-covtype-augmentation/notebook
# The row mask is computed once and reused; the explicit copy makes sure the
# in-place casts done by reduce_mem_usage never act on a slice of covtype.
aug_mask = covtype[TARGET].isin([4, 5])
aug_X = reduce_mem_usage(covtype.loc[aug_mask, X.columns].copy())
aug_y = covtype.loc[aug_mask, TARGET]

# The raw frames (and the helper mask) are not used after this point.
del train
del test
del covtype
del aug_mask

## Predict

In [None]:
%%time

def _majority_vote(fold_preds):
    """Combine per-fold class-label predictions by majority vote.

    ``fold_preds`` is a sequence of equally long label arrays (1-D or column
    vectors), one per fold. Ties are resolved in favour of the smallest label.
    """
    stacked = np.column_stack(fold_preds)  # one column per fold
    labels = np.unique(stacked)  # sorted distinct labels
    votes = np.stack([(stacked == label).sum(axis=1) for label in labels], axis=1)
    # argmax returns the first maximum, i.e. the smallest label among tied ones.
    return labels[votes.argmax(axis=1)]


def predict_with_model(model, verbose=True, splits=5):
    """Cross-validate ``model`` and predict the test set with every fold's fit.

    Relies on the module-level ``X``/``y`` (training data), ``X_test`` and the
    ``aug_X``/``aug_y`` augmentation rows. For each stratified fold the model is
    fitted on the fold's training part plus the augmentation rows, with early
    stopping evaluated on the fold's validation part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y, eval_set=..., early_stopping_rounds=...)``
        and ``predict(X)`` returning class labels.
    verbose : bool, default True
        When true, ``verbose=1000`` is also forwarded to ``fit``.
    splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    test_preds : numpy.ndarray
        One class label per test row, chosen by majority vote over the folds.
    valid_preds : pandas.DataFrame
        Out-of-fold predictions: the row id (from the reset index) and the
        predicted label.
    scores : list of float
        Validation accuracy of each fold.
    """
    test_preds = []
    valid_preds = {}
    scores = []

    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    for fold, (idx_train, idx_valid) in enumerate(skf.split(X, y)):
        start_time = time.monotonic()

        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        valid_ids = X_valid.index.values.tolist()

        # Refer to https://www.kaggle.com/lucamassaron/baseline-lightgbm-with-covtype-augmentation/notebook
        # The augmentation rows are added to the training part only, never to
        # the validation part. pd.concat replaces DataFrame.append, which was
        # removed in pandas 2.0.
        X_train = pd.concat([X_train, aug_X])
        y_train = np.concatenate([y_train, aug_y])

        fit_params = {
            "eval_set": [(X_valid, y_valid)],
            "early_stopping_rounds": 100,
        }
        if verbose:
            # weird, but lightgbm doesn't like this param
            fit_params["verbose"] = 1000

        model.fit(X_train, y_train, **fit_params)
        # Flatten so (n, 1)-shaped label outputs behave like 1-D ones.
        valid_pred = np.asarray(model.predict(X_valid)).ravel()
        test_pred = np.asarray(model.predict(X_test)).ravel()

        test_preds.append(test_pred)
        valid_preds.update(dict(zip(valid_ids, valid_pred)))

        score = accuracy_score(y_valid, valid_pred)

        end_time = time.monotonic()
        dur = timedelta(seconds=end_time - start_time)
        print(f"Fold {fold} | Accuracy: {score} | Took: {dur}")
        scores.append(score)

    # Class labels are nominal, so averaging them across folds would produce
    # meaningless (even fractional) classes; take the most voted label instead.
    test_preds = _majority_vote(test_preds)
    valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()

    return test_preds, valid_preds, scores

In [None]:
%%time

def predict_with_models(models):
    """Run predict_with_model for every (name, estimator) pair and save the results.

    For each model three CSV files are written to the working directory:
    ``<name>_train.csv`` (out-of-fold predictions), ``<name>_test.csv`` (test
    predictions) and ``<name>_submission.csv`` (test predictions cast to int
    under the target column). Ids for the test files come from the module-level
    ``submission`` frame.
    """
    print(f"Predicting with {len(models)} models...", end="\n\n")
    separator = "-" * 50
    for model_name, model in models:
        began = time.monotonic()

        print(separator)
        print(f"Using {model_name} model...")
        # The verbose flag is switched off for model names containing "lgb".
        test_preds, valid_preds, scores = predict_with_model(
            model, verbose=("lgb" not in model_name)
        )
        print(f"Score: {np.mean(scores)}, Std: {np.std(scores)}", end="\n\n")

        print("Saving predictions...")
        test_ids = submission[ID]

        valid_preds.columns = [ID, model_name]
        valid_preds.to_csv(f"{model_name}_train.csv", index=False)

        pd.DataFrame({ID: test_ids, model_name: test_preds}).to_csv(
            f"{model_name}_test.csv", index=False
        )

        pd.DataFrame({ID: test_ids, TARGET: test_preds.astype(int)}).to_csv(
            f"{model_name}_submission.csv", index=False
        )

        elapsed = timedelta(seconds=time.monotonic() - began)
        print(f"Took: {elapsed}")

In [None]:
%%time

# Seed shared by all three model configurations.
SEED = 42

# LightGBM: multiclass objective, trained on GPU platform 0 / device 0.
lgb1_params = {
    "random_state": SEED,
    "n_estimators": 1500,
    "objective" : "multiclass",
    "verbose": 0,
    # gpu
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
}

# XGBoost: softmax multiclass objective with the GPU histogram tree method.
xgb1_params = {
    "random_state": SEED,
    "n_estimators": 1500,
    "objective":"multi:softmax",
    "booster": "gbtree",
    # XGBClassifier's constructor parameter is "verbosity"; "verbose" is not one
    # of its parameters.
    "verbosity": 0,
    # gpu
    "gpu_id": 0,
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor"
}

# CatBoost: MultiClass loss, trained on GPU device "0".
cb1_params = {
    "random_seed": SEED,
    "iterations": 1500,
    "loss_function": "MultiClass",
    "verbose": 0,
    # gpu
    "task_type" : "GPU",
    "devices" : "0",
}

# Model name must be unique: it is used as the prefix of the output CSV files,
# and names containing "lgb" are run without the extra verbose fit argument.
models = [
    ("lgb1", LGBMClassifier(**lgb1_params)),
    ("xgb1", XGBClassifier(**xgb1_params)),
    ("cb1", CatBoostClassifier(**cb1_params)),
]

In [None]:
%%time

# Run cross-validated training and prediction for every configured model and
# write each model's CSV outputs.
predict_with_models(models)

## Visualize

In [None]:
def load_viz_data(submission_files):
    """Read each submission CSV and return its target column.

    All files are read first; the TARGET column of each is selected afterwards.
    The returned list holds one Series per file, in the order of
    ``submission_files``.
    """
    frames = [pd.read_csv(path) for path in submission_files]
    return [frame[TARGET] for frame in frames]

In [None]:
%%time

# One submission CSV per configured model, named after the model.
submission_files = [f"{name}_submission.csv" for name, _ in models]
viz_data = load_viz_data(submission_files)

# Overlay the predicted-class density curve of every submission
# (histogram bars and rug marks are turned off).
fig = ff.create_distplot(
    hist_data=viz_data,
    group_labels=submission_files,
    show_hist=False,
    show_rug=False,
)
fig.show()