Hello There,

In this TPS I combined ideas from a lot of notebooks I've read with code from previous TPSs to create this notebook.

In [None]:
import numpy as np
import pandas as pd

from warnings import filterwarnings
filterwarnings('ignore')

I load the data using pandas read_csv method. A better way would be to create a feather dataset... TBC

In [None]:
%%time
# Read the three competition CSVs in order: train, test, submission template.
df_train, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
        "../input/tabular-playground-series-oct-2021/sample_submission.csv",
    )
)

# Feature Engineering

I add some mean, std, min, max columns. Useless features will be removed by the feature selection step later.

In [None]:
# Columns that feed the row-wise statistics. The "id" column is excluded:
# it is a row identifier (dropped before modeling), and including it would
# let its magnitude dominate every mean/std/max value.
num_cols = [col for col in df_test.columns if col != "id"]

# Add the same four summary columns to both frames. The slice is taken once
# per frame, before any new column is added, so every statistic is computed
# over exactly the columns in num_cols.
for frame in (df_train, df_test):
    features = frame[num_cols]
    frame["mean"] = features.mean(axis=1)
    frame["std"] = features.std(axis=1)
    frame["min"] = features.min(axis=1)
    frame["max"] = features.max(axis=1)
    # Release the temporary slice right away; the frames are large.
    del features

Since the dataset is very large, I use this function a lot. It converts each column to the smallest datatype that fits its range.

In [None]:
def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype that holds [lo, hi], or None.

    Unsigned types are tried when ``lo`` is non-negative, signed types
    otherwise. ``lo`` and ``hi`` must be exact Python ints so the bound
    checks are not affected by float rounding.
    """
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place and return it.

    For every plain NumPy bool/int/float column:

    * if all values are finite whole numbers, the column is cast to the
      narrowest (un)signed integer dtype that contains its min and max;
    * otherwise a float column is cast to ``float32``.

    Object columns, extension dtypes, other non-numeric dtypes and empty
    columns are left untouched. Memory usage before and after is printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``props`` object, after downcasting.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in props.columns:
        column = props[col]
        dtype = column.dtype
        # Skip strings/objects, extension dtypes and anything non-numeric.
        if not isinstance(dtype, np.dtype) or dtype.kind not in "biuf":
            continue
        # min()/max() of an empty column are NaN; nothing to decide there.
        if column.empty:
            continue

        if dtype.kind == "f":
            values = column.to_numpy()
            # Check every element. Summing the residuals instead would let
            # +0.5 and -0.5 cancel out and silently truncate real floats.
            # NaN/inf can never be stored in an integer column.
            is_int = bool(
                np.isfinite(values).all() and (values == np.trunc(values)).all()
            )
        else:
            is_int = True

        target = None
        if is_int:
            # int() gives exact Python ints, so the range check is exact even
            # for values near the edges of the 64-bit range.
            target = _smallest_int_dtype(int(column.min()), int(column.max()))
        if target is None and dtype.kind == "f":
            # Fractional values, non-finite values, or whole numbers too large
            # for any integer dtype: keep as float, at 32 bit.
            target = np.float32
        if target is not None:
            props[col] = column.astype(target)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props

In [None]:
import gc

# Downcast both frames (train first, then test), then reclaim freed memory.
df_train, df_test = (reduce_mem_usage(frame) for frame in (df_train, df_test))
gc.collect()

In [None]:
# prepare dataframe for modeling
# Features: every training column except the row id and the label.
X = df_train.drop(["id", "target"], axis="columns").copy()
# Label, copied so it is independent of df_train.
y = df_train.loc[:, "target"].copy()

# Test features: the test frame without its row id.
test_data = df_test.drop("id", axis="columns").copy()

Here I find the columns with fewer than 5 unique values and treat them as the categorical columns.

In [None]:
# Count distinct values per column once; the frame is large and the count
# is needed for both masks.
n_unique = X.nunique()

# Columns with fewer than 5 distinct values are treated as categorical,
# all others as continuous.
cat_cols = X.columns[n_unique < 5]
con_cols = X.columns[n_unique >= 5]
# Positional indices of the categorical columns within X.
cat_cols_indices = [X.columns.get_loc(col) for col in cat_cols]
print(f"cat_cols: {cat_cols}")

Scale all non-categorical columns

In [None]:
import gc
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only and reuse it for the test set.
scaler = StandardScaler()

# The scaled arrays are wrapped in frames that carry the ORIGINAL index.
# Column assignment aligns on index labels, so a fresh 0..n-1 index would
# produce NaNs whenever X / test_data do not have a default RangeIndex.
X[con_cols] = reduce_mem_usage(
    pd.DataFrame(
        scaler.fit_transform(X[con_cols]), columns=con_cols, index=X.index
    )
)
test_data[con_cols] = reduce_mem_usage(
    pd.DataFrame(
        scaler.transform(test_data[con_cols]),
        columns=con_cols,
        index=test_data.index,
    )
)
gc.collect()

In [None]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split with 95% of the rows in the validation part.
# NOTE: the next cell repeats this split with test_size=.75 and rebinds the
# same four names, so these results are overwritten.
cal_X_train, cal_X_val, cal_y_train, cal_y_val = train_test_split(
    X,
    y,
    test_size=0.95,
    stratify=y,
    random_state=0,
)

# Optuna Studies

In [None]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split used by the tuning studies below:
# 25% of the rows for fitting, 75% for validation.
cal_X_train, cal_X_val, cal_y_train, cal_y_val = train_test_split(
    X,
    y,
    test_size=0.75,
    stratify=y,
    random_state=0,
)

## LGBM

I use the lgbm integration to do this study. Nothing special here. Remove my fixed parameters to tune yourself.

In [None]:
import json
import optuna
import lightgbm as lgbm
import optuna.integration.lightgbm as lgbo
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed LightGBM preset. Set this to None to re-run the Optuna LightGBM
# tuner below instead of using the fixed values.
# NOTE(review): "num_iterations" and "n_estimators" are both set, to
# different values - confirm which one LightGBM actually honours.
lgbm_params_0 = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.08,
    "device": "gpu",
    "verbose": 0,
    "feature_pre_filter": False,
    "lambda_l1": 9.314037635261775,
    "lambda_l2": 0.10613573572440353,
    "num_leaves": 7,
    "feature_fraction": 0.4,
    "bagging_fraction": 0.8391963650875751,
    "bagging_freq": 5,
    "min_child_samples": 100,
    "num_iterations": 10000,
    "n_estimators": 20000,
    "random_state": 42,
}

if lgbm_params_0 is None:
    # Tune on the calibration split with the Optuna LightGBM integration.
    lgb_train = lgbm.Dataset(cal_X_train, cal_y_train, categorical_feature=cat_cols_indices)
    lgb_valid = lgbm.Dataset(cal_X_val, cal_y_val, categorical_feature=cat_cols_indices)

    model = lgbo.train(
        {
            "objective": "binary",
            "metric": "auc",
            "categorical_feature": cat_cols_indices,
            "n_estimators": 10_000,
            "learning_rate": 0.08,
            "device": "gpu",
            "verbose": 0,
        },
        lgb_train,
        valid_sets=[lgb_valid],
        verbose_eval=False,
        num_boost_round=100,
        verbosity=0,
        early_stopping_rounds=5,
        optuna_seed=42,
    )

    lgbm_params_0 = model.params

    # Settings for the final fits, overriding whatever the tuner used.
    lgbm_params_0["n_estimators"] = 20_000
    lgbm_params_0["random_state"] = 42

    # Early stopping is passed to fit() later on. pop() with a default avoids
    # a KeyError when the tuner did not record this key.
    lgbm_params_0.pop("early_stopping_round", None)

# Persist the preset so the run can be reproduced.
with open("lgbm_params_0.json", "w") as file:
    json.dump(lgbm_params_0, file, indent=4)

I create another params preset with a different seed. I plan on using another study here in the future, with a different boosting type or something like that.

In [None]:
import json

# Same LightGBM preset as lgbm_params_0, with only the seed changed.
lgbm_params_1 = {**lgbm_params_0, "random_state": 187}

with open("lgbm_params_1.json", "w") as file:
    json.dump(lgbm_params_1, file, indent=4)

In [None]:
import json

# Third LightGBM preset: lgbm_params_0 with yet another seed.
lgbm_params_2 = {**lgbm_params_0, "random_state": 256}

with open("lgbm_params_2.json", "w") as file:
    json.dump(lgbm_params_2, file, indent=4)

## Catboost

The bounds of the search space for this study were narrowed down by previous searches. I used the plot below to determine promising bounds. Remove my fixed parameters to tune yourself.

In [None]:
import json
import optuna
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool

def objective(trial):
    """Optuna objective for CatBoost: fit one candidate and return its AUC.

    Reads the notebook-level ``cal_train_pool`` / ``cal_val_pool`` pools,
    ``cal_X_val`` / ``cal_y_val`` and ``cat_cols_indices``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``random_strength``, ``l2_leaf_reg``,
        ``max_depth`` and ``min_data_in_leaf``.

    Returns
    -------
    float
        ROC AUC of the fitted model on the calibration validation split.
    """
    param = {
        "objective": "CrossEntropy",
        "eval_metric": "AUC",
        "task_type": "GPU",
        "grow_policy": "SymmetricTree",
        "use_best_model": True,
        "learning_rate": 0.08,
        "n_estimators": 10_000,
        # Recorded under the name of the parameter it actually sets. It used
        # to be stored as "colsample_bytree", which put a key into
        # best_trial.params that did not match the tuned parameter.
        "random_strength": trial.suggest_float("random_strength", 0.5, 1.0),
        "max_bin": 128,
        "cat_features": cat_cols_indices,
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 1.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "random_state": 42,
    }

    model = CatBoostClassifier(**param)

    model.fit(
        cal_train_pool,
        eval_set=[cal_val_pool],
        early_stopping_rounds=5,
        verbose=False,
    )

    # Probability of the last class column.
    preds = model.predict_proba(cal_X_val)[:, -1]

    return roc_auc_score(cal_y_val, preds)


# Fixed CatBoost preset. Set this to None to run the Optuna study below
# instead of using the fixed values.
catb_params_0 = {
    "objective": "CrossEntropy",
    "eval_metric": "AUC",
    "task_type": "GPU",
    "grow_policy": "SymmetricTree",
    "use_best_model": True,
    "learning_rate": 0.01,
    "n_estimators": 20_000,
    "random_strength": 1.0,
    "max_bin": 128,
    "l2_leaf_reg": 0.0007202715557592255,
    "max_depth": 4,
    "min_data_in_leaf": 103,
    "random_state": 42,
}

if catb_params_0 is None:
    # Build and pre-quantize the pools that objective() reads.
    cal_train_pool = Pool(cal_X_train, cal_y_train, cat_features=cat_cols_indices)
    cal_val_pool = Pool(cal_X_val, cal_y_val, cat_features=cat_cols_indices)
    for pool in (cal_train_pool, cal_val_pool):
        pool.quantize(max_bin=128)

    median_pruner = optuna.pruners.MedianPruner(
        n_startup_trials=5, n_warmup_steps=10, interval_steps=5
    )
    study = optuna.create_study(direction="maximize", pruner=median_pruner)
    study.optimize(objective, n_trials=20)
    print("Number of finished trials:", len(study.trials))
    print("Best trial:", study.best_trial.params)

    # Start from the tuned values, then pin the settings for the final fits.
    catb_params_0 = study.best_trial.params
    catb_params_0.update(
        {
            "n_estimators": 20_000,
            "learning_rate": 0.01,
            "max_bin": 128,
            "random_strength": 1.0,
            "random_state": 42,
            "use_best_model": True,
            "objective": "CrossEntropy",
            "grow_policy": "SymmetricTree",
            "eval_metric": "AUC",
            "task_type": "GPU",
        }
    )

    del cal_train_pool, cal_val_pool

    fig = optuna.visualization.plot_parallel_coordinate(study)
    fig.show()

with open("catb_params_0.json", "w") as file:
    json.dump(catb_params_0, file, indent=4)

I try to collect the pools here:

In [None]:
import gc

# Run a garbage-collection pass after the CatBoost tuning cell. That cell
# deletes its pools only when the study actually ran.
gc.collect()

I create another params preset with a different seed. I plan on using another study here in the future, with a different boosting type or something like that.

In [None]:
import json

# Same CatBoost preset as catb_params_0, with only the seed changed.
catb_params_1 = {**catb_params_0, "random_state": 187}

with open("catb_params_1.json", "w") as file:
    json.dump(catb_params_1, file, indent=4)

In [None]:
import json

# Third CatBoost preset: catb_params_0 with yet another seed.
catb_params_2 = {**catb_params_0, "random_state": 256}

with open("catb_params_2.json", "w") as file:
    json.dump(catb_params_2, file, indent=4)

## XGBoost

As with CatBoost, the bounds of the search space for this study were also narrowed down by previous searches. I will look into more tunable params and different boosting methods. Remove my fixed parameters to tune yourself.

In [None]:
import json
from xgboost import XGBClassifier

def objective(trial):
    """Optuna objective for XGBoost: fit one candidate and return its AUC.

    Reads the notebook-level calibration split (``cal_X_train``,
    ``cal_y_train``, ``cal_X_val``, ``cal_y_val``). This definition rebinds
    the name ``objective`` used by the CatBoost study earlier in the file.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``lambda``, ``alpha``, ``colsample_bytree``,
        ``subsample`` and ``min_child_weight``.

    Returns
    -------
    float
        ROC AUC of the fitted model on the calibration validation split.
    """
    param = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "gpu_hist",
        "learning_rate": 0.02,
        "n_estimators": 10_000,
        "random_state": 42,
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        "lambda": trial.suggest_float("lambda", 0.5, 0.8, log=True),
        "alpha": trial.suggest_float("alpha", 2e-3, 0.008, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.51, 0.55),
        "subsample": trial.suggest_float("subsample", 0.5, 0.65),
        # Fixed here rather than tuned, so it never appears in
        # trial.params. Earlier variant:
        # trial.suggest_int("max_depth", 17, 18, 1)
        "max_depth": 17,
        "min_child_weight": trial.suggest_int("min_child_weight", 150, 200),
    }
    model = XGBClassifier(**param)

    model.fit(
        cal_X_train,
        cal_y_train,
        eval_set=[(cal_X_val, cal_y_val)],
        early_stopping_rounds=5,
        verbose=False,
    )

    # Probability of the last class column.
    preds = model.predict_proba(cal_X_val)[:, -1]

    return roc_auc_score(cal_y_val, preds)

# Fixed XGBoost preset. Set this to None to run the Optuna study below
# instead of using the fixed values.
# NOTE(review): objective() trains with max_depth=17, but "max_depth" is in
# neither this preset nor best_trial.params - confirm the omission is intended.
xgb_params_0 = {
    "lambda": 0.5397422302447832,
    "alpha": 0.007483070716022332,
    "colsample_bytree": 0.5400956175261262,
    "subsample": 0.50044109494562,
    "min_child_weight": 200,
    "n_estimators": 20_000,
    "random_state": 42,
    "learning_rate": 0.005,
    "tree_method": "gpu_hist",
    "eval_metric": "auc",
    "objective": "binary:logistic",
}

if xgb_params_0 is None:
    median_pruner = optuna.pruners.MedianPruner(
        n_startup_trials=5, n_warmup_steps=10, interval_steps=5
    )
    study = optuna.create_study(direction="maximize", pruner=median_pruner)
    study.optimize(objective, n_trials=50)
    print("Number of finished trials:", len(study.trials))
    print("Best trial:", study.best_trial.params)

    # Start from the tuned values, then pin the settings for the final fits.
    xgb_params_0 = study.best_trial.params
    xgb_params_0.update(
        {
            "n_estimators": 20_000,
            "random_state": 42,
            "learning_rate": 0.005,
            "tree_method": "gpu_hist",
            "eval_metric": "auc",
            "objective": "binary:logistic",
        }
    )

    fig = optuna.visualization.plot_parallel_coordinate(study)
    fig.show()

with open("xgb_params_0.json", "w") as file:
    json.dump(xgb_params_0, file, indent=4)

In [None]:
import json

# Same XGBoost preset as xgb_params_0, with only the seed changed.
xgb_params_1 = {**xgb_params_0, "random_state": 187}

with open("xgb_params_1.json", "w") as file:
    json.dump(xgb_params_1, file, indent=4)

In [None]:
# Hand-written XGBoost preset; unlike xgb_params_1 it is not derived from
# xgb_params_0. It sets the seed through "seed" rather than "random_state",
# has no "eval_metric" entry, and is not written to a JSON file here.
xgb_params_2 = {
    "objective": "binary:logistic",
    "learning_rate": 8e-3,
    "seed": 42,
    "subsample": 0.6,
    "colsample_bylevel": 0.9,
    "colsample_bytree": 0.4,
    "n_estimators": 20_000,
    "max_depth": 8,
    "alpha": 64,
    "lambda": 32,
    "min_child_weight": 8,
    "importance_type": "total_gain",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
}

# Level 1 - LGBM/CatB/XGB

I run a stratified K-fold on all models with the previously tuned hyperparameters.

In [None]:
%%time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Level-1 base models: (name, estimator) pairs, three seeds/presets per library.
models = [
    ("lgbm0", LGBMClassifier(**lgbm_params_0)),
    ("lgbm1", LGBMClassifier(**lgbm_params_1)),
    ("lgbm2", LGBMClassifier(**lgbm_params_2)),
    ("catb0", CatBoostClassifier(**catb_params_0)),
    ("catb1", CatBoostClassifier(**catb_params_1)),
    ("catb2", CatBoostClassifier(**catb_params_2)),
    ("xgb0", XGBClassifier(**xgb_params_0)),
    ("xgb1", XGBClassifier(**xgb_params_1)),
    ("xgb2", XGBClassifier(**xgb_params_2)),
]

# Per-model accumulators, keyed by model name:
#   oof_pred_tmp  - out-of-fold predictions, appended fold after fold
#                   (plus a "y_valid" entry holding the matching labels)
#   test_pred_tmp - one test-set prediction vector per fold
#   scores_tmp    - one validation AUC per fold
oof_pred_tmp = dict()
test_pred_tmp = dict()
scores_tmp = dict()

# 5-fold stratified CV over the full training set.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    
    for name, model in models:
        # First time this model name is seen: create its accumulators.
        # NOTE: the "y_valid" list is re-created here for every model during
        # the first fold; that is harmless because labels are only appended
        # after this inner loop. The insertion order of these keys determines
        # the column order of the OOF frame built later.
        if name not in scores_tmp:
            oof_pred_tmp[name] = list()
            oof_pred_tmp["y_valid"] = list()
            test_pred_tmp[name] = list()
            scores_tmp[name] = list()
     
        # The same estimator object is refit on every fold.
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid,y_valid)],
            early_stopping_rounds=500,
            verbose=0
        )
        
        # NOTE: roc_auc_score is not imported by this cell; it comes from the
        # import in the CatBoost tuning cell earlier in the notebook.
        pred_valid = model.predict_proba(X_valid)[:, -1]
        score = roc_auc_score(y_valid, pred_valid)
        
        scores_tmp[name].append(score)
        oof_pred_tmp[name].extend(pred_valid)
        
        print(f"Fold: {fold + 1} Model: {name} Score: {score}")
        print("--"*20)
        
        # Test-set prediction from this fold's fit.
        y_hat = model.predict_proba(test_data)[:, -1]
        test_pred_tmp[name].append(y_hat)
    # Labels of this fold, in the same order as the OOF predictions above.
    oof_pred_tmp["y_valid"].extend(y_valid)

# Mean validation AUC per model across the folds.
for name, model in models:
    print(f"Overall Validation Score | {name}: {np.mean(scores_tmp[name])}")
    print("::"*20)

In [None]:
# create df with base predictions on test_data
base_test_predictions = pd.DataFrame(
    {name: np.mean(np.column_stack(test_pred_tmp[name]), axis=1) 
    for name in test_pred_tmp.keys()}
)

# save csv checkpoint
base_test_predictions.to_csv("./base_test_predictions.csv", index=False)

# create simple average blend 
base_test_predictions["simple_avg"] = base_test_predictions.mean(axis=1)

# create submission file with simple blend average
simple_blend_submission = sample_submission.copy()
simple_blend_submission["claim"] = base_test_predictions["simple_avg"]
simple_blend_submission.to_csv("./simple_blend_submission.csv", index=False)

In [None]:
# create training set for meta learner based on the oof_predictions of the base models
# Training set for the meta learner: one column per base model holding its
# out-of-fold predictions, plus the matching labels in "y_valid".
oof_predictions = pd.DataFrame(dict(oof_pred_tmp))

# save csv checkpoint
oof_predictions.to_csv("./oof_predictions.csv", index=False)

# get simple blend validation score: AUC of the row-wise mean of all base models
y_valid = oof_predictions["y_valid"].copy()
y_hat_blend = oof_predictions.drop("y_valid", axis="columns").mean(axis="columns")
score = roc_auc_score(y_valid, y_hat_blend)

print(f"Overall Validation Score | Simple Blend: {score}")
print("::" * 20)

# Lvl 2 - Logistic Regression

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

# prepare meta_training set: base-model OOF predictions as features,
# the matching labels as target, and the per-model test predictions
X_meta = oof_predictions.drop("y_valid", axis="columns").copy()
y_meta = oof_predictions["y_valid"].copy()
test_meta = base_test_predictions.drop("simple_avg", axis="columns").copy()

meta_pred_tmp = []
scores_tmp = []

# create cv
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

for fold, (idx_train, idx_valid) in enumerate(kf.split(X_meta, y_meta)):
    # create train, validation sets
    X_train, X_valid = X_meta.iloc[idx_train], X_meta.iloc[idx_valid]
    y_train, y_valid = y_meta.iloc[idx_train], y_meta.iloc[idx_valid]

    # fit() returns the estimator itself
    model = LogisticRegression().fit(X_train, y_train)

    # validation prediction
    pred_valid = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, pred_valid)
    scores_tmp.append(score)

    print(f"Fold: {fold + 1} Score: {score}")
    print("--" * 20)

    # test prediction based on oof_set
    y_hat = model.predict_proba(test_meta)[:, 1]
    meta_pred_tmp.append(y_hat)

# print overall validation scores
print(f"Overall Validation Score | Meta: {np.mean(scores_tmp)}")
print("::" * 20)

In [None]:
# average meta predictions over each fold
# average meta predictions over each fold
meta_predictions = np.column_stack(meta_pred_tmp).mean(axis=1)

# create submission file: a copy of the template with "target" set to the
# averaged meta predictions
stacked_submission = sample_submission.assign(target=meta_predictions)
stacked_submission.to_csv("./stacked_submission.csv", index=False)