# TPS September 2021 - VotingClassifier Baseline

## Import libraries

In [None]:
%%time

import os
import logging
import sys
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier as HGBClassifier
from sklearn.ensemble import VotingClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## Load datasets

In [None]:
%%time

# Input locations of the fold-annotated training data and the raw
# competition files.
folds_dir = "../input/tps-september-2021-skfolds/"
data_dir = "../input/tabular-playground-series-sep-2021/"

df_train = pd.read_csv(folds_dir + "train_folds.csv")
df_test = pd.read_csv(data_dir + "test.csv")
submission = pd.read_csv(data_dir + "sample_solution.csv")

# Raw feature columns are the ones whose name starts with "f" (f1, f2, ...).
# A prefix match is used instead of a substring match so that unrelated
# columns that merely contain the letter "f" are never picked up.
features = [col for col in df_test.columns if col.startswith("f")]

# Work on an independent copy: new columns are assigned to this frame later,
# which must not happen on a slice of the frame returned by read_csv.
df_test = df_test[features].copy()

# constants
TARGET = "claim"

## Feature engineering

In [None]:
%%time

def add_new_features(df, feature_cols=None):
    """Add row-wise summary statistics of the raw features to ``df``.

    The frame is modified in place and also returned. Every statistic is
    computed over the raw feature columns only, so the result for a row does
    not depend on which other columns (ids, target, fold index, ...) happen
    to be present in the frame. This keeps train and test features
    consistent.

    Idea from https://www.kaggle.com/realtimshady/single-simple-lightgbm

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the raw feature columns.
    feature_cols : list of str, optional
        Columns to summarise. Defaults to the module-level ``features`` list
        as it is at call time.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns ``n_missing``,
        ``n_missing_std``, ``abs_sum``, ``sem``, ``std``, ``avg``, ``max``
        and ``min`` added.
    """
    cols = features if feature_cols is None else list(feature_cols)

    # Snapshot of the raw features taken before any new column is added.
    raw = df[cols]
    is_missing = raw.isna()

    df["n_missing"] = is_missing.sum(axis=1)
    # Spread of the missing-value indicator across the raw features only;
    # cast to float first so the reduction runs on a numeric dtype.
    df["n_missing_std"] = is_missing.astype("float").std(axis=1)
    df["abs_sum"] = raw.abs().sum(axis=1)
    df["sem"] = raw.sem(axis=1)
    df["std"] = raw.std(axis=1)
    df["avg"] = raw.mean(axis=1)
    df["max"] = raw.max(axis=1)
    df["min"] = raw.min(axis=1)

    return df

# Engineer the row-wise statistics on both frames (train first, then test).
df_train, df_test = add_new_features(df_train), add_new_features(df_test)

# Names of the engineered columns, in the order add_new_features creates them.
new_features = [
    "n_missing",
    "n_missing_std",
    "abs_sum",
    "sem",
    "std",
    "avg",
    "max",
    "min",
]

# From here on the model feature list includes the engineered columns.
features.extend(new_features)

In [None]:
# Imputation statistic to use for each raw feature column f1..f118.
# Most columns use the median; the exceptions are listed by column number.
fill_value_dict = {f"f{i}": "Median" for i in range(1, 119)}

# Columns imputed with the mean.
fill_value_dict.update(
    dict.fromkeys(
        (
            f"f{i}"
            for i in (
                1, 6, 11, 13, 15, 22, 34, 36, 43, 46, 48,
                54, 55, 57, 69, 76, 79, 90, 97, 113, 118,
            )
        ),
        "Mean",
    )
)

# Columns imputed with the mode.
fill_value_dict.update(
    dict.fromkeys(
        (
            f"f{i}"
            for i in (
                5, 23, 29, 40, 42, 47, 49, 50, 56, 65,
                70, 74, 75, 77, 81, 83, 91, 100, 109, 116,
            )
        ),
        "Mode",
    )
)


# Fill missing values column by column using the statistic configured in
# fill_value_dict. The statistic is always computed on the training data and
# then applied to both train and test.
for col in tqdm(features):
    strategy = fill_value_dict.get(col)
    if strategy == "Mean":
        fill_value = df_train[col].mean()
    elif strategy == "Median":
        fill_value = df_train[col].median()
    elif strategy == "Mode":
        fill_value = df_train[col].mode().iloc[0]
    else:
        # Columns without a configured strategy (the engineered features)
        # are skipped; otherwise they would be filled with the value left
        # over from the previously processed column.
        continue

    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the in-place form acts on an intermediate object and
    # is not guaranteed to update the frame.
    df_train[col] = df_train[col].fillna(fill_value)
    df_test[col] = df_test[col].fillna(fill_value)

## Preprocessing

In [None]:
%%time

# Preprocessing applied to every model feature: a constant-value imputer
# for any NaN that is still present, followed by robust scaling.
pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
        ("scaler", RobustScaler()),
    ]
)

# Fit on the training features only, then apply the fitted transform to test.
df_train[features] = pipe.fit_transform(df_train[features])
df_test[features] = pipe.transform(df_test[features])

## Model params

In [None]:
# Keyword arguments for LGBMClassifier.
# NOTE(review): LightGBM documents that row subsampling only takes effect
# when a bagging frequency (`subsample_freq`) > 0 is set, which is not done
# here — confirm whether `subsample` is meant to be active.
lgb_params = dict(
    metric="auc",
    max_depth=3,
    num_leaves=7,
    n_estimators=5000,
    colsample_bytree=0.3,
    subsample=0.5,
    random_state=42,
    reg_alpha=18,
    reg_lambda=17,
    learning_rate=0.095,
    objective="binary",
)

# Keyword arguments for XGBClassifier.
# NOTE(review): `gpu_hist`, `gpu_id`, `predictor` and `use_label_encoder`
# are deprecated in newer XGBoost releases — confirm against the pinned
# version before upgrading.
xgb_params = dict(
    eval_metric="auc",
    # "lambda" is a reserved word in Python, so it is supplied via unpacking.
    **{"lambda": 0.004562711234493688},
    alpha=7.268146704546314,
    colsample_bytree=0.6468987558386358,
    colsample_bynode=0.29113878257290376,
    colsample_bylevel=0.8915913499148167,
    subsample=0.37130229826185135,
    learning_rate=0.021671163563123198,
    grow_policy="lossguide",
    max_depth=18,
    min_child_weight=215,
    max_bin=272,
    n_estimators=10000,
    random_state=0,
    use_label_encoder=False,
    objective="binary:logistic",
    tree_method="gpu_hist",
    # gpu
    gpu_id=0,
    predictor="gpu_predictor",
)

# Keyword arguments for CatBoostClassifier (configured to train on GPU 0).
cb_params = dict(
    random_state=42,
    eval_metric="AUC",
    iterations=15585,
    objective="CrossEntropy",
    bootstrap_type="Bernoulli",
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    verbose=0,
    # gpu
    task_type="GPU",
    devices="0",
)

# Keyword arguments for HistGradientBoostingClassifier. Early stopping is
# enabled and uses 10% of the fitted data, scored with ROC AUC.
hgb_params = dict(
    random_state=666,
    scoring="roc_auc",
    max_iter=40000,
    learning_rate=0.025,
    validation_fraction=0.1,
    early_stopping=True,
    max_depth=15,
    max_leaf_nodes=17,
    min_samples_leaf=12173,
)

## Predict

In [None]:
def predict(df_train, df_test, folds=5):
    """Fit a soft-voting ensemble on each fold and collect its predictions.

    For every fold index in ``range(folds)`` the rows of ``df_train`` whose
    ``kfold`` value equals the index are held out for validation, a fresh
    ``VotingClassifier`` over the LightGBM, XGBoost, CatBoost and
    HistGradientBoosting models is fitted on the remaining rows, and
    positive-class probabilities are predicted for the held-out rows and for
    the test set.

    Reads the module-level ``features``, ``TARGET`` and ``*_params`` objects.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data with the columns ``id``, ``kfold``, the target column
        and all columns listed in ``features``.
    df_test : pandas.DataFrame
        Test data containing all columns listed in ``features``.
    folds : int, default 5
        Number of folds; fold indices ``0 .. folds - 1`` are used.

    Returns
    -------
    test_preds : numpy.ndarray
        Test-set probabilities averaged over the folds.
    valid_preds : pandas.DataFrame
        Out-of-fold probabilities, one row per validation ``id`` (the id is
        in the first column, the probability in the second).
    scores : list of float
        ROC AUC on the held-out part of each fold.
    """
    models = [
        ("lgb", LGBMClassifier(**lgb_params)),
        ("xgb", XGBClassifier(**xgb_params)),
        ("cb", CatBoostClassifier(**cb_params)),
        ("hgb", HGBClassifier(**hgb_params)),
    ]
    # Voting weights, aligned by position with `models`.
    weights = [0.5, 0.4, 0.4, 0.3]

    # The test matrix is identical for every fold, so build it once. Selecting
    # `features` explicitly guarantees the same columns, in the same order,
    # as the matrices the models are fitted on.
    x_test = df_test[features]

    test_preds = []
    valid_preds = {}
    scores = []

    for fold in range(folds):
        x_train = df_train[df_train.kfold != fold].reset_index(drop=True)
        x_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

        # Remember the ids before narrowing the frames to the feature columns.
        valid_ids = x_valid.id.values.tolist()

        y_train = x_train[TARGET]
        y_valid = x_valid[TARGET]

        x_train = x_train[features]
        x_valid = x_valid[features]

        model = VotingClassifier(
            estimators=models,
            voting="soft",
            weights=weights,
            n_jobs=-1,
        )
        model.fit(x_train, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        valid_pred = model.predict_proba(x_valid)[:, 1]
        test_pred = model.predict_proba(x_test)[:, 1]

        test_preds.append(test_pred)
        valid_preds.update(dict(zip(valid_ids, valid_pred)))

        score = roc_auc_score(y_valid, valid_pred)
        print(f"Fold {fold} | AUC: {score}")
        scores.append(score)

    # Average the per-fold test predictions row by row.
    test_preds = np.mean(np.column_stack(test_preds), axis=1)
    valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()

    return test_preds, valid_preds, scores

In [None]:
# Run the cross-validated ensemble, then print the mean and the standard
# deviation of the per-fold AUC scores.
test_preds, valid_preds, scores = predict(df_train=df_train, df_test=df_test)
print(*(stat(scores) for stat in (np.mean, np.std)))

## Save

In [None]:
# Out-of-fold predictions on the training rows.
valid_preds.columns = ["id", "vote_pred_2"]
valid_preds.to_csv("vote_train_2.csv", index=False)

# Fold-averaged test predictions, stored under the same column name as the
# out-of-fold file.
test_preds_df = pd.DataFrame(
    {
        "id": submission["id"],
        "vote_pred_2": test_preds,
    }
)
test_preds_df.to_csv("vote_test_2.csv", index=False)

# Competition submission: the same predictions under the target column name.
sub = pd.DataFrame(
    {
        "id": submission["id"],
        "claim": test_preds,
    }
)
sub.to_csv("submission.csv", index=False)