# Libraries

In [None]:
import math
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgbm
import optuna.integration.lightgbm as lgbo
import scipy.stats as stats
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, QuantileTransformer, OneHotEncoder
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, StackingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
import statsmodels.api as sm
import catboost
import optuna

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import callbacks
from tensorflow.keras import layers
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras import metrics
from keras.models import Model

import multiprocessing

sns.set_theme()
sns.set_palette(palette = "rainbow")

gpu_available = tf.test.is_gpu_available()
%matplotlib inline

# Data

In [None]:
# Training frame: `id` is used as the index and `loss` is split off as the target.
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv', index_col='id')
target = train['loss']
train = train.drop(columns=['loss'])

# Test frame, indexed the same way, and the submission template.
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv', index_col='id')
preds = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# Scale Data

In [None]:
# f1 and f86 are the two columns treated as categorical.
cat_cols = ["f1", "f86"]

# One-hot view: the encoder is fitted on train and re-used for test.
oh = OneHotEncoder()
train_oh = oh.fit_transform(train[cat_cols]).toarray()
test_oh = oh.transform(test[cat_cols]).toarray()

# Integer code per distinct training value, numbered in order of first appearance.
f1_values = train["f1"].unique()
f86_values = train["f86"].unique()
f1_map = dict(zip(f1_values, range(len(f1_values))))
f86_map = dict(zip(f86_values, range(len(f86_values))))

# Replace the raw values by their codes in place (train first, then test).
# A value that is missing from the training map raises KeyError.
for frame in (train, test):
    frame["f1"] = frame["f1"].map(f1_map.__getitem__)
    frame["f86"] = frame["f86"].map(f86_map.__getitem__)

# Keep the integer codes as plain arrays for the "_c" frames built later.
train_cat = train[cat_cols].values
test_cat = test[cat_cols].values

In [None]:
# Per-column skew reduction.
#
# For every column, each candidate transform is fitted on the TRAIN values only
# and the one giving the smallest absolute skew on train is kept. The very same
# fitted transform is then applied to test, so both frames always go through an
# identical mapping (re-fitting on test, or re-checking the guards on test,
# would let the two frames diverge).
original_skew = train.skew()


def _fit_power(method):
    """Return a fitter that learns a PowerTransformer of the given method.

    The fitter takes the training values of one column and returns a callable
    that applies the fitted transform to any 1-D array, or None when the
    method is not applicable to the training values.
    """
    def fit(values):
        # Box-Cox is only defined for strictly positive data.
        if method == "box-cox" and values.min() <= 0:
            return None

        pt = PowerTransformer(method=method, standardize=True).fit(values.reshape(-1, 1))

        def apply(v):
            if method == "box-cox" and v.min() <= 0:
                # Outside the Box-Cox domain: return NaNs so the finite check
                # below rejects this candidate instead of raising.
                return np.full(v.shape, np.nan)
            return pt.transform(v.reshape(-1, 1)).reshape(-1)

        return apply

    return fit


best = { col: ("og", original_skew[col]) for col in train.columns }

# name -> fitter(train_values) -> callable applying the transform, or None.
transformers = {
    # log1p is skipped for columns whose training values contain an exact zero.
    "log1p": lambda x: None if 0.0 in x else np.log1p,
    "square": lambda x: np.square,
    "cube": lambda x: (lambda v: v ** 3),
    "box-cox": _fit_power("box-cox"),
    "yeojohn": _fit_power("yeo-johnson"),
}

# Fitted transform chosen for each column that benefits from one.
fitted = {}

for name, fit in transformers.items():
    for col in train.columns:
        apply = fit(train[col].values)
        if apply is None:
            continue

        v_trans = apply(train[col].values)
        if not np.isfinite(v_trans).all():
            continue

        trans_skew = pd.Series(v_trans).skew()
        if abs(trans_skew) >= abs(best[col][1]):
            continue

        # Only accept a transform that is also well defined on the test values.
        if not np.isfinite(apply(test[col].values)).all():
            continue

        best[col] = (name, trans_skew)
        fitted[col] = apply

for col, apply in fitted.items():
    train[col] = apply(train[col].values)
    test[col] = apply(test[col].values)

print(sum(original_skew), sum(train.skew()))

In [None]:
# Map every feature to a normal distribution through its empirical quantiles.
# The transformer is fitted on train only and re-used for test. random_state
# pins the transformer's internal randomness so reruns give the same scaling.
sc = QuantileTransformer(output_distribution="normal", random_state=42)

train_sc = sc.fit_transform(train)
test_sc = sc.transform(test)

In [None]:
# Column names come from the frame that was scaled instead of a hard-coded
# range(100), so the cell keeps working if the feature count changes.
feature_cols = list(train.columns)

# The scaled arrays become frames with a fresh positional index; the column-wise
# concats in the next cell rely on all of these frames sharing that index.
train_sc = pd.DataFrame(train_sc, columns=feature_cols)
test_sc = pd.DataFrame(test_sc, columns=feature_cols)

# Base of the "_c" frames (scaled features, integer category codes added next).
train_c = train_sc.copy()
test_c  = test_sc.copy()

train_oh = pd.DataFrame(train_oh, columns=[ "oh_{}".format(i) for i in range(train_oh.shape[1]) ])
test_oh  = pd.DataFrame(test_oh, columns=[ "oh_{}".format(i) for i in range(test_oh.shape[1]) ])

train_cat = pd.DataFrame(train_cat, columns=["f1_c", "f86_c"])
test_cat  = pd.DataFrame(test_cat, columns=["f1_c", "f86_c"])

In [None]:
# train / test: scaled features followed by the one-hot columns.
train = pd.concat([train_sc, train_oh], axis="columns")
test  = pd.concat([test_sc, test_oh], axis="columns")

# train_c / test_c: scaled features followed by the integer category codes.
# test_c holds the same scaled columns as test_sc at this point.
train_c = pd.concat([train_c, train_cat], axis="columns")
test_c  = pd.concat([test_c, test_cat], axis="columns")

In [None]:
# The intermediate frames are no longer needed once they have been concatenated.
del train_sc, test_sc
del train_oh, test_oh
del train_cat, test_cat

# Models

In [None]:
# Number of cross-validation folds. The hold-out split used for hyperparameter
# search takes a 1 / FOLDS share of the training rows.
FOLDS = 20

print("Using {} Folds, Calibration Train Size of {}, Calibration Test Size of {}".format(FOLDS, 1 - 1 / FOLDS, 1 / FOLDS))

### Split Train/Test for Hyperparameter Search

In [None]:
# The one-hot frame and the integer-coded frame are split with identical
# settings (same seed, stratification and test fraction).
split_kwargs = dict(random_state=0, stratify=target, test_size=1 / FOLDS)

cal_X_train, cal_X_val, cal_y_train, cal_y_val = train_test_split(train, target, **split_kwargs)
cal_X_train_c, cal_X_val_c, cal_y_train_c, cal_y_val_c = train_test_split(train_c, target, **split_kwargs)
(len(cal_X_train), len(cal_X_val))

In [None]:
def score_model(mod, X, y):
    """Return the root-mean-squared error of `mod.predict(X)` against `y`.

    `mod` only needs a `predict` method; `X` is handed to it unchanged.
    """
    mse = mean_squared_error(y, mod.predict(X))
    return np.sqrt(mse)

In [None]:
# Hold-out predictions; each base model appends one column.
cal_preds = pd.DataFrame()
# RMSE per model and fold.
rmse_dict = dict()

# One (initially empty) frame per fold for the validation-row predictions and
# for the test predictions; each base model appends one column per fold.
fold_preds = {}
fold_test_preds = {}
for i in range(FOLDS):
    fold_preds[i] = pd.DataFrame()
    fold_test_preds[i] = pd.DataFrame()

## Xgb

### Hyperparameter Optimisation

In [None]:
# Flip the condition to True to rerun the Optuna search; otherwise the stored
# parameters in the else-branch are used.
if False:
    def objective(trial):
        """Optuna objective: hold-out RMSE of an XGBoost model built from sampled parameters."""
        param = dict(
            n_estimators=trial.suggest_int("n_estimators", 200, 2000, 100),
            subsample=trial.suggest_discrete_uniform("subsample", 0.6, 1, 0.1),
            colsample_bytree=trial.suggest_discrete_uniform("colsample_bytree", 0.6, 1, 0.1),
            eta=trial.suggest_loguniform("eta", 1e-3, 0.1),
            reg_alpha=trial.suggest_int("reg_alpha", 1, 50),
            reg_lambda=trial.suggest_int("reg_lambda", 5, 100),
            max_depth=trial.suggest_int("max_depth", 5, 20),
            min_child_weight=trial.suggest_int("min_child_weight", 5, 20),
            random_state=42,
            learning_rate=0.05,
            tree_method="gpu_hist" if gpu_available else "hist",
        )

        model = xgb.XGBRegressor(**param)
        model.fit(
            cal_X_train,
            cal_y_train,
            eval_set=[(cal_X_val, cal_y_val)],
            eval_metric="rmse",
            early_stopping_rounds=5,
            verbose=False,
        )

        return score_model(model, cal_X_val, cal_y_val)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=400, timeout=None)
    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    params = trial.params

    # Persist the winning parameters next to the notebook.
    with open("xgb.json", "w") as file:
        json.dump(params, file, indent=4)
else:
    params = dict(
        n_estimators=900,
        subsample=0.6,
        colsample_bytree=0.6,
        eta=0.0036403005445866375,
        reg_alpha=26,
        reg_lambda=100,
        max_depth=5,
        min_child_weight=13,
    )

# Settings applied on top of either branch.
# NOTE(review): both `eta` and `learning_rate` end up in `params`; they look like
# aliases in XGBoost - confirm which one takes effect.
params.update(
    tree_method="gpu_hist" if gpu_available else "hist",
    random_state=42,
    learning_rate=0.05,
)
params

### Stack Calibration Model

In [None]:
# Fit on the calibration-train split with early stopping on the hold-out split.
model = xgb.XGBRegressor(**params)
model.fit(
    cal_X_train,
    cal_y_train,
    eval_set=[(cal_X_val, cal_y_val)],
    eval_metric="rmse",
    early_stopping_rounds=100,
    verbose=False,
)

# Store the hold-out predictions as the "xgb" column of cal_preds.
xgb_cal = pd.DataFrame(model.predict(cal_X_val).reshape(-1, 1), columns=["xgb"])
cal_preds = pd.concat([cal_preds, xgb_cal], axis=1)

score_model(model, cal_X_val, cal_y_val)

### KFold Prediction

In [None]:
# The number of splits follows FOLDS so it always matches the keys of
# fold_preds / fold_test_preds (which are built from range(FOLDS)).
for fold, (trn_idx, val_idx) in enumerate(KFold(n_splits=FOLDS, random_state=42, shuffle=True).split(train, target)):
    print("Fold :", fold + 1)

    fold_X_train, fold_y_train = train.iloc[trn_idx], target.iloc[trn_idx]
    fold_X_test,  fold_y_test  = train.iloc[val_idx], target.iloc[val_idx]

    model = xgb.XGBRegressor(**params)
    model.fit(fold_X_train, fold_y_train, eval_set=[(fold_X_test, fold_y_test)], verbose=False, eval_metric="rmse")

    # Append this model's predictions for the fold's validation rows and for the test set.
    fold_preds[fold] = pd.concat([fold_preds[fold], pd.DataFrame(model.predict(fold_X_test)[:, np.newaxis], columns=["xgb"])], axis=1)
    fold_test_preds[fold] = pd.concat([fold_test_preds[fold], pd.DataFrame(model.predict(test)[:, np.newaxis], columns=["xgb"])], axis=1)

    score = score_model(model, fold_X_test, fold_y_test)
    rmse_dict["xgb_" + str(fold + 1)] = score

    xgb.plot_importance(model)
    plt.show()

    print('#### fold #########', score)

## LightGBM

### Hyperparameter Optimisation

In [None]:
# Flip the condition to True to rerun the Optuna LightGBM tuner; otherwise the
# stored parameters in the else-branch are used.
if False:
    params = dict(objective="mean_squared_error", metric="rmse", device="gpu" if gpu_available else "cpu")

    # LightGBM is trained without the integer category-code columns.
    lgb_cal_X_train_c = cal_X_train_c.drop(columns=["f1_c", "f86_c"])
    lgb_cal_X_val_c = cal_X_val_c.drop(columns=["f1_c", "f86_c"])

    lgb_train = lgbm.Dataset(lgb_cal_X_train_c, cal_y_train_c)
    lgb_valid = lgbm.Dataset(lgb_cal_X_val_c, cal_y_val_c)

    model = lgbo.train(
        params,
        lgb_train,
        valid_sets=[lgb_valid],
        num_boost_round=100,
        early_stopping_rounds=5,
        verbose_eval=False,
    )

    params = model.params

    with open("lgbm.json", "w") as file:
        json.dump(params, file, indent=4)
else:
    params = dict(
        objective="mean_squared_error",
        metric="rmse",
        device="gpu" if gpu_available else "cpu",
        feature_pre_filter=False,
        lambda_l1=1.4627327010463796e-08,
        lambda_l2=2.749104514966133e-08,
        num_leaves=24,
        feature_fraction=0.62,
        bagging_fraction=1.0,
        bagging_freq=0,
        min_child_samples=100,
    )

# Settings applied on top of either branch.
params.update(learning_rate=0.006, num_iterations=80000)
params

### Stack Calibration Model

In [None]:
# LightGBM is trained without the integer category-code columns.
lgb_cal_X_train_c = cal_X_train_c.drop(columns=["f1_c", "f86_c"])
lgb_cal_X_val_c = cal_X_val_c.drop(columns=["f1_c", "f86_c"])

lgb_train = lgbm.Dataset(lgb_cal_X_train_c, cal_y_train_c)
lgb_valid = lgbm.Dataset(lgb_cal_X_val_c, cal_y_val_c)

# Early stopping is evaluated on the hold-out split.
model = lgbm.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    early_stopping_rounds=100,
    verbose_eval=False,
)

# Store the hold-out predictions as the "lgbm" column of cal_preds.
lgbm_cal = pd.DataFrame(model.predict(lgb_cal_X_val_c).reshape(-1, 1), columns=["lgbm"])
cal_preds = pd.concat([cal_preds, lgbm_cal], axis=1)

score_model(model, lgb_cal_X_val_c, cal_y_val_c)

### KFold Prediction

In [None]:
# The test features do not change between folds, so they are prepared once.
lgb_test_c = test_c.drop(columns=["f1_c", "f86_c"])

# The number of splits follows FOLDS so it always matches the keys of
# fold_preds / fold_test_preds (which are built from range(FOLDS)).
for fold, (trn_idx, val_idx) in enumerate(KFold(n_splits=FOLDS, random_state=42, shuffle=True).split(train, target)):
    print("Fold :", fold + 1)

    # create dataset
    fold_X_train, fold_y_train = train_c.iloc[trn_idx], target.iloc[trn_idx]
    fold_X_test,  fold_y_test  = train_c.iloc[val_idx], target.iloc[val_idx]

    # LightGBM is trained without the integer category-code columns.
    lgb_fold_X_train = fold_X_train.drop(columns=["f1_c", "f86_c"])
    lgb_fold_X_test = fold_X_test.drop(columns=["f1_c", "f86_c"])

    lgb_train = lgbm.Dataset(lgb_fold_X_train, fold_y_train)
    lgb_valid = lgbm.Dataset(lgb_fold_X_test, fold_y_test)

    model = lgbm.train(params, lgb_train, valid_sets=[lgb_valid], verbose_eval=False, early_stopping_rounds=100)

    # Append this model's predictions for the fold's validation rows and for the test set.
    fold_preds[fold] = pd.concat([fold_preds[fold], pd.DataFrame(model.predict(lgb_fold_X_test)[:, np.newaxis], columns=["lgbm"])], axis=1)
    fold_test_preds[fold] = pd.concat([fold_test_preds[fold], pd.DataFrame(model.predict(lgb_test_c)[:, np.newaxis], columns=["lgbm"])], axis=1)

    score = score_model(model, lgb_fold_X_test, fold_y_test)
    rmse_dict["lgbm_" + str(fold + 1)] = score

    lgbm.plot_importance(model, figsize=(15,15))
    plt.show()

    print('#### fold #########', score)

## Catboost

In [None]:
# Flip the condition to True to rerun the Optuna search; otherwise the stored
# parameters in the else-branch are used.
if False:
    def objective(trial):
        """Optuna objective: hold-out RMSE of a CatBoost model built from sampled parameters."""
        param = dict(
            iterations=trial.suggest_int("iterations", 800, 4000, 100),
            learning_rate=trial.suggest_discrete_uniform("learning_rate", 0.02, 0.08, 0.01),
            depth=trial.suggest_int("depth", 2, 16, 2),
            l2_leaf_reg=trial.suggest_discrete_uniform("l2_leaf_reg", 0.2, 4, 0.1),
            random_strength=trial.suggest_discrete_uniform("random_strength", 0.5, 2, 0.5),
        )

        # Settings that are fixed for every trial.
        fixed = dict(
            thread_count=4,
            eval_metric="RMSE",
            loss_function="RMSE",
            grow_policy='Depthwise',
            leaf_estimation_method='Newton',
            bootstrap_type='Bernoulli',
            task_type="GPU" if gpu_available else "CPU",
            early_stopping_rounds=5,
            random_state=42,
        )
        model = catboost.CatBoostRegressor(**param, **fixed)

        # The two integer-coded columns are declared as categorical features.
        cat_train = catboost.Pool(cal_X_train_c, cal_y_train_c, cat_features=["f1_c", "f86_c"])
        cat_valid = catboost.Pool(cal_X_val_c, cal_y_val_c, cat_features=["f1_c", "f86_c"])

        model.fit(cat_train, eval_set=[cat_valid], verbose=False)

        valid_pred = model.predict(cat_valid)
        return np.sqrt(mean_squared_error(cal_y_val_c, valid_pred))

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=400, timeout=None)
    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    params = trial.params

    with open("catboost.json", "w") as file:
        json.dump(params, file, indent=4)
else:
    params = dict(
        iterations=2700,
        learning_rate=0.08,
        depth=4,
        l2_leaf_reg=1.0,
        random_strength=2.0,
    )

params

In [None]:
# CatBoost model for the hold-out ("calibration") split.
model = catboost.CatBoostRegressor(
    **params, 
    thread_count=4,
    eval_metric="RMSE", 
    loss_function="RMSE",
    grow_policy='Depthwise',
    leaf_estimation_method='Newton', 
    bootstrap_type='Bernoulli',
    task_type="GPU" if gpu_available else "CPU",
    early_stopping_rounds=100,
    random_state=42
)

# The two integer-coded columns are declared as categorical features.
cat_train = catboost.Pool(cal_X_train_c, cal_y_train_c, cat_features=["f1_c", "f86_c"])
cat_valid = catboost.Pool(cal_X_val_c, cal_y_val_c, cat_features=["f1_c", "f86_c"])

model.fit(cat_train, eval_set=[cat_valid], verbose=False)

# Store the hold-out predictions as the "catboost" column of cal_preds.
cal_preds = pd.concat([cal_preds, pd.DataFrame(model.predict(cat_valid)[:, np.newaxis], columns=["catboost"])], axis=1)

# Score against cal_y_val_c, the labels that belong to the frame inside cat_valid.
score_model(model, cat_valid, cal_y_val_c)

In [None]:
# The test pool does not change between folds, so it is built once.
cat_test = catboost.Pool(test_c, cat_features=["f1_c", "f86_c"])

# The number of splits follows FOLDS so it always matches the keys of
# fold_preds / fold_test_preds (which are built from range(FOLDS)).
for fold, (trn_idx, val_idx) in enumerate(KFold(n_splits=FOLDS, random_state=42, shuffle=True).split(train, target)):
    print("Fold :", fold + 1)

    # create dataset
    fold_X_train, fold_y_train = train_c.iloc[trn_idx], target.iloc[trn_idx]
    fold_X_test,  fold_y_test  = train_c.iloc[val_idx], target.iloc[val_idx]

    model = catboost.CatBoostRegressor(
        **params, 
        thread_count=4,
        eval_metric="RMSE", 
        loss_function="RMSE",
        grow_policy='Depthwise',
        leaf_estimation_method='Newton', 
        bootstrap_type='Bernoulli',
        task_type="GPU" if gpu_available else "CPU",
        early_stopping_rounds=100,
        random_state=42
    )

    # The two integer-coded columns are declared as categorical features.
    cat_train = catboost.Pool(fold_X_train, fold_y_train, cat_features=["f1_c", "f86_c"])
    cat_valid = catboost.Pool(fold_X_test, fold_y_test, cat_features=["f1_c", "f86_c"])

    model.fit(cat_train, eval_set=[cat_valid], verbose=False)

    # Append this model's predictions for the fold's validation rows and for the test set.
    fold_preds[fold] = pd.concat([fold_preds[fold], pd.DataFrame(model.predict(cat_valid)[:, np.newaxis], columns=["catboost"])], axis=1)
    fold_test_preds[fold] = pd.concat([fold_test_preds[fold], pd.DataFrame(model.predict(cat_test)[:, np.newaxis], columns=["catboost"])], axis=1)

    score = score_model(model, cat_valid, fold_y_test)
    rmse_dict["cat_" + str(fold + 1)] = score

    print('#### fold #########', score)

## ATTN-DNN

### Model Definition

In [None]:
def keras_model(block_activation: str, final_activation: str, kernel_initializer: str, embedding_width_1: int, embedding_width_2: int, lr: float, blocks: int, dropout: float, **kwargs):
    """Build and compile the attention-based regression network.

    The model takes three inputs: the numeric features (every column of
    `train_c` except the two category codes), the `f1_c` code and the `f86_c`
    code. Each code is embedded, the embeddings are concatenated with the
    numeric features, and the result goes through `blocks` repetitions of
    attention + residual dense block before a single-unit output layer.

    Args:
        block_activation: activation used inside each dense block.
        final_activation: activation of the output unit.
        kernel_initializer: initializer for the attention and block Dense layers.
        embedding_width_1: embedding size for `f1_c`.
        embedding_width_2: embedding size for `f86_c`.
        lr: Adam learning rate.
        blocks: number of attention + dense repetitions.
        dropout: dropout rate applied inside the attention layer.
        **kwargs: accepted and ignored.

    Returns:
        A compiled `keras.Model` (MSE loss, RMSE metric).
    """
    n_numeric = train_c.shape[1] - 2
    width = n_numeric + embedding_width_1 + embedding_width_2

    def attn(inp):
        """Self-attention over the `width` units of a single sample."""
        k = layers.Dense(width, kernel_initializer = kernel_initializer)(inp)
        q = layers.Dense(width, kernel_initializer = kernel_initializer)(inp)
        v = layers.Dense(width, kernel_initializer = kernel_initializer)(inp)

        # Give every unit its own position, (batch, width, 1), so the attention
        # scores are computed between units of the same sample. Feeding the 2-D
        # tensors instead makes the layer attend across the rows of the batch.
        k_r = layers.Reshape((width, 1))(k)
        q_r = layers.Reshape((width, 1))(q)
        v_r = layers.Reshape((width, 1))(v)

        # Keras expects the inputs in [query, value, key] order.
        a = layers.Attention(use_scale=True, dropout=dropout)([ q_r, v_r, k_r ])
        a = layers.Reshape((width,))(a)
        a = layers.LayerNormalization()(a)

        return a

    def block(inp):
        """Dense layer + activation with a residual connection and layer norm."""
        x = layers.Dense(width, kernel_initializer = kernel_initializer)(inp)
        x = layers.Activation(block_activation)(x)
        x = layers.Add()([x, inp])
        x = layers.LayerNormalization()(x)

        return x

    inp_s = layers.Input(shape=(n_numeric,))
    inp_c_1 = layers.Input(shape=(1,))
    inp_c_2 = layers.Input(shape=(1,))

    # Size the embedding tables from the largest code in the full training
    # frame: a subset of the rows may not contain every code, so its nunique()
    # can be smaller than the largest index that is looked up later.
    n_codes_1 = int(train_c["f1_c"].max()) + 1
    n_codes_2 = int(train_c["f86_c"].max()) + 1

    e1 = layers.Flatten()(layers.Embedding(n_codes_1, embedding_width_1)(inp_c_1))
    e2 = layers.Flatten()(layers.Embedding(n_codes_2, embedding_width_2)(inp_c_2))

    x = layers.Concatenate()([inp_s, e1, e2])

    for _ in range(blocks):
        i = x
        x = attn(x)
        x = block(x)
        # Residual connection around the whole attention + dense repetition.
        x = layers.Add()([ x, i ])

    x = layers.Dense(1, activation=final_activation, kernel_initializer ="lecun_normal")(x)

    model = keras.Model(inputs=[inp_s, inp_c_1, inp_c_2], outputs=x)
    # `learning_rate` is the current argument name; `lr` is a deprecated alias.
    model.compile(optimizer=Adam(learning_rate=lr), loss="mse", metrics=[metrics.RootMeanSquaredError()])

    return model

### Hyperparameter Optimisation

In [None]:
# With the condition set to True the Optuna search runs; the else-branch holds
# stored parameters for skipping it.
if True:
    def objective(trial):
        """Optuna objective: hold-out RMSE of the attention network for sampled parameters."""
        params = dict(
            blocks=trial.suggest_int("blocks", 4, 16),
            dropout=trial.suggest_float("dropout", 0, 0.5),
            lr=trial.suggest_loguniform("lr", 1e-5, 1e-3),
            block_activation=trial.suggest_categorical("block_activation", ["elu", "relu"]),
            final_activation=trial.suggest_categorical("final_activation", ["elu", "relu"]),
            kernel_initializer=trial.suggest_categorical("kernel_initializer", ["lecun_normal", "random_normal", "random_uniform", "glorot_uniform"]),
            embedding_width_1=trial.suggest_int("embedding_width_1", 16, 64, 8),
            embedding_width_2=trial.suggest_int("embedding_width_2", 16, 64, 8),
        )

        model = keras_model(**params)

        # The network takes [numeric features, f1 code, f86 code].
        train_inputs = [ cal_X_train_c.drop(columns=["f1_c", "f86_c"]), cal_X_train_c[["f1_c"]], cal_X_train_c[["f86_c"]] ]
        val_inputs = [ cal_X_val_c.drop(columns=["f1_c", "f86_c"]), cal_X_val_c[["f1_c"]], cal_X_val_c[["f86_c"]] ]

        stopper = callbacks.EarlyStopping(min_delta=0.001, patience=10, verbose=0)
        # Only the weights with the lowest validation RMSE are kept on disk.
        saver = callbacks.ModelCheckpoint("/tmp/checkpoint", monitor="val_root_mean_squared_error", mode="min", save_best_only=True, save_weights_only=True)
        model.fit(
            train_inputs,
            cal_y_train_c,
            validation_data=(val_inputs, cal_y_val_c),
            batch_size=2048,
            epochs=100,
            callbacks=[stopper, saver],
            verbose=0,
        )
        # Score the checkpointed weights rather than the last epoch.
        model.load_weights("/tmp/checkpoint")

        return score_model(model, val_inputs, cal_y_val_c)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100, timeout=None)
    trial = study.best_trial
    params = trial.params

    with open("attn_dnn.json", "w") as file:
        json.dump(params, file, indent=4)
else:
    params = dict(
        blocks=5,
        dropout=0.46778133370223896,
        lr=0.0008267100960741205,
        block_activation="relu",
        final_activation="elu",
        kernel_initializer="random_uniform",
        embedding_width_1=64,
        embedding_width_2=64,
    )

params

### Stack Calibration Model

In [None]:
model = keras_model(**params)

# The network takes [numeric features, f1 code, f86 code].
train_inputs = [ cal_X_train_c.drop(columns=["f1_c", "f86_c"]), cal_X_train_c[["f1_c"]], cal_X_train_c[["f86_c"]] ]
val_inputs = [ cal_X_val_c.drop(columns=["f1_c", "f86_c"]), cal_X_val_c[["f1_c"]], cal_X_val_c[["f86_c"]] ]

earlyStopping = callbacks.EarlyStopping(min_delta=0.001, patience=10, verbose=0)
# Only the weights with the lowest validation RMSE are kept on disk.
checkpoint = callbacks.ModelCheckpoint("/tmp/checkpoint", monitor="val_root_mean_squared_error", mode="min", save_best_only=True, save_weights_only=True)
history = model.fit(
    train_inputs,
    cal_y_train_c,
    validation_data=(val_inputs, cal_y_val_c),
    batch_size=2048,
    epochs=400,
    callbacks=[earlyStopping, checkpoint],
    verbose=0,
)
# Continue with the checkpointed weights rather than those of the last epoch.
model.load_weights("/tmp/checkpoint")

# Store the hold-out predictions as the "attn_dnn" column of cal_preds.
attn_cal = pd.DataFrame(model.predict(val_inputs), columns=["attn_dnn"])
cal_preds = pd.concat([cal_preds, attn_cal], axis=1)

score_model(model, val_inputs, cal_y_val_c)

### KFold Prediction

In [None]:
# The test inputs do not change between folds, so they are prepared once.
# The network takes [numeric features, f1 code, f86 code].
test_inputs = [ test_c.drop(columns=["f1_c", "f86_c"]), test_c[["f1_c"]], test_c[["f86_c"]] ]

# The number of splits follows FOLDS so it always matches the keys of
# fold_preds / fold_test_preds (which are built from range(FOLDS)).
for fold, (trn_idx, val_idx) in enumerate(KFold(n_splits=FOLDS, random_state=42, shuffle=True).split(train_c, target)):
    print("Fold :", fold + 1)

    # create dataset
    fold_X_train, fold_y_train = train_c.iloc[trn_idx], target.iloc[trn_idx]
    fold_X_test,  fold_y_test  = train_c.iloc[val_idx], target.iloc[val_idx]

    # Both code columns ("f1_c", "f86_c") are removed from the numeric input.
    # Dropping "f86" instead would feed the integer code in place of the scaled
    # f86 feature during training only.
    train_inputs = [ fold_X_train.drop(columns=["f1_c", "f86_c"]), fold_X_train[["f1_c"]], fold_X_train[["f86_c"]] ]
    val_inputs = [ fold_X_test.drop(columns=["f1_c", "f86_c"]), fold_X_test[["f1_c"]], fold_X_test[["f86_c"]] ]

    model = keras_model(**params)

    earlyStopping = callbacks.EarlyStopping(min_delta=0.001, patience=10, verbose=0)
    # Only the weights with the lowest validation RMSE are kept on disk.
    checkpoint = callbacks.ModelCheckpoint("/tmp/checkpoint", monitor="val_root_mean_squared_error", mode="min", save_best_only=True, save_weights_only=True)
    history = model.fit(
        train_inputs,
        fold_y_train,
        validation_data=(val_inputs, fold_y_test),
        batch_size=2048,
        epochs=400,
        callbacks=[earlyStopping, checkpoint],
        verbose=0
    )
    # Predict with the checkpointed weights rather than those of the last epoch.
    model.load_weights("/tmp/checkpoint")

    # Append this model's predictions for the fold's validation rows and for the test set.
    fold_preds[fold] = pd.concat([fold_preds[fold], pd.DataFrame(model.predict(val_inputs), columns=["attndnn"])], axis=1)
    fold_test_preds[fold] = pd.concat([fold_test_preds[fold], pd.DataFrame(model.predict(test_inputs), columns=["attndnn"])], axis=1)

    score = score_model(model, val_inputs, fold_y_test)
    rmse_dict["attn_dnn_" + str(fold + 1)] = score

    print('#### fold #########', score)

## DNN

### Model Definition

In [None]:
def keras_model(block_activation: str, final_activation: str, embedding_width_1: int, embedding_width_2: int, lr: float, blocks: int, dropout: float):
    """Build and compile the residual dense regression network.

    Note: this definition re-uses the name `keras_model`, so from this cell on
    it replaces any earlier function of the same name.

    The model takes three inputs: the numeric features (every column of
    `train_c` except the two category codes), the `f1_c` code and the `f86_c`
    code. Each code is embedded, the embeddings are concatenated with the
    numeric features, and the result goes through `blocks` residual blocks
    before a single-unit output layer.

    Args:
        block_activation: activation used inside each block.
        final_activation: activation of the output unit.
        embedding_width_1: embedding size for `f1_c`.
        embedding_width_2: embedding size for `f86_c`.
        lr: Adam learning rate.
        blocks: number of residual blocks.
        dropout: dropout rate used after every Dense layer of a block.

    Returns:
        A compiled `keras.Model` (MSE loss, RMSE metric).
    """
    n_numeric = train_c.shape[1] - 2
    width = n_numeric + embedding_width_1 + embedding_width_2

    def block(inp):
        """Three Dense -> Dropout -> BatchNorm -> Activation stages plus a skip connection."""
        x = inp
        for _ in range(3):
            x = layers.Dense(width, kernel_initializer ="random_uniform")(x)
            x = layers.Dropout(dropout)(x)
            x = layers.BatchNormalization()(x)
            x = layers.Activation(block_activation)(x)

        return layers.Add()([x, inp])

    inp_s = layers.Input(shape=(n_numeric,))
    inp_c_1 = layers.Input(shape=(1,))
    inp_c_2 = layers.Input(shape=(1,))

    # Size the embedding tables from the largest code in the full training
    # frame: a subset of the rows may not contain every code, so its nunique()
    # can be smaller than the largest index that is looked up later.
    n_codes_1 = int(train_c["f1_c"].max()) + 1
    n_codes_2 = int(train_c["f86_c"].max()) + 1

    e1 = layers.Flatten()(layers.Embedding(n_codes_1, embedding_width_1)(inp_c_1))
    e2 = layers.Flatten()(layers.Embedding(n_codes_2, embedding_width_2)(inp_c_2))

    x = layers.Concatenate()([inp_s, e1, e2])

    for _ in range(blocks):
        x = block(x)

    x = layers.Dense(1, activation=final_activation, kernel_initializer ="random_uniform")(x)

    model = keras.Model(inputs=[inp_s, inp_c_1, inp_c_2], outputs=x)
    # `learning_rate` is the current argument name; `lr` is a deprecated alias.
    model.compile(optimizer=Adam(learning_rate=lr), loss="mse", metrics=[metrics.RootMeanSquaredError()])

    return model

### Hyperparameter Optimisation

In [None]:
# Flip the condition to True to rerun the Optuna search; otherwise the stored
# parameters in the else-branch are used.
if False:
    def objective(trial):
        """Optuna objective: hold-out RMSE of the dense network for sampled parameters."""
        params = dict(
            blocks=trial.suggest_int("blocks", 8, 32),
            dropout=trial.suggest_float("dropout", 0, 0.5),
            lr=trial.suggest_loguniform("lr", 1e-5, 1e-3),
            block_activation=trial.suggest_categorical("block_activation", ["elu", "relu"]),
            final_activation=trial.suggest_categorical("final_activation", ["elu", "relu"]),
            embedding_width_1=trial.suggest_int("embedding_width_1", 16, 64, 8),
            embedding_width_2=trial.suggest_int("embedding_width_2", 16, 64, 8),
        )

        model = keras_model(**params)

        # The network takes [numeric features, f1 code, f86 code].
        train_inputs = [ cal_X_train_c.drop(columns=["f1_c", "f86_c"]), cal_X_train_c[["f1_c"]], cal_X_train_c[["f86_c"]] ]
        val_inputs = [ cal_X_val_c.drop(columns=["f1_c", "f86_c"]), cal_X_val_c[["f1_c"]], cal_X_val_c[["f86_c"]] ]

        stopper = callbacks.EarlyStopping(min_delta=0.001, patience=10, verbose=0)
        # Only the weights with the lowest validation RMSE are kept on disk.
        saver = callbacks.ModelCheckpoint("/tmp/checkpoint", monitor="val_root_mean_squared_error", mode="min", save_best_only=True, save_weights_only=True)
        model.fit(
            train_inputs,
            cal_y_train_c,
            validation_data=(val_inputs, cal_y_val_c),
            batch_size=2048,
            epochs=100,
            callbacks=[stopper, saver],
            verbose=0,
        )
        # Score the checkpointed weights rather than the last epoch.
        model.load_weights("/tmp/checkpoint")

        return score_model(model, val_inputs, cal_y_val_c)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100, timeout=None)
    trial = study.best_trial
    params = trial.params

    with open("dnn.json", "w") as file:
        json.dump(params, file, indent=4)
else:
    params = dict(
        blocks=11,
        dropout=0.07658614707982882,
        lr=1.7141256541284283e-05,
        block_activation='elu',
        final_activation='relu',
        embedding_width_1=24,
        embedding_width_2=16,
    )

params

### Stack Calibration Model

In [None]:
model = keras_model(**params)

# The network takes [numeric features, f1 code, f86 code].
train_inputs = [ cal_X_train_c.drop(columns=["f1_c", "f86_c"]), cal_X_train_c[["f1_c"]], cal_X_train_c[["f86_c"]] ]
val_inputs = [ cal_X_val_c.drop(columns=["f1_c", "f86_c"]), cal_X_val_c[["f1_c"]], cal_X_val_c[["f86_c"]] ]

earlyStopping = callbacks.EarlyStopping(min_delta=0.001, patience=10, verbose=0)
# Only the weights with the lowest validation RMSE are kept on disk.
checkpoint = callbacks.ModelCheckpoint("/tmp/checkpoint", monitor="val_root_mean_squared_error", mode="min", save_best_only=True, save_weights_only=True)
history = model.fit(
    train_inputs,
    cal_y_train_c,
    validation_data=(val_inputs, cal_y_val_c),
    batch_size=2048,
    epochs=400,
    callbacks=[earlyStopping, checkpoint],
    verbose=0,
)
# Continue with the checkpointed weights rather than those of the last epoch.
model.load_weights("/tmp/checkpoint")

# Store the hold-out predictions as the "dnn" column of cal_preds.
dnn_cal = pd.DataFrame(model.predict(val_inputs), columns=["dnn"])
cal_preds = pd.concat([cal_preds, dnn_cal], axis=1)

score_model(model, val_inputs, cal_y_val_c)

### KFold Prediction

In [None]:
# The test inputs do not change between folds, so they are prepared once.
# The network takes [numeric features, f1 code, f86 code].
test_inputs = [ test_c.drop(columns=["f1_c", "f86_c"]), test_c[["f1_c"]], test_c[["f86_c"]] ]

# The number of splits follows FOLDS so it always matches the keys of
# fold_preds / fold_test_preds (which are built from range(FOLDS)).
for fold, (trn_idx, val_idx) in enumerate(KFold(n_splits=FOLDS, random_state=42, shuffle=True).split(train, target)):
    print("Fold :", fold + 1)

    # create dataset
    fold_X_train, fold_y_train = train_c.iloc[trn_idx], target.iloc[trn_idx]
    fold_X_test,  fold_y_test  = train_c.iloc[val_idx], target.iloc[val_idx]

    train_inputs = [ fold_X_train.drop(columns=["f1_c", "f86_c"]), fold_X_train[["f1_c"]], fold_X_train[["f86_c"]] ]
    val_inputs = [ fold_X_test.drop(columns=["f1_c", "f86_c"]), fold_X_test[["f1_c"]], fold_X_test[["f86_c"]] ]

    model = keras_model(**params)

    earlyStopping = callbacks.EarlyStopping(min_delta=0.001, patience=10, verbose=0)
    # Only the weights with the lowest validation RMSE are kept on disk.
    checkpoint = callbacks.ModelCheckpoint("/tmp/checkpoint", monitor="val_root_mean_squared_error", mode="min", save_best_only=True, save_weights_only=True)
    history = model.fit(
        train_inputs,
        fold_y_train,
        validation_data=(val_inputs, fold_y_test),
        batch_size=2048,
        epochs=400,
        callbacks=[earlyStopping, checkpoint],
        verbose=0
    )
    # Predict with the checkpointed weights rather than those of the last epoch.
    model.load_weights("/tmp/checkpoint")

    # Append this model's predictions for the fold's validation rows and for the test set.
    fold_preds[fold] = pd.concat([fold_preds[fold], pd.DataFrame(model.predict(val_inputs), columns=["dnn"])], axis=1)
    fold_test_preds[fold] = pd.concat([fold_test_preds[fold], pd.DataFrame(model.predict(test_inputs), columns=["dnn"])], axis=1)

    score = score_model(model, val_inputs, fold_y_test)
    rmse_dict["dnn_s_" + str(fold + 1)] = score

    print('#### fold #########', score)

# Stack

In [None]:
# Test-set predictions of the stacked model, one entry per fold.
preds_dict = dict()

In [None]:
# Pairwise correlation between the base models' hold-out predictions.
corr = cal_preds.corr()

# Mask the upper triangle (diagonal included) so every pair is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
heatmap_kwargs = dict(
    mask=mask,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_kwargs)

In [None]:
# Stacker inputs: the hold-out features next to the base models' hold-out predictions.
# drop=True discards the old index while re-aligning the rows with cal_preds;
# a plain reset_index() would keep it as an extra "index" feature column.
stack_X = pd.concat([cal_X_val_c.reset_index(drop=True), cal_preds], axis=1)
stack_y = cal_y_val_c

stack_cal_X_train, stack_cal_X_val, stack_cal_y_train, stack_cal_y_val = train_test_split(stack_X, stack_y, random_state=0, test_size=1 / FOLDS)
len(stack_cal_X_train), len(stack_cal_X_val)

In [None]:
# Tune the stacker's LightGBM parameters with the Optuna tuner on the stack split.
params = dict(objective="mean_squared_error", metric="rmse", device="cpu")

# The stacker is trained without the integer category-code columns.
lgb_train = lgbm.Dataset(stack_cal_X_train.drop(columns=["f1_c", "f86_c"]), stack_cal_y_train)
lgb_valid = lgbm.Dataset(stack_cal_X_val.drop(columns=["f1_c", "f86_c"]), stack_cal_y_val)

model = lgbo.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    num_boost_round=100,
    early_stopping_rounds=5,
    verbose_eval=False,
)

params = model.params

# The final stacker is trained without a validation set, so the early-stopping
# entry is removed; learning rate and iteration count are then set by hand.
params.pop("early_stopping_round")
params.update(learning_rate=0.006, num_iterations=4500)

with open("stack_lgbm.json", "w") as file:
    json.dump(params, file, indent=4)

params

In [None]:
# The test features do not change between folds, so they are prepared once.
# drop=True keeps the old index out of the feature set: a plain reset_index()
# would add an "index" column holding row positions, which mean different
# things in the train and test frames.
stack_test_X = test_c.drop(columns=["f1_c", "f86_c"]).reset_index(drop=True)

# The number of splits follows FOLDS so it always matches the keys of
# fold_preds / fold_test_preds (which are built from range(FOLDS)).
for fold, (_, val_idx) in enumerate(KFold(n_splits=FOLDS, random_state=42, shuffle=True).split(train_c, target)):
    print("Fold :", fold + 1)

    # The stacker of a fold is trained on that fold's validation rows, i.e. the
    # rows for which fold_preds[fold] holds the base models' predictions.
    fold_X, fold_y = train_c.iloc[val_idx], target.iloc[val_idx]

    fold_X = pd.concat((fold_X.drop(columns=["f1_c", "f86_c"]).reset_index(drop=True), fold_preds[fold]), axis=1)

    lgb_train = lgbm.Dataset(fold_X, fold_y)
    model = lgbm.train(params, lgb_train, verbose_eval=False)

    lgbm.plot_importance(model, figsize=(15,15))
    plt.show()

    # Test features next to the base models' test predictions of the same fold.
    preds_pre = pd.concat((stack_test_X, fold_test_preds[fold]), axis=1)

    preds_dict["stack_" + str(fold + 1)] = model.predict(preds_pre)

## Model Performance Analysis

In [None]:
# One bar per entry of rmse_dict, in insertion order.
keys = list(rmse_dict)
vals = list(rmse_dict.values())

plt.figure(figsize=(15, 10))
g = sns.barplot(x=keys, y=vals)
# Fixed y-range for the bars.
g.set_ylim(7.5, 8.2)
None

## Ensemble

In [None]:
# Without any stacked predictions the average below would be undefined and an
# all-NaN submission would be written, so fail loudly instead.
if not preds_dict:
    raise ValueError("preds_dict is empty - run the stacking cell before building the submission")

# Read the submission template once and copy it for every file that is written.
submission_template = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# One submission file per stacked fold model.
for i, pred in enumerate(preds_dict.values()):
    pred_ = submission_template.copy()
    pred_["loss"] = pred
    pred_.to_csv('submission_'+ str(i) +'.csv', index=False)

# Final submission: plain average over all stacked fold models.
total_preds = np.mean(list(preds_dict.values()), axis=0)

sub = submission_template.copy()
sub["loss"] = total_preds
sub.to_csv('submission.csv',index=False)
sub