# Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import glob
import time
import os

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import xgboost as xgb
from sklearn.neural_network import MLPRegressor

from scipy.optimize import minimize

import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas' display limits to 120 rows / 100 columns for notebook output.
pd.options.display.max_rows = 120
pd.options.display.max_columns = 100

import warnings
# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Run-wide settings shared by the cells below.
N_SPLITS = 10                  # number of KFold splits; also the divisor when averaging test predictions
N_ESTIMATORS = 10000           # 'n_estimators' entry of both XGBoost parameter dicts
EARLY_STOPPING_ROUNDS = 200    # early-stopping patience passed to both XGBoost fits
VERBOSE = 1000                 # value passed as `verbose=` to the XGBoost fits
SEED = 299792458               # seed for seed_everything, KFold, MLPRegressor and XGBoost

In [None]:
def seed_everything(seed=42):
    """Seed the random-number sources this notebook controls.

    Writes ``PYTHONHASHSEED`` into the process environment and seeds both
    the standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed applied to every source (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Apply the run-wide seed to `random`, NumPy and PYTHONHASHSEED.
seed_everything(SEED)

# Datasets

In [None]:
# Input directories: the competition CSVs and a folder of saved prediction
# arrays ("oof*.npy" go onto train, "pred*.npy" go onto test).
INPUT = "../input/tabular-playground-series-aug-2021/"
INPUT_PRED = "../input/tps08-pred/"

OUTPUT = "./output/"
os.makedirs(OUTPUT, exist_ok=True)

train = pd.read_csv(INPUT + "train.csv")
test = pd.read_csv(INPUT + "test.csv")
submission = pd.read_csv(INPUT + "sample_submission.csv")

# glob returns paths in arbitrary order, so BOTH lists must be sorted;
# sorting only one of them can pair a model's train-side array with a
# different model's test-side array.
# NOTE(review): assumes the oof*/pred* files share the same suffixes so that
# sorted order lines them up one-to-one — confirm against the dataset.
oof_files = sorted(glob.glob(INPUT_PRED + "oof*.npy"))
pred_files = sorted(glob.glob(INPUT_PRED + "pred*.npy"))

# zip() would silently drop unmatched files; fail loudly instead.
if len(oof_files) != len(pred_files):
    raise ValueError(
        f"Mismatched prediction files in {INPUT_PRED}: "
        f"{len(oof_files)} oof*.npy vs {len(pred_files)} pred*.npy"
    )

add_features = []

# Attach each pair as a new column `pred{i}` on train and test respectively.
for i, (oof_file, pred_file) in enumerate(zip(oof_files, pred_files)):
    train[f'pred{i}'] = np.load(oof_file)
    test[f'pred{i}'] = np.load(pred_file)
    add_features.append(f'pred{i}')

# Columns to standardise: every test column whose name contains the letter
# 'f'. This runs after the pred{i} columns were added; those names do not
# contain 'f', so they are kept out of the scaled set.
scale_features = [col for col in test.columns if 'f' in col]
features = scale_features + add_features

target = 'loss'

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both frames.
ss = StandardScaler().fit(train[scale_features])
train[scale_features] = ss.transform(train[scale_features])
test[scale_features] = ss.transform(test[scale_features])

In [None]:
# Notebook display: (rows, columns) of both frames after the cells above.
train.shape, test.shape

# XGBoost

In [None]:
# Settings for the first XGBoost model (`pre_model`) fitted in each fold.
pre_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 5e-3,
    'seed': SEED,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'n_estimators': N_ESTIMATORS,
    'max_depth': 8,
    'alpha': 20,
    'lambda': 9,
    'min_child_weight': 384,
    'importance_type': 'total_gain',
    'tree_method': 'gpu_hist',
}

# Settings for the second XGBoost model: identical to `pre_params` except
# for the entries overridden here (key order is unchanged by the merge).
params = {
    **pre_params,
    'learning_rate': 1e-3,
    'subsample': 0.6,
    'colsample_bytree': 0.4,
    'max_depth': 16,
    'min_child_weight': 128,
}

In [None]:
# Accumulators: one out-of-fold slot per training row, and one running
# fold-average per test row, for each of the two models.
mlp_oof = np.zeros(len(train))
mlp_pred = np.zeros(len(test))

xgb_oof = np.zeros_like(mlp_oof)
xgb_pred = np.zeros_like(mlp_pred)

# Shuffled K-fold splitter, seeded so the folds are reproducible.
kf = KFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)

# Cross-validated training of an MLP and a two-stage XGBoost model.
# For every fold, each model's validation predictions are written into its
# out-of-fold array and its test predictions are averaged over N_SPLITS.

# Loop-invariant column selections, built once instead of once per fold.
X_all, y_all = train[features], train[target]
X_test = test[features]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=X_all)):
    print(f"===== fold {fold} =====")
    X_train, y_train = X_all.iloc[trn_idx], y_all.iloc[trn_idx]
    X_valid, y_valid = X_all.iloc[val_idx], y_all.iloc[val_idx]

    # ----- MLP -----
    start = time.time()
    model = MLPRegressor(hidden_layer_sizes=50,
                         early_stopping=True,
                         n_iter_no_change=100,
                         solver='adam',
                         shuffle=True,
                         random_state=SEED)
    model.fit(X_train, y_train)

    mlp_oof[val_idx] = model.predict(X_valid)
    mlp_pred += model.predict(X_test) / N_SPLITS

    elapsed = time.time() - start

    # np.sqrt(MSE) rather than `squared=False`: that keyword is deprecated
    # and later removed in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_valid, mlp_oof[val_idx]))
    print(f"fold {fold} - mlp rmse: {rmse:.6f}, elapsed time: {elapsed:.2f}sec\n")

    # ----- XGBoost, stage 1: fit with `pre_params`; the EarlyStopping
    # callback uses save_best=True -----
    # NOTE(review): passing eval_metric / early_stopping_rounds / callbacks
    # to fit() is deprecated in newer XGBoost releases — confirm the pinned
    # version before upgrading.
    start = time.time()
    pre_model = xgb.XGBRegressor(**pre_params)
    pre_model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        verbose=VERBOSE,
        callbacks=[xgb.callback.EarlyStopping(
            rounds=EARLY_STOPPING_ROUNDS,
            save_best=True)]
    )

    # ----- XGBoost, stage 2: fit with `params`, starting from the stage-1
    # model via `xgb_model` -----
    model = xgb.XGBRegressor(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
        xgb_model=pre_model
    )

    xgb_oof[val_idx] = model.predict(X_valid)
    xgb_pred += model.predict(X_test) / N_SPLITS

    elapsed = time.time() - start
    rmse = np.sqrt(mean_squared_error(y_valid, xgb_oof[val_idx]))
    print(f"fold {fold} - xgb rmse: {rmse:.6f}, elapsed time: {elapsed:.2f}sec\n")

# Whole-train out-of-fold RMSE for each model. np.sqrt(MSE) is used rather
# than `squared=False`, which newer scikit-learn releases deprecate and
# later remove.
rmse = np.sqrt(mean_squared_error(train[target], mlp_oof))
print(f"oof mlp rmse = {rmse:.6f}")

rmse = np.sqrt(mean_squared_error(train[target], xgb_oof))
print(f"oof xgb rmse = {rmse:.6f}")

# Persist both models' out-of-fold and averaged test predictions to the
# current working directory.
np.save("mlp_oof.npy", mlp_oof)
np.save("mlp_pred.npy", mlp_pred)

np.save("xgb_oof.npy", xgb_oof)
np.save("xgb_pred.npy", xgb_pred)

# Ensemble

In [None]:
def class_optimizer(X, a0, a1):
    """Objective for the blend-weight search: RMSE of a two-way blend.

    Despite the names, ``X`` is the vector being optimised and ``a0`` /
    ``a1`` are the prediction arrays being blended.

    Args:
        X: Sequence whose first element is the weight applied to ``a0``;
            ``a1`` receives the complementary weight ``1 - X[0]``.
        a0: Predictions weighted by ``X[0]``.
        a1: Predictions weighted by ``1 - X[0]``.

    Returns:
        Root-mean-squared error of the blend against ``train[target]``.
    """
    weight = X[0]
    oof = weight * a0 + (1 - weight) * a1
    # np.sqrt(MSE) rather than `squared=False`: that keyword is deprecated
    # and later removed in newer scikit-learn releases.
    return np.sqrt(mean_squared_error(train[target], oof))

# Search for the MLP blend weight (XGBoost gets the complement) that
# minimises out-of-fold RMSE, starting from 0.2 and capped at 300
# Nelder-Mead iterations.
res = minimize(
    class_optimizer,
    [0.2],
    args=(mlp_oof, xgb_oof),
    method='Nelder-Mead',
    options=dict(maxiter=300),
)

print(res)
print(f"coef0 {res.x[0]}, coef1 {1-res.x[0]}")

In [None]:
# Blend both models with the optimised weight: res.x[0] for the MLP and
# its complement for XGBoost, applied identically to OOF and test arrays.
ensemble_oof = res.x[0] * mlp_oof + (1-res.x[0]) * xgb_oof
ensemble_pred = res.x[0] * mlp_pred + (1-res.x[0]) * xgb_pred

# Out-of-fold RMSE of the blend. np.sqrt(MSE) is used rather than
# `squared=False`, which newer scikit-learn releases deprecate and remove.
print(np.sqrt(mean_squared_error(train[target], ensemble_oof)))

# Submission

In [None]:
# Put the blended test predictions in the 'loss' column and write the file.
submission = submission.assign(loss=ensemble_pred)
submission.to_csv("submission.csv", index=False)

# Notebook display of the final frame.
submission