ver. 1.0 - Mean RMSE (10 fold) - 0.84297; Public lb - 0.84308  
ver. 2.0 - Mean RMSE (10 fold) - **0.84280**; Public lb - 0.84291  
ver. 3.0 - Mean RMSE (10 fold) - 0.84292; Public lb - 0.84290  
ver. 4.0 - Mean RMSE (10 fold) - 0.84290; Public lb - **0.84287**

In [None]:
import pandas as pd
import numpy as np
import time
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

In [None]:
# Load the competition's train and test splits.
train = pd.read_csv(
    "../input/tabular-playground-series-feb-2021/train.csv"
)
test = pd.read_csv(
    "../input/tabular-playground-series-feb-2021/test.csv"
)

# Category "G" of cat6 does not occur in the test data (author's note),
# so the training rows that carry it are discarded before modelling.
train = train.loc[train['cat6'].ne('G')]

# Regression target, taken after filtering so it stays row-aligned with train.
target = train['target']

In [None]:
# Notebook preview: show the first five rows of the training frame.
train.head(n=5)

In [None]:
def preprocess(df, encoder=None,
               scaler=None, cols_to_drop=None,
               cols_to_encode=None, cols_to_scale=None):
    """
    Preprocess input data

    Steps run in this order: encode, scale, drop. The caller's DataFrame is
    left untouched; a processed copy is returned.

    The same ``encoder`` / ``scaler`` object is re-fitted on every column in
    turn, so after the call it only holds the fit of the last column and
    cannot be reused to transform another frame consistently.

    :param df: DataFrame with data
    :param encoder: encoder object with fit_transform method,
                    or the string 'dummies' to one-hot encode the columns
                    with pd.get_dummies (new columns are prefixed
                    'dummy_<col>' and appended at the end)
    :param scaler: scaler object with fit_transform method; it is called
                   with an (n_rows, 1) array per column
    :param cols_to_drop: columns to be removed
    :param cols_to_encode: columns to be encoded (None means no columns)
    :param cols_to_scale: columns to be scaled (None means no columns)
    :return: DataFrame
    """
    # Work on a copy: column assignment below would otherwise modify the
    # caller's frame (and warn when that frame is a filtered slice).
    df = df.copy()

    cols_to_encode = [] if cols_to_encode is None else list(cols_to_encode)
    cols_to_scale = [] if cols_to_scale is None else list(cols_to_scale)

    if encoder is not None:
        if isinstance(encoder, str) and encoder == 'dummies':
            # Build the dummies of EVERY requested column before joining;
            # keeping only the last loop result would silently discard the
            # encodings of all other columns.
            dummy_frames = [
                pd.get_dummies(df[col], prefix='dummy_' + col)
                for col in cols_to_encode
            ]
            df = df.drop(cols_to_encode, axis=1)
            if dummy_frames:
                df = pd.concat([df] + dummy_frames, axis=1)
        else:
            for col in cols_to_encode:
                df[col] = encoder.fit_transform(df[col].to_numpy())

    if scaler is not None:
        for col in cols_to_scale:
            scaled = scaler.fit_transform(df[col].to_numpy().reshape(-1, 1))
            # Flatten the (n_rows, 1) result so it is assigned by position.
            df[col] = np.ravel(scaled)

    if cols_to_drop:
        df = df.drop(cols_to_drop, axis=1)

    return df

In [None]:
cat_cols = ['cat' + str(i) for i in range(10)]
cont_cols = ['cont' + str(i) for i in range(14)]

# Encode and scale train and test in ONE pass over the stacked frames.
# preprocess() calls fit_transform, so running it separately on each frame
# would fit the label encoder and the scaler twice: the same category could
# get different integer codes, and the same raw value a different scaled
# value, in train and in test.
# NOTE: the scaler statistics therefore come from train + test rows together
# (features only; the target is removed before stacking).
n_train = len(train)
train_index = train.index

combined = pd.concat([train.drop(columns=['target']), test],
                     ignore_index=True)
combined = preprocess(combined, scaler=StandardScaler(), encoder=LabelEncoder(),
                      cols_to_drop=['id'], cols_to_encode=cat_cols,
                      cols_to_scale=cont_cols)

# Split back by position; train gets its original row labels again so it
# stays label-aligned with `target`.
train = combined.iloc[:n_train].copy()
train.index = train_index
test = combined.iloc[n_train:].reset_index(drop=True)

In [None]:
class EnsembleModel:
    """Pair of an LGBMRegressor and an XGBRegressor whose predictions are blended."""

    def __init__(self, params):
        """
        LGB + XGB model
        :param params: dict with the keys 'lgb' and 'xgb'; each value is a
                       dict of keyword arguments for LGBMRegressor and
                       XGBRegressor respectively
        """
        self.lgb_params = params['lgb']
        self.xgb_params = params['xgb']

        self.lgb_model = LGBMRegressor(**self.lgb_params)
        self.xgb_model = XGBRegressor(**self.xgb_params)

    def fit(self, x, y, *args, **kwargs):
        """
        Fit both models on the same data
        :param x: training features
        :param y: training target
        :param args: extra positional arguments, forwarded unchanged to
                     BOTH models' fit methods
        :param kwargs: extra keyword arguments, forwarded unchanged to
                       BOTH models' fit methods (so every keyword must be
                       accepted by both libraries)
        :return: tuple (result of lgb fit, result of xgb fit)
        """
        return (self.lgb_model.fit(x, y, *args, **kwargs),
                self.xgb_model.fit(x, y, *args, **kwargs))

    def predict(self, x, weights=(1.0, 1.0)):
        """
        Generate model predictions
        :param x: data
        :param weights: weights on model prediction, first one is the weight on lgb model.
                        The weighted sum is always divided by 2, so the result is a
                        weighted mean only when the two weights add up to 2.
        :return: array with predictions
        """
        # The default is an immutable tuple so it cannot be altered between calls.
        return (weights[0] * self.lgb_model.predict(x) +
                weights[1] * self.xgb_model.predict(x)) / 2

In [None]:
# ensemble_params = {
#     "lgb" : {
#         "num_leaves": scope.int(hp.quniform("num_leaves", 31, 200, 1)),
#         "max_depth": scope.int(hp.quniform("max_depth", 10, 24, 1)),
#         'learning_rate': hp.uniform('learning_rate', 0.01, 0.05),
#         'min_split_gain': hp.uniform('min_split_gain', 0, 1.0),
#         'min_child_samples': scope.int(hp.quniform("min_child_samples", 2, 700, 1)),
#         "subsample": hp.uniform("subsample", 0.2, 1.0),
#         "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
#         'reg_alpha': hp.uniform('reg_alpha', 1e-5, 1.0),
#         'reg_lambda': hp.uniform('reg_lambda', 0, 50),
#         'n_jobs': -1,
#         'n_estimators': 2000},
#     'xgb': {
#         'max_depth': scope.int(hp.quniform('xgb.max_depth', 10, 24, 1)),
#         'learning_rate': hp.uniform('xgb.learning_rate', 0.01, 0.05),
#         'gamma': hp.uniform('xgb.gamma', 1, 10),
#         'min_child_weight': scope.int(hp.quniform('xgb.min_child_weight', 2, 700, 1)),
#         'n_estimators': 2000,
#         'colsample_bytree': hp.uniform('xgb.colsample_bytree', 0.5, 0.9),
#         'subsample': hp.uniform('xgb.subsample', 0.5, 1.0),
#         'reg_lambda': hp.uniform('xgb.reg_lambda', 0, 100),
#         'reg_alpha': hp.uniform('xgb.reg_alpha', 1e-5, 0.5),
#         'objective': 'reg:squarederror',
#         'tree_method': 'gpu_hist',
#         'n_jobs': -1}
# }

# def ensemble_search(params):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

#     model = EnsembleModel(params)

#     evaluation = [(X_test, y_test)]

#     model.fit(X_train, y_train,
#               eval_set=evaluation, eval_metric='rmse',
#               early_stopping_rounds=100, verbose=False)

#     val_preds = model.predict(X_test)
#     rmse = mean_squared_error(y_test, val_preds, squared=False)

#     return {"loss": rmse, "status": STATUS_OK}

In [None]:
# X = train.copy()
# y = target

# trials = Trials()

# best_hyperparams = fmin(fn=ensemble_search,
#                        space=ensemble_params,
#                        algo=tpe.suggest,
#                        max_evals=100,
#                        trials=trials)

In [None]:
# best_hyperparams

In [None]:
# All params taken from previous version

since = time.time()
columns = train.columns

ensemble_params = {
    "lgb" : {
        "num_leaves": 36,
        "max_depth": 21,
        'learning_rate': 0.049019854828962754,
        'min_split_gain': 0.2579555416739361,
        'min_child_samples': 500,
        "subsample": 0.2595537456780356,
        "colsample_bytree": 0.6203517996970486,
        'reg_alpha': 0.33867231210286647,
        'reg_lambda': 42.071411120949854,
        'n_jobs': -1,
        'n_estimators': 10000},
    'xgb': {
        'max_depth': 13,
        'learning_rate': 0.020206705089028228,
        'gamma': 3.5746731812451156,
        'min_child_weight': 564,
        'n_estimators': 10000,
        'colsample_bytree': 0.5015940592112956,
        'subsample': 0.6839489639112909,
        'reg_lambda': 18.085502002853246,
        'reg_alpha': 0.17532087359570606,
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'n_jobs': -1}
}

# Blend weights passed to EnsembleModel.predict: [lgb weight, xgb weight].
# Named once so the validation score and the test prediction always use
# the same blend.
blend_weights = [0.5, 1.5]

preds = np.zeros(test.shape[0])
kf = KFold(n_splits=10, random_state=22, shuffle=True)
rmse = []

for fold, (trn_idx, val_idx) in enumerate(kf.split(train[columns], target), start=1):

    X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[val_idx]
    y_tr, y_val = target.iloc[trn_idx], target.iloc[val_idx]

    # A fresh model per fold; the held-out fold is handed to fit() as the
    # early-stopping evaluation set.
    # NOTE: the same fold is scored below, so the reported RMSE is measured
    # on data that also decided when training stopped.
    model = EnsembleModel(ensemble_params)

    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=150, verbose=False)

    # Each fold contributes 1/n_splits of the final test prediction.
    preds += model.predict(test[columns], blend_weights) / kf.n_splits

    # Take the square root explicitly instead of passing `squared=False`,
    # which newer scikit-learn releases deprecate and then remove.
    fold_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val, blend_weights)))
    rmse.append(fold_rmse)

    print(f"Fold {fold}, RMSE: {fold_rmse}")


print("Mean RMSE: ", np.mean(rmse))
end_time = time.time() - since  # elapsed wall-clock seconds
print('Training complete in {:.0f}m {:.0f}s'.format(
        end_time // 60, end_time % 60))

In [None]:
# Read the sample-submission template, overwrite its 'target' column with
# the values held in `preds`, and save the result without the row index.
ss = pd.read_csv(
    "../input/tabular-playground-series-feb-2021/sample_submission.csv"
)
ss = ss.assign(target=preds)

ss.to_csv("ensemble_model_5.csv", index=False)