In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found beneath the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sn

from catboost import CatBoostRegressor

import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import *
from sklearn.ensemble import StackingRegressor
from sklearn.base import BaseEstimator, clone

import optuna
from optuna.samplers import TPESampler

### Import Data

In [None]:
# Read the competition train/test tables; the first CSV column becomes the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        r"../input/tabular-playground-series-aug-2021/train.csv",
        r"../input/tabular-playground-series-aug-2021/test.csv",
    )
)

# Every column but the last is a feature; the last column is the regression target.
X, y = df_train.iloc[:, :-1], df_train.iloc[:, -1]

# The test table is used as-is (same object, not a copy).
X_test = df_test

def create_submission(model, X_test):
    """Predict on ``X_test`` and write the result to ``submission.csv``.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Fitted estimator used to score the test rows.
    X_test : pandas.DataFrame
        Test features; its index is reused as the submission index.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``loss``) holding the raw, unrounded predictions,
        identical to what was written to disk.
    """
    submission = pd.DataFrame(index=X_test.index).assign(loss=model.predict(X_test))
    submission.to_csv("submission.csv")
    return submission

### Define Optuna Objective

Thanks to michael127001: https://www.kaggle.com/michael127001/xgbregressor-with-optuna-tuning?kernelSessionId=71581358

https://optuna.org/

In [None]:
# Time budget in seconds (30 minutes) passed as `timeout` to each Optuna study.
STUDY_TIME = 30 * 60

In [None]:
# Column labels of the training features; DataFrameWrapper re-attaches them to its input.
features = X.columns


class DataFrameWrapper(BaseEstimator):
    """Pipeline step that wraps its input in a labelled ``pandas.DataFrame``.

    It sits between the scaler and the model in the tuning pipelines so the
    model receives a DataFrame whose columns are the module-level ``features``
    captured when the wrapper is constructed. The step is stateless: nothing
    is learned from the data.
    """

    def __init__(self):
        # Snapshot of the global feature labels at construction time.
        self.columns = features

    def fit(self, X, y=None):
        """No-op fit.

        Returns ``self`` (not a DataFrame) so that the usual scikit-learn
        chaining ``wrapper.fit(X).transform(X)`` works.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame labelled with ``self.columns``."""
        return pd.DataFrame(X, columns=self.columns)

    def fit_transform(self, X, y=None):
        """Same result as ``transform``; there is no fitted state to update."""
        return self.transform(X, y)

In [None]:
# XGB Regressor
def objective(trial):
    """Optuna objective: 3-fold CV RMSE of a scaler + XGBRegressor pipeline.

    NOTE: a later cell defines the CatBoost objective under this same name, so
    whichever of the two cells ran last decides what ``objective`` refers to.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the XGBoost hyper-parameters and the scaler choice.

    Returns
    -------
    float
        Mean RMSE over the three folds (the study minimises this value).
    """
    # All float ranges go through suggest_float (step=... / log=True) instead of
    # the deprecated suggest_discrete_uniform / suggest_loguniform helpers; the
    # sampled distributions are the same.
    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 4, 8), # Extremely prone to overfitting!
        'n_estimators': trial.suggest_int('n_estimators', 500, 3500, step=500), # Extremely prone to overfitting!
        'eta': trial.suggest_float('eta', 0.007, 0.013), # Most important parameter.
        'subsample': trial.suggest_float('subsample', 0.2, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e4, log=True), # I've had trouble with LB score until tuning this.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True), # L2 regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True), # L1 regularization
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
    }

    reg = xgb.XGBRegressor(
        # These parameters should help with trial speed.
        objective='reg:squarederror',
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        n_jobs=-1,
        **param_grid
    )

    # The scaler is tuned as a categorical hyper-parameter under the key 'scaler'.
    scaler = {
        'StandardScaler': StandardScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'QT': QuantileTransformer(n_quantiles=10, random_state=0),
    }
    used_scaler = scaler[trial.suggest_categorical('scaler', ['StandardScaler', 'MinMaxScaler', 'QT'])]

    # scaler -> re-label columns as a DataFrame -> model
    pipe = Pipeline([('scaler', used_scaler), ('wrapper', DataFrameWrapper()), ('model', reg)])

    # The scorer yields negated RMSE, so flip the sign for reporting and for the return value.
    scores = cross_val_score(pipe, X, y, cv=3, scoring= 'neg_root_mean_squared_error')
    print('CV rmse scores: ', -scores)

    return -scores.mean()

In [None]:
train_time = STUDY_TIME
xgb_study = optuna.create_study(direction='minimize', sampler=TPESampler(), study_name='XGBRegressor')
xgb_study.optimize(objective, timeout=train_time)

# Fixed settings, identical to the ones hard-coded in the XGB objective.
xgb_param = {
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'n_jobs': -1,
}

# Work on an explicit copy of the tuned values: popping from the value returned
# by `best_params` directly must not be relied on to change what a later access
# returns. 'scaler' belongs to the pipeline, not to XGBRegressor, so drop it.
xgb_best_param = dict(xgb_study.best_params)
xgb_best_param.pop('scaler', None)

# Merge the tuned values INTO the fixed settings instead of replacing them.
xgb_param.update(xgb_best_param)

print('Best parameters :')
display(xgb_param)
xgb_model = xgb.XGBRegressor(**xgb_param)

In [None]:
# CatBoost Regressor
def objective(trial):
    """Optuna objective: 3-fold CV RMSE of a scaler + CatBoostRegressor pipeline.

    NOTE: this re-uses the name of the XGB objective defined in an earlier
    cell; after this cell runs, ``objective`` refers to the CatBoost version.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the CatBoost hyper-parameters and the scaler choice.

    Returns
    -------
    float
        Mean RMSE over the three folds (the study minimises this value).
    """
    # suggest_float (optionally with log=True) replaces the deprecated
    # suggest_uniform / suggest_loguniform helpers; `step` is passed by keyword.
    param_grid = {
        'depth': trial.suggest_int('depth', 4, 8),
        'iterations': trial.suggest_int('iterations', 400, 3200, step=400),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 30),
        'random_strength' : trial.suggest_float('random_strength', 1, 5)
    }

    reg = CatBoostRegressor(
        # These parameters should help with trial speed.
        grow_policy='Depthwise',
        leaf_estimation_method='Newton',
        bootstrap_type='Bernoulli',
        loss_function='RMSE',
        eval_metric='RMSE',
        task_type='GPU',
        silent=True,
        **param_grid
    )

    # The scaler is tuned as a categorical hyper-parameter under the key 'scaler'.
    scaler = {
        'StandardScaler': StandardScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'QT': QuantileTransformer(n_quantiles=10, random_state=0),
    }
    used_scaler = scaler[trial.suggest_categorical('scaler', ['StandardScaler', 'MinMaxScaler', 'QT'])]

    # scaler -> re-label columns as a DataFrame -> model
    pipe = Pipeline([('scaler', used_scaler), ('wrapper', DataFrameWrapper()), ('model', reg)])

    # The scorer yields negated RMSE, so flip the sign for reporting and for the return value.
    scores = cross_val_score(pipe, X, y, cv=3, scoring= 'neg_root_mean_squared_error')
    print('CV rmse scores: ', -scores)

    return -scores.mean()

In [None]:
train_time = STUDY_TIME
cat_study = optuna.create_study(direction='minimize', sampler=TPESampler(), study_name='CatBoost Regressor')
cat_study.optimize(objective, timeout=train_time)

# Tuned values only. 'scaler' is a pipeline choice, not a CatBoost argument,
# so it is removed from this copy. (Previously `cat_best_param` was used
# without ever being assigned, which raised NameError on a fresh kernel.)
cat_best_param = dict(cat_study.best_params)
cat_best_param.pop("scaler", None)

print('Best parameters :')
display(cat_best_param)

# Fixed settings matching the CatBoost objective, plus periodic logging.
cat_param = {
    'grow_policy': 'Depthwise',
    'leaf_estimation_method': 'Newton',
    'bootstrap_type': 'Bernoulli',
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'task_type': 'GPU',
    'verbose': 500,
}
# Merge the tuned values INTO the fixed settings so the final model is built
# with the same configuration the study evaluated.
cat_param.update(cat_best_param)

cat_model = CatBoostRegressor(**cat_param)

### Linear model blend using OOF

Thanks to Somayyeh Gholami & Mehran Kazeminia: https://www.kaggle.com/somayyehgholami/1-tps-aug-21-xgboost-catboost?kernelSessionId=71146386

In [None]:
# NOTE(review): StratifiedKFold stratifies on y as if it were class labels;
# presumably the target is integer-valued here — confirm.
skf = StratifiedKFold(n_splits=3)

models_zoo = [xgb_model, cat_model]
# One row per training sample, one column per model (same order as models_zoo).
oof_preds = np.zeros((len(y), len(models_zoo)))

for i_fold,(train_idx, test_idx) in enumerate(skf.split(X, y)):
    print("--- Fold {} ---".format(i_fold))
    for i_model,model in enumerate(models_zoo):
        print("Training model {},".format(i_model))
        # split() yields positional indices, so y must be indexed with .iloc
        # just like X; plain y[train_idx] is a label lookup on the id index
        # and is only right when the ids happen to be 0..n-1.
        model.fit(X.iloc[train_idx,:], y.iloc[train_idx])
        oof_preds[test_idx, i_model] = model.predict(X.iloc[test_idx,:])
        print(30*"===")

# NOTE: the objects in models_zoo are xgb_model and cat_model themselves, and
# the last fit() call on each used the final fold's training rows only.
np.savez("oof_preds.npz", oof_preds)

In [None]:
def best_blend(oof_preds, coeff):
    """RMSE of a two-model linear blend of out-of-fold predictions.

    Parameters
    ----------
    oof_preds : 2-D array
        Column 0 and column 1 hold the two models' out-of-fold predictions.
    coeff : float
        Weight given to column 0; column 1 receives ``1.0 - coeff``.

    Returns
    -------
    float
        Root mean squared error of the blend against the module-level ``y``.
    """
    oof_blend = (oof_preds[:, 0] * coeff) + (oof_preds[:, 1] * (1.0 - coeff))
    # Take the square root explicitly: the `squared=False` keyword is deprecated
    # and absent from recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y, oof_blend))
    return rmse

# Evaluate the blend on an evenly spaced grid of 51 weights in [0, 1].
# results[:, 0] = weight on oof_preds column 0, results[:, 1] = blend RMSE.
results = np.zeros((51,2))
for i,c in enumerate(np.linspace(0, 1, 51)):
    results[i,0] = c
    results[i,1] = best_blend(oof_preds,c)

# Labelled figure so the curve can be read without looking at the code.
fig, ax = plt.subplots()
ax.plot(results[:,0], results[:,1])
ax.set(
    title="OOF blend RMSE vs. blend weight",
    xlabel="Weight on first model (second model gets 1 - weight)",
    ylabel="OOF RMSE",
)
ax.grid()
plt.show()

# Weight with the lowest out-of-fold RMSE.
best_coeff = results[results[:,1].argmin(), 0]

In [None]:
# Score the test set with both fitted models.
xgb_pred, cat_pred = (fitted.predict(X_test) for fitted in (xgb_model, cat_model))

# Linear blend: best_coeff weights the XGBoost predictions, the remainder goes to CatBoost.
cat_weight = 1.0 - best_coeff
blend_pred = (xgb_pred * best_coeff) + (cat_pred * cat_weight)

# Write the blended predictions, indexed like the test set, to disk.
df_submission = pd.DataFrame(index=X_test.index).assign(loss=blend_pred)
df_submission.to_csv("submission_blend.csv")