# Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import optuna

from warnings import simplefilter
import gc

# Silence every warning raised by the libraries used in this notebook.
simplefilter(action='ignore')

# Random seed reused for the data splits and the models.
rs = 69420

# Competition files (Kaggle input directory layout).
train_path = r'../input/tabular-playground-series-aug-2021/train.csv'
test_path = r'../input/tabular-playground-series-aug-2021/test.csv'
submission_path = r'../input/tabular-playground-series-aug-2021/sample_submission.csv'

# Passed as `timeout` to every `study.optimize` call: two hours in seconds.
budget = 2 * 60 * 60

In [None]:
# Read both competition CSVs, using the first column of each as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_path, test_path)
)

In [None]:
# Cell output: first rows of the training frame (features plus `loss`).
train.head()

In [None]:
# Cell output: first rows of the test frame.
test.head()

# Data Prep

In [None]:
# Target vector: the `loss` column as a NumPy array.
y = train["loss"].values
# Feature matrix: every other column, also as a NumPy array.
X = train.drop(columns=["loss"]).values

# Cell output: shapes of the feature matrix and of the target.
X.shape, y.shape

# CatBoost Optuna Tuning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

def objective(trial, data=X, target=y):
    """Score one CatBoost hyper-parameter sample by hold-out RMSE.

    The data is split 75/25 (seeded with ``rs`` and stratified on the
    target), the features are robust-scaled with statistics taken from the
    training part only, and a GPU ``CatBoostRegressor`` built from the
    sampled parameters is fitted with early stopping on the hold-out part.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature matrix; defaults to the notebook-level ``X`` as it
            was when this function was defined.
        target: Target vector; defaults to the notebook-level ``y`` as it
            was when this function was defined.

    Returns:
        RMSE of the fitted model on the 25% hold-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.25, random_state=rs, stratify=target
    )

    # Fit the scaler on the training part only, then reuse it on the
    # hold-out part.
    sc = RobustScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # `od_wait` is deliberately not sampled: `fit(early_stopping_rounds=...)`
    # below configures the overfitting detector's wait itself, so a sampled
    # value would not influence the score and would only add a noise
    # dimension to the search.
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 20000),
        'loss_function': 'RMSE',
        'task_type': 'GPU',
        'eval_metric': 'RMSE',
        'leaf_estimation_method': 'Newton',
        'bootstrap_type': 'Bernoulli',
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        # Lower bound kept above 0: a sample rate of (almost) 0 leaves no
        # rows to fit the trees on.
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int(
            'leaf_estimation_iterations', 1, 15
        ),
        # Same seed as the other objectives and the final blended model.
        'random_state': rs,
    }

    model = CatBoostRegressor(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=100,
        verbose=False,
    )

    y_preds = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, y_preds))

In [None]:
%%time
# Seed the sampler with the notebook-wide `rs` so the sequence of suggested
# hyper-parameters is repeatable between runs (the number of trials still
# depends on how many fit into `budget`).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=rs),
)

# Run CatBoost trials until the time budget is used up; garbage-collect
# after each trial.
study.optimize(
    objective,
    timeout=budget,
    gc_after_trial=True,
)

In [None]:
# Summary of the study that was just run: trial count and best result.
print(f'Number of finished trials: {len(study.trials)}')
print(f'Best trial: score {study.best_trial.value}, params {study.best_trial.params}')

In [None]:
# Cell output: Optuna's optimization-history figure for the CatBoost study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Final CatBoost configuration: the best sampled hyper-parameters plus the
# fixed settings that are not part of the search space.
cat_params = {
    **study.best_trial.params,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',
    'random_state': rs,
    'task_type': 'GPU',
}

# LGBM Optuna Tuning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

def objective2(trial, data=X, target=y):
    """Score one LightGBM hyper-parameter sample by hold-out RMSE.

    The data is split 75/25 (seeded with ``rs`` and stratified on the
    target), the features are robust-scaled with statistics taken from the
    training part only, and a GPU ``LGBMRegressor`` built from the sampled
    parameters is fitted with early stopping and an Optuna pruning callback
    on the hold-out part.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature matrix; defaults to the notebook-level ``X`` as it
            was when this function was defined.
        target: Target vector; defaults to the notebook-level ``y`` as it
            was when this function was defined.

    Returns:
        RMSE of the fitted model on the 25% hold-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.25, random_state=rs, stratify=target
    )

    # Fit the scaler on the training part only, then reuse it on the
    # hold-out part.
    sc = RobustScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.19, 0.5, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.31, 0.34, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 50, 91),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.07),
        'max_depth': trial.suggest_int('max_depth', 3, 67),
        'n_estimators': trial.suggest_int('n_estimators', 5555, 7000),
        'min_child_weight': trial.suggest_float(
            'min_child_weight', 0.012, 0.04, log=True
        ),
        'subsample': trial.suggest_float('subsample', 0.789, 1.0),
        # Row subsampling is only performed when the bagging frequency is
        # non-zero; sampling it as well makes `subsample` effective and
        # carries the value into `study.best_trial.params`.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float(
            'colsample_bytree', 0.52, 1, log=True
        ),
        'min_child_samples': trial.suggest_int('min_child_samples', 76, 80),
        'metric': 'rmse',
        'device_type': 'gpu',
        'boosting_type': 'gbdt',
    }

    # Reports the 'rmse' of the first validation set ('valid_0') to the
    # trial so that the study's pruner can stop unpromising runs early.
    pruning_callback = optuna.integration.LightGBMPruningCallback(
        trial, 'rmse', valid_name='valid_0'
    )

    model = LGBMRegressor(**params, random_state=rs)
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords are
    # not accepted by LightGBM >= 4.0 -- confirm the installed version.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=False,
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
    )

    y_preds = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, y_preds))

In [None]:
%%time
# Seed the sampler with the notebook-wide `rs` so the sequence of suggested
# hyper-parameters is repeatable between runs (the number of trials still
# depends on how many fit into `budget`).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=rs),
    pruner=optuna.pruners.HyperbandPruner(),
)

# Run LightGBM trials until the time budget is used up; garbage-collect
# after each trial.
study.optimize(
    objective2,
    timeout=budget,
    gc_after_trial=True,
)

In [None]:
# Summary of the study that was just run: trial count and best result.
print(f'Number of finished trials: {len(study.trials)}')
print(f'Best trial: score {study.best_trial.value}, params {study.best_trial.params}')

In [None]:
# Cell output: Optuna's optimization-history figure for the LightGBM study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Final LightGBM configuration: the best sampled hyper-parameters plus the
# fixed settings that are not part of the search space.
lgbm_params = {
    **study.best_trial.params,
    'metric': 'RMSE',
    'boosting_type': 'gbdt',
    'random_state': rs,
    'device': 'gpu',
}

# XGBoost Optuna Tuning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

def objective3(trial, data=X, target=y):
    """Score one XGBoost hyper-parameter sample by hold-out RMSE.

    The data is split 75/25 (seeded with ``rs`` and stratified on the
    target), the features are robust-scaled with statistics taken from the
    training part only, and an ``XGBRegressor`` (``gpu_hist`` tree method)
    built from the sampled parameters is fitted with early stopping and an
    Optuna pruning callback on the hold-out part.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature matrix; defaults to the notebook-level ``X`` as it
            was when this function was defined.
        target: Target vector; defaults to the notebook-level ``y`` as it
            was when this function was defined.

    Returns:
        RMSE of the fitted model on the 25% hold-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.25, random_state=rs, stratify=target
    )

    # Fit the scaler on the training part only, then reuse it on the
    # hold-out part.
    sc = RobustScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # `step` is passed by keyword and the discrete / log ranges go through
    # `suggest_float`, replacing the deprecated positional-step and
    # `suggest_discrete_uniform` / `suggest_loguniform` forms.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 2000, step=100),
        "subsample": trial.suggest_float("subsample", 0.6, 1, step=0.1),
        "colsample_bytree": trial.suggest_float(
            "colsample_bytree", 0.6, 1, step=0.1
        ),
        "eta": trial.suggest_float("eta", 1e-3, 0.1, log=True),
        "reg_alpha": trial.suggest_int("reg_alpha", 1, 50),
        "reg_lambda": trial.suggest_int("reg_lambda", 5, 100),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 5, 20),
    }

    # Reports "validation_0-rmse" (the RMSE on the single eval set passed to
    # fit) to the trial so that the study's pruner can stop unpromising runs.
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "validation_0-rmse"
    )
    model = XGBRegressor(**params, tree_method='gpu_hist', random_state=rs)

    # NOTE(review): newer XGBoost releases expect `eval_metric`,
    # `early_stopping_rounds` and `callbacks` on the constructor rather than
    # on fit() -- confirm against the installed version.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=False,
        eval_metric='rmse',
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
    )

    y_preds = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, y_preds))

In [None]:
%%time
# Seed the sampler with the notebook-wide `rs` so the sequence of suggested
# hyper-parameters is repeatable between runs (the number of trials still
# depends on how many fit into `budget`).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=rs),
    pruner=optuna.pruners.HyperbandPruner(),
)

# Run XGBoost trials until the time budget is used up; garbage-collect
# after each trial.
study.optimize(
    objective3,
    timeout=budget,
    gc_after_trial=True,
)

In [None]:
# Summary of the study that was just run: trial count and best result.
print(f'Number of finished trials: {len(study.trials)}')
print(f'Best trial: score {study.best_trial.value}, params {study.best_trial.params}')

In [None]:
# Cell output: Optuna's optimization-history figure for the XGBoost study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Final XGBoost configuration: the best sampled hyper-parameters plus the
# fixed settings that are not part of the search space.
xgb_params = {
    **study.best_trial.params,
    'tree_method': 'gpu_hist',
    'random_state': rs,
}

# Blend Models

In [None]:
from sklearn.ensemble import VotingRegressor

# One regressor per library, each built from its tuned parameter dict and
# with its own logging switched off.
cat = CatBoostRegressor(verbose=0, **cat_params)
lgbm = LGBMRegressor(verbose=0, **lgbm_params)
xgb = XGBRegressor(verbosity=0, **xgb_params)

# (name, estimator) pairs in the form expected by VotingRegressor.
estimators = list(
    zip(
        ('Catboost', 'LightGBM', 'XGBoost'),
        (cat, lgbm, xgb),
    )
)

# Blend of the three regressors.
clf = VotingRegressor(estimators=estimators)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Seeded, stratified 75/25 hold-out split, mirroring the split used inside
# the tuning objectives.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=rs,
)

# Learn the scaling statistics from the training part only, then apply
# them to both parts.
sc = RobustScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
%%time
# Fit the blended regressor on the scaled training part of the split.
clf.fit(X_train, y_train)

In [None]:
# Blend predictions for the scaled hold-out part of the split.
y_pred = clf.predict(X_test)

In [None]:
# Hold-out RMSE of the blended model, rounded to five decimals.
blend_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Blend RMSE: ", round(blend_rmse, 5))

In [None]:
# Scale the competition test set with the scaler fitted on the training
# part of the split. `test` is still a DataFrame here, whereas `sc` was
# fitted on NumPy arrays.
test_sub = sc.transform(test)

In [None]:
# Load the sample submission to use as the template for the output file.
submission = pd.read_csv(submission_path)
# Cell output: first rows of the template.
submission.head()

In [None]:
# Overwrite the `loss` column with the blend's predictions for the scaled
# test set. The values are assigned by position.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv -- confirm.
submission['loss'] = clf.predict(test_sub)
# Cell output: first rows of the filled-in submission.
submission.head()

In [None]:
# Write the submission file to the working directory without the row index.
submission.to_csv("sub_stack.csv", index=False)