<div style="background-color:rgba(55, 99, 71, 0.5);">
    <h1><center>Importing Libraries</center></h1>
</div>

Thanks to Ranjeet Shrivastav for sharing his approach. This notebook combines LightGBM and CatBoost in an equally weighted (0.5/0.5) voting ensemble; the CatBoost model has not been tuned yet.
Please upvote his work here: https://www.kaggle.com/ranjeetshrivastav/tps-aug-21-optuna-lgb-xgb-cb

In [None]:
import random
random.seed(123)

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# importing viz packages

import matplotlib.pyplot as plt
import seaborn as sns

# importing modelling packages

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, KFold, StratifiedKFold
from sklearn.feature_selection import SelectKBest,f_regression,SelectPercentile,VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS, ExhaustiveFeatureSelector as EFS
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler,PowerTransformer
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor

# Optimisation Packages

import optuna
from optuna import trial
from optuna.samplers import TPESampler
import pprint
import joblib
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from time import time

<div style="background-color:rgba(55, 99, 71, 0.5);">
    <h1><center>Data Input</center></h1>
</div>

In [None]:
# Read the competition's train and test tables from the Kaggle input folder.

train_csv = r'../input/tabular-playground-series-aug-2021/train.csv'
test_csv = r'../input/tabular-playground-series-aug-2021/test.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Separate the independent variables (X) from the dependent variable (y) and
# remove the `id` column, which is a row identifier rather than a feature.
# `test` is rebound to its own result, so `errors='ignore'` lets this cell be
# re-run without raising KeyError for the already-removed `id` column.

X = train.drop(columns=['id', 'loss'])
y = train['loss']
test = test.drop(columns=['id'], errors='ignore')

<div style="background-color:rgba(55, 99, 71, 0.5);">
    <h1><center>Data Preprocessing</center></h1>
</div>

In [None]:
# Min-max scale the features. The scaler is fitted on the training features
# only, and the same fitted mapping is then applied to both train and test.
# NOTE(review): the transformed X/test are presumably plain NumPy arrays; the
# later fold loops index X by row position (X[idx]) and rely on that.

scaler = MinMaxScaler().fit(X)

X = scaler.transform(X)
test = scaler.transform(test)

<div style="background-color:rgba(55, 99, 71, 0.5);">
    <h1><center>Model Building+Optuna on LightGBM</center></h1>
</div>

In [None]:
def fit_lgb(trial, x_train, y_train, x_test, y_test, device='gpu'):
    """Fit one LightGBM regressor using hyper-parameters sampled from an Optuna trial.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample every tunable hyper-parameter below.
    x_train, y_train
        Training features and target.
    x_test, y_test
        Hold-out features and target. They serve both as the early-stopping
        evaluation set and as the data for the reported validation score.
    device : str, default 'gpu'
        Forwarded to LightGBM as ``device_type``. The default keeps the
        original GPU behaviour; pass ``'cpu'`` to run without a GPU.

    Returns
    -------
    model : LGBMRegressor
        The fitted estimator.
    log : dict
        ``{"train rmse": float, "valid rmse": float}``, computed on
        predictions clipped to a minimum of 0.1.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform,
    # and plain suggest_float replaces suggest_uniform; ranges are unchanged.
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.47, 0.5, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.32, 0.33, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 50, 70),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.04),
        'max_depth': trial.suggest_int('max_depth', 30, 40),
        'n_estimators': trial.suggest_int('n_estimators', 100, 6100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.015, 0.02, log=True),
        'subsample': trial.suggest_float('subsample', 0.9, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.52, 1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 76, 80),
        'metric': 'rmse',
        # The device is set once here; the original also passed the alias
        # `device='gpu'` to the constructor, duplicating the same setting.
        'device_type': device,
    }

    model = LGBMRegressor(**params, random_state=2021)
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments are
    # presumably only accepted by older LightGBM releases (newer ones expect
    # callbacks) - confirm against the installed version.
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=150, verbose=False)

    # Predictions are floored at 0.1 before scoring.
    y_train_pred = np.clip(model.predict(x_train), 0.1, None)
    y_test_pred = np.clip(model.predict(x_test), 0.1, None)

    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
    # `squared=False` keyword, which newer releases no longer accept.
    log = {
        "train rmse": float(np.sqrt(mean_squared_error(y_train, y_train_pred))),
        "valid rmse": float(np.sqrt(mean_squared_error(y_test, y_test_pred))),
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective: validation RMSE of one LightGBM fit on a 75/25 hold-out split.

    Reads the notebook-level ``X`` and ``y`` and delegates training to
    ``fit_lgb``.

    Parameters
    ----------
    trial : optuna trial object
        Passed through to ``fit_lgb`` to sample the hyper-parameters.

    Returns
    -------
    float
        The "valid rmse" entry of the log returned by ``fit_lgb`` (lower is better).
    """
    # Fixed seed: every trial is scored on the same hold-out rows, so score
    # differences come from the hyper-parameters rather than from the luck of
    # the split, and the whole study can be reproduced.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=2021
    )
    _, log = fit_lgb(trial, x_train, y_train, x_test, y_test)
    return log['valid rmse']

In [None]:
# Optuna study on LightGBM. The search is slow, so it is disabled by default to
# save run time; set RUN_OPTUNA = True to repeat it.

RUN_OPTUNA = False

if RUN_OPTUNA:
    study = optuna.create_study(direction='minimize', study_name='LGBMRegressor')
    study.optimize(objective, n_trials=10)

    print('Number of finished trials: ', len(study.trials))
    print('Best trial:')
    # Named `best_trial` so it does not shadow the imported `optuna.trial` module.
    best_trial = study.best_trial

    print('\tValue: {}'.format(best_trial.value))
    print('\tParams: ')
    for key, value in best_trial.params.items():
        print('\t\t{}: {}'.format(key, value))

<div style="background-color:rgba(55, 99, 71, 0.5);">
    <h1><center>Best Parameters from my Optuna run</center></h1>
</div>

In [None]:
# LightGBM hyper-parameters taken from an earlier Optuna run, hard-coded so the
# search does not need to be repeated.
lgb_params = dict(
    reg_alpha=0.49296333273117504,
    reg_lambda=0.32320931014536086,
    num_leaves=54,
    learning_rate=0.03832217782251515,
    max_depth=37,
    n_estimators=2973,
    min_child_weight=0.019808752100234205,
    subsample=0.9662983672394618,
    colsample_bytree=0.5413818580548442,
    min_child_samples=80,
)

In [None]:
def _take_rows(data, idx):
    """Select rows by position from a pandas object or an array-like."""
    # Plain `data[idx]` on a pandas Series/DataFrame is a label/column lookup,
    # which only coincides with row positions for a default 0..n-1 index.
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def cross_val(X, y, model, params, folds=10):
    """Evaluate an estimator with shuffled K-fold CV and early stopping.

    Parameters
    ----------
    X, y
        Features and target; NumPy arrays or pandas objects.
    model : callable
        Estimator class (e.g. ``LGBMRegressor``); instantiated once per fold
        as ``model(**params, random_state=2021)``.
        NOTE(review): its ``fit`` is assumed to accept ``eval_set``,
        ``early_stopping_rounds`` and ``verbose`` - confirm for the estimator
        and library version in use.
    params : dict
        Keyword arguments for the estimator constructor.
    folds : int, default 10
        Number of CV splits.

    Returns
    -------
    The estimator fitted on the LAST fold only (not an average or ensemble of
    the fold models). Per-fold RMSE and the mean over folds are printed.
    """
    kf = KFold(n_splits=folds, shuffle=True, random_state=2021)
    fold_rmse = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        print(f"Fold: {fold}")
        x_train, y_train = _take_rows(X, train_idx), _take_rows(y, train_idx)
        x_test, y_test = _take_rows(X, test_idx), _take_rows(y, test_idx)

        alg = model(**params, random_state=2021)
        # The held-out fold doubles as the early-stopping evaluation set.
        alg.fit(x_train, y_train,
                eval_set=[(x_test, y_test)],
                early_stopping_rounds=400,
                verbose=False)
        pred = alg.predict(x_test)

        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases no longer accept. The label now says RMSE, which is what
        # was always being reported.
        error = float(np.sqrt(mean_squared_error(y_test, pred)))
        fold_rmse.append(error)
        print(f" root_mean_squared_error: {error}")
        print("-"*50)

    print(f"Mean RMSE over {folds} folds: {np.mean(fold_rmse)}")

    return alg

In [None]:
# Cross-validate LightGBM with the tuned parameters (default 10 folds); the
# returned estimator is the one fitted inside cross_val's final fold.
lgb_model = cross_val(X=X, y=y, model=LGBMRegressor, params=lgb_params)

<div style="background-color:rgba(55, 99, 71, 0.5);">
    <h2><center>Final Voting</center></h2>
</div>

In [None]:
# Base estimators for the voting ensemble: CatBoost with default
# hyper-parameters (yet to be tuned) and LightGBM with the tuned parameters.
# verbose=0 silences CatBoost's per-iteration training log, which would
# otherwise be printed again for every fold of the voting loop.

cat = CatBoostRegressor(verbose=0)
lgb = LGBMRegressor(**lgb_params)

In [None]:
# Voting - LGBM (tuned) + CB (baseline right now) - 0.5 weights
# Each fold fits a fresh, equally weighted ensemble on the training part,
# reports RMSE on the held-out part, and adds 1/n_splits of its test
# prediction to `predictions`, so the final result is the fold average.
# (VotingRegressor is already imported in the notebook's import cell.)

folds = KFold(n_splits=5, random_state=228, shuffle=True)

predictions = np.zeros(len(test))
fold_rmse = []

for fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    print(f"Fold: {fold}")
    X_train, X_val = X[trn_idx], X[val_idx]
    # .iloc: the fold indices are row positions, not index labels.
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = VotingRegressor(
        estimators=[
            ('lgbm', lgb),
            ('cat', cat),
        ],
        weights=[0.5, 0.5],
    )

    model.fit(X_train, y_train)
    pred = model.predict(X_val)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # no longer accept; the printed label now matches the metric (RMSE).
    error = float(np.sqrt(mean_squared_error(y_val, pred)))
    fold_rmse.append(error)
    print(f" root_mean_squared_error: {error}")
    print("-"*50)

    predictions += model.predict(test) / folds.n_splits

print(f"Mean RMSE over {folds.n_splits} folds: {np.mean(fold_rmse)}")

<div style="background-color:rgba(55, 99, 71, 0.5);">
    <h2><center>Prediction and submission</center></h2>
</div>

In [None]:
# Put the fold-averaged ensemble predictions into the sample submission's
# `loss` column and write the submission file.
sub = pd.read_csv(
    r'../input/tabular-playground-series-aug-2021/sample_submission.csv'
).assign(loss=predictions)
sub.to_csv('vote.csv', index=False)

<div style="background-color:rgba(55, 99, 71, 0.5);">
    <h2><center>Thanks! Kindly upvote if you liked my basic notebook :)</center></h2>
</div>