A foreword: although my overall implementation doesn't achieve a particularly great score, I hope that my contribution can give a slight boost to someone with a better notebook.

A lot of what I have based my notebook on is credited to:
 1. https://www.kaggle.com/michael127001/xgbregressor-with-optuna-tuning
 2. https://www.kaggle.com/pranjalverma08/tps-08-cb-lgbm-xgb-starter
 3. https://www.kaggle.com/dmitryuarov/falling-below-7-87-voting-cb-xgb-lgbm
 4. https://www.kaggle.com/hiro5299834/tps-aug-2021-xgb/

## Contributions made by this notebook
1. **Pseudolabelled dataset training**: the last cell in this notebook is annotated and compartmentalized so that it can be easily integrated if someone else would like to try it out
2. **Choose best from repeated KFolds**: also in the last cell of this notebook. K-fold cross-validation is repeated several times, and then only the runs with the lowest loss are chosen to be averaged. To turn it off, simply replace `RepeatedKFold` with `KFold` and remove instances of the variable 'n_repeats'. *Warning*: this is highly prone to overfitting, so regularize the model as necessary or turn it off
3. **Use a tweedie variance power between [1.035, 1.06]**: see older versions of this notebook to see optuna optimization runs that show this is a better fit than the suggested 1.1 that other notebooks use

(4). Explored a few parameters in the XGB model to tune for Optuna that I didn't see in other notebooks. This may be unnecessary but someone else may be able to explain whether they are helpful or not

## What can be improved
1. XGB model parameters, pseudo XGB model parameters
2. Ensembling
3. N_split and N_repeat values

In [None]:
# import libraries
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from optuna.samplers import TPESampler
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, RepeatedKFold
from sklearn.neural_network import MLPRegressor
from xgboost import cv
import xgboost as xgb
from scipy.optimize import minimize


In [None]:
# Load the competition files: labelled training rows, unlabelled test rows,
# and the sample submission that is reused as the output template.
train_df = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# Split into target and features; the 'id' column is dropped from both feature tables.
y = train_df['loss']
X = train_df.drop(columns=['loss', 'id'])
X_test = test_df.drop(columns=['id'])

# Switches: run an Optuna search (True) or use the hard-coded parameters (False),
# and whether to run the pseudolabelling stage at the end.
XGB_OPTUNA, PSEUDO = False, True

# Early-stopping patience, in boosting rounds, for the Optuna trials and the final fits.
EARLY_OPTUNA, EARLY_FIT = 100, 150


In [None]:
# Standardise the features using statistics computed over the train and test
# rows together, then replace both feature tables with their scaled versions.
scaler = StandardScaler().fit(pd.concat([X, X_test]))
X, X_test = scaler.transform(X), scaler.transform(X_test)

In [None]:
def xgb_objective(trial, data=X, target=y):
    """Optuna objective: fit a tweedie XGBRegressor and return its hold-out RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature matrix. Defaults to the module-level ``X`` as it was
            bound when this function was defined.
        target: Labels aligned with ``data``. Defaults to the module-level ``y``.

    Returns:
        RMSE of the fitted model on the 40% validation split.
    """
    # Fixed seed so every trial is scored on the same train/validation split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        data, target, test_size=0.4, random_state=42)

    param_grid = {
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.035, 1.06),
        'n_estimators': trial.suggest_int('n_estimators', 3800, 5000),
        'max_depth': trial.suggest_int('max_depth', 5, 9),
        'eta': trial.suggest_float('eta', 0.005, 0.011),
        # suggest_float(step=...) samples the same 0.01 grid as the deprecated
        # suggest_discrete_uniform call it replaces.
        'subsample': trial.suggest_float('subsample', 0.3, 0.6, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.8),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 0.8),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.6, 1.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1, 100),
        'reg_lambda': trial.suggest_float('reg_lambda', 800, 8000),
        'max_delta_step': trial.suggest_float('max_delta_step', 1, 8000),
        'gamma': trial.suggest_float('gamma', 0.1, 1),
        'base_score': trial.suggest_float('base_score', 0.42, 0.46),
    }

    model = xgb.XGBRegressor(objective='reg:tweedie',
                             tree_method='gpu_hist',
                             predictor='gpu_predictor',
                             sampling_method='gradient_based',
                             n_jobs=-1,
                             max_bin=256,
                             single_precision_histogram='true',
                             **param_grid)

    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the estimator rather than in fit() -- confirm
    # against the installed version before upgrading.
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='rmse',
              early_stopping_rounds=EARLY_OPTUNA,
              verbose=False)

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and has
    # been removed from recent scikit-learn releases.
    return float(np.sqrt(mean_squared_error(y_valid, model.predict(X_valid))))

In [None]:
def create_optuna_study(objective, study_name, train_time):
    """Run a TPE-sampled minimisation study and print a summary of its best trial.

    Args:
        objective: Callable handed to ``study.optimize``.
        study_name: Name given to the created study.
        train_time: Value passed as ``timeout`` to ``study.optimize``.

    Returns:
        A ``(best_trial, study)`` tuple.
    """
    study = optuna.create_study(study_name=study_name,
                                sampler=TPESampler(),
                                direction='minimize')
    study.optimize(objective, timeout=train_time)
    best = study.best_trial

    # Print the parameters in a form that can be pasted back into a dict literal.
    print('Number of finished trials: ', len(study.trials))
    print('Best trial:')
    print(f'\tValue: {best.value}')
    print('\tParams: ')
    for name, val in best.params.items():
        print(f"\t\t'{name}': {val},")

    return best, study

In [None]:
# Time budget handed to the Optuna search (one hour, in seconds).
train_time = 1 * 60 * 60

# Main model parameters: hard-coded unless an Optuna search is requested.
if not XGB_OPTUNA:
    # 	Value: 7.810990920261464
    xgb_params = {
        'tweedie_variance_power': 1.0467,
        'n_estimators': 4200,
        'max_depth': 6,
        'eta': 0.010168813765699104,
        'subsample': 0.32999999999999996,
        'colsample_bytree': 0.72,
        'colsample_bylevel': 0.77,
        'colsample_bynode': 0.4,
        'min_child_weight': 0.0015983397006165201,
        'reg_alpha': 5.089297744468109,
        'reg_lambda': 5614.706936183112,
        'max_delta_step': 12.488093623290982,
        'gamma': 0.002944897792984669,
        'base_score': 0.4534214581239122,
    }
else:
    xgb_trial, study = create_optuna_study(xgb_objective, 'XGBRegressor', train_time)
    xgb_params = xgb_trial.params

# Settings that are not tuned: tweedie objective and GPU histogram training.
xgb_params.update({
    'objective': 'reg:tweedie',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'n_jobs': -1,
    'max_bin': 256,
})

# Parameters for the pseudolabel stage; this model uses a squared-error objective.
pseudo_xgb_params = {
    'n_estimators': 4874,
    'max_depth': 8,
    'eta': 0.006269949203588203,
    'subsample': 0.52,
    'colsample_bytree': 0.5341415254987654,
    'colsample_bylevel': 0.7053444074403165,
    'colsample_bynode': 0.6195508609737396,
    'min_child_weight': 0.07942691380323752,
    'reg_alpha': 75.63328698050019,
    'reg_lambda': 2045.4576615756023,
    'max_delta_step': 4636.2914334780635,
    'gamma': 0.652157845901367,
    'base_score': 0.42312328086044243,
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'n_jobs': -1,
    'max_bin': 256,
}


## Initial Label

In [None]:
# Cross-validated training: every fold's model predicts the test set and the
# fold predictions are averaged into `test_preds`.
test_preds = np.zeros(X_test.shape[0])

# Validation RMSE of each fold, in fold order.
xgb_rmse = []

n_splits = 7
kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    # First pass: train with early stopping and keep the best iteration.
    pre_xgb_model = xgb.XGBRegressor(**xgb_params)
    pre_xgb_model.fit(X_train, y_train,
                      eval_set=[(X_valid, y_valid)],
                      verbose=False,
                      callbacks=[xgb.callback.EarlyStopping(
                          rounds=EARLY_FIT,
                          save_best=True)])

    # Second pass: start from the first model (xgb_model=...) and keep
    # training on the same fold with twice the early-stopping patience.
    post_xgb_model = xgb.XGBRegressor(**xgb_params)
    post_xgb_model.fit(X_train, y_train,
                       eval_set=[(X_valid, y_valid)],
                       verbose=False,
                       callbacks=[xgb.callback.EarlyStopping(
                           rounds=EARLY_FIT * 2,
                           save_best=True)],
                       xgb_model=pre_xgb_model)

    test_preds += post_xgb_model.predict(X_test) / n_splits

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and has
    # been removed from recent scikit-learn releases.
    xgb_rmse.append(float(np.sqrt(
        mean_squared_error(y_valid, post_xgb_model.predict(X_valid)))))

    print(f'Fold {fold}\n\txgb: {xgb_rmse[-1]}')


print(f'\nAverage xgb rmse: {np.mean(xgb_rmse)}')

submission['loss'] = test_preds
submission.to_csv('submission.csv', index=False)

## Pseudolabel implementation
What is pseudolabelling? Read here:
https://www.kaggle.com/c/tabular-playground-series-apr-2021/discussion/231738

To use this implementation, the following variables need to be declared before you just pop it onto the end of your code

1. X: training dataset
2. y: training dataset labels
3. X_test: testing dataset
4. test_preds: testing dataset predictions
5. xgb_params: dictionary parameters for XGB model
6. pseudo_xgb_params: dictionary parameters for fine-tune XGB model
7. early_fit: number of early stopping rounds
8. n_splits: number of cross-validation splits
9. n_repeats: number of split and train repeats

In [None]:
def pseudolabel(X, y, X_test, test_preds, xgb_params, pseudo_xgb_params, early_fit, n_splits=7, n_repeats=2):
    """Retrain with pseudolabelled test rows and average the best-scoring folds.

    For each of the ``n_splits * n_repeats`` repeated K-fold splits, a model is
    first fitted on the training fold plus the smaller part of the
    pseudolabelled test split, then training continues from that model on the
    training fold plus the larger part. Validation always uses real training
    labels. Only the ``n_splits`` folds with the lowest validation RMSE
    contribute to the returned prediction.

    Args:
        X: Training features (array-like, rows indexed positionally).
        y: Training labels aligned with ``X``.
        X_test: Test features.
        test_preds: Existing predictions for ``X_test``, used as pseudolabels.
        xgb_params: Accepted for interface compatibility; this function does
            not currently use it (both stages use ``pseudo_xgb_params``).
        pseudo_xgb_params: Keyword arguments for both ``XGBRegressor`` stages.
        early_fit: Early-stopping rounds for the first stage; the second
            stage uses twice this value.
        n_splits: Number of folds per repeat.
        n_repeats: Number of times the K-fold split is repeated.

    Returns:
        Array of test predictions averaged over the ``n_splits`` best folds.
    """
    # The fold indices are positional, so work on plain arrays; this also lets
    # callers pass pandas objects whose index is not 0..n-1.
    X, y = np.asarray(X), np.asarray(y)
    X_test, test_preds = np.asarray(X_test), np.asarray(test_preds)

    rmse = []
    preds = []

    kf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)
    kf2 = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

    # kf splits the labelled data; kf2 splits the test rows into a larger
    # (pseudo_idx) and a smaller (pseudo2_idx) pseudolabelled part.
    for fold, ((train_idx, valid_idx), (pseudo_idx, pseudo2_idx)) in enumerate(
            zip(kf.split(X, y), kf2.split(X_test, test_preds)), 1):
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]

        X_pseudo, y_pseudo = X_test[pseudo_idx], test_preds[pseudo_idx]
        X_pseudo2, y_pseudo2 = X_test[pseudo2_idx], test_preds[pseudo2_idx]

        # Run the model on a smaller pseudolabel dataset
        pre_xgb_model = xgb.XGBRegressor(**pseudo_xgb_params)
        pre_xgb_model.fit(np.concatenate([X_train, X_pseudo2]),
                          np.concatenate([y_train, y_pseudo2]),
                          eval_set=[(X_valid, y_valid)],
                          verbose=False,
                          callbacks=[xgb.callback.EarlyStopping(
                              rounds=early_fit,
                              save_best=True)])

        # Fine-tune using the larger pseudo dataset, starting from the
        # previous model (passed via xgb_model).
        # The evaluation dataset must be the ground-truth data.
        post_xgb_model = xgb.XGBRegressor(**pseudo_xgb_params)
        post_xgb_model.fit(np.concatenate([X_train, X_pseudo]),
                           np.concatenate([y_train, y_pseudo]),
                           eval_set=[(X_valid, y_valid)],
                           verbose=False,
                           callbacks=[xgb.callback.EarlyStopping(
                               rounds=early_fit * 2,
                               save_best=True)],
                           xgb_model=pre_xgb_model)

        preds.append(post_xgb_model.predict(X_test))

        # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and
        # has been removed from recent scikit-learn releases.
        xgb_rmse = float(np.sqrt(
            mean_squared_error(y_valid, post_xgb_model.predict(X_valid))))
        rmse.append(xgb_rmse)

        print(f'Fold {fold}\n\txgb: {xgb_rmse}')

    # Keep only the n_splits folds with the lowest validation RMSE.
    best_folds = sorted(range(len(rmse)), key=rmse.__getitem__)[:n_splits]

    test_preds2 = np.zeros(len(X_test))
    best_rmse = 0
    for n in best_folds:
        test_preds2 += preds[n] / n_splits
        best_rmse += rmse[n] / n_splits

    print(f'\nAverage total rmse: {np.array(rmse).mean()}')
    print(f'\nAverage best rmse: {best_rmse}')

    return test_preds2


In [None]:
# Build the second submission only when pseudolabelling actually ran;
# `test_preds2` does not exist otherwise.
if PSEUDO:
    test_preds2 = pseudolabel(X, y, X_test, test_preds, xgb_params, pseudo_xgb_params, EARLY_FIT)

    # Work on a copy so the first submission frame keeps its own 'loss' column.
    submission2 = submission.copy()
    submission2['loss'] = test_preds2
    submission2.to_csv('submission2.csv', index=False)