In [None]:
# Data processing
%matplotlib inline
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# Machine Learning
import optuna
from catboost import CatBoostRegressor
from optuna.samplers import TPESampler
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Competition data directory; the three CSVs are read in a fixed order and unpacked by name.
input_dir = Path('../input/tabular-playground-series-aug-2021/')
train_df, test_df, sample_submission = (
    pd.read_csv(input_dir / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
# Preview the first five test rows.
test_df.head(n=5)

In [None]:
# Preview the first five rows of the submission template.
sample_submission.head(n=5)

In [None]:
# Build plain NumPy arrays: features exclude 'id' (and 'loss' for train); 'loss' is the target.
X = train_df.drop(columns=['id', 'loss']).to_numpy()
y = train_df['loss'].to_numpy()
X_test = test_df.drop(columns=['id']).to_numpy()

In [None]:
# I've found many using MinMaxScaling but I've personally had better results with StandardScaling
# Fit the scaler on the training features only and reuse those statistics for the test set.
# Re-fitting on X_test would standardise train and test with different statistics, so the
# model would be evaluated on features scaled differently from the ones it was trained on.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [None]:
# Observed range of the training target; used later to clamp predictions.
y_min, y_max = y.min(), y.max()

# While it's probably rare that values will fall outside the y-min-max range, we should probably do it anyway.
def my_rmse(y_true, y_hat):
    """Root-mean-squared error after clamping both inputs to [y_min, y_max].

    Args:
        y_true: array-like of ground-truth target values.
        y_hat: array-like of predicted values, same length as ``y_true``.

    Returns:
        The RMSE between the clamped targets and the clamped predictions, as a float.

    Neither input is modified: clamping works on fresh arrays, so the caller's
    labels and predictions stay intact. ``y_min`` and ``y_max`` are the
    module-level bounds, read at call time.
    """
    # np.clip returns new arrays, so the caller's data is never overwritten.
    clipped_true = np.clip(np.asarray(y_true, dtype=float), y_min, y_max)
    # Clamp the *predictions* (not the labels) to the valid target range.
    clipped_hat = np.clip(np.asarray(y_hat, dtype=float), y_min, y_max)

    # sqrt(MSE) avoids the `squared=` keyword, which is not accepted by every scikit-learn release.
    return float(np.sqrt(mean_squared_error(clipped_true, clipped_hat)))

Some of the parameter settings used below are taken from:
https://www.kaggle.com/somayyehgholami/1-tps-aug-21-xgboost-catboost

Go give that notebook an upvote!

In [None]:
def objective(trial):
    """Optuna objective: train one CatBoost model and return its hold-out score.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        The value of ``my_rmse`` on a 40% validation split of the module-level
        ``X`` / ``y`` arrays (lower is better).
    """
    # Split the train data for each trial.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.4)

    param_grid = {
        'depth': trial.suggest_int('depth', 6, 10), # Extremely prone to overfitting!
        # `step` is passed by keyword; newer Optuna releases warn on positional use.
        'iterations': trial.suggest_int('iterations', 400, 4000, step=400), # Extremely prone to overfitting!
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.03), # Most important parameter - the learning rate!
        # suggest_float(step=...) is the replacement for the deprecated suggest_discrete_uniform.
        'random_strength': trial.suggest_float('random_strength', 1.0, 2.0, step=0.1),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 50), # L2 regularization
    }

    # NOTE(review): early_stopping_rounds is set here but fit() below receives no eval_set;
    # confirm against the CatBoost docs whether early stopping has any effect without one.
    reg = CatBoostRegressor(
        grow_policy='Depthwise',
        leaf_estimation_method='Newton',
        bootstrap_type='Bernoulli',
        thread_count=4,
        loss_function='RMSE',
        eval_metric='RMSE',
        od_type='Iter',
        task_type='GPU',
        verbose=False,
        early_stopping_rounds=400,
        **param_grid
    )

    reg.fit(X_train, y_train, verbose=False)

    return my_rmse(y_valid, reg.predict(X_valid))

In [None]:
# Wall-clock budget for the hyper-parameter search, in seconds (ten minutes).
train_time = 1 * 10 * 60
study = optuna.create_study(study_name='CatBoost', sampler=TPESampler(), direction='minimize')
study.optimize(objective, timeout=train_time)

# Report how many trials ran and which one scored best.
print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print(f'\tValue: {trial.value}')
print('\tParams: ')
for key, value in trial.params.items():
    print(f'\t\t{key}: {value}')

In [None]:
# Fetch the best trial parameters and set some settings for the KFold predictions.
# Work on a copy: adding the fixed settings directly onto trial.params would also
# change what the trial object reports if it hands back its own dict.
catb_params = dict(trial.params)
# NOTE(review): early_stopping_rounds only matters if fit() is given an eval_set — confirm.
catb_params.update(
    grow_policy='Depthwise',
    leaf_estimation_method='Newton',
    bootstrap_type='Bernoulli',
    thread_count=4,
    loss_function='RMSE',
    eval_metric='RMSE',
    od_type='Iter',
    task_type='GPU',
    early_stopping_rounds=400,
)

n_splits = 10
test_preds = None
kf_rmse = []

# Shuffled K-fold: every fold trains a fresh model, is scored on its held-out part,
# and contributes an equal share to the averaged test prediction.
splitter = KFold(n_splits=n_splits, shuffle=True)
for fold, (train_idx, valid_idx) in enumerate(splitter.split(X, y)):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    # New model per fold, configured with the tuned parameters.
    model = CatBoostRegressor(**catb_params)
    model.fit(X_train, y_train, verbose=False)

    # Score the fold on its validation part.
    rmse = my_rmse(y_valid, model.predict(X_valid))
    print(f'Fold {fold+1}/{n_splits} RMSE: {rmse:.4f}')
    kf_rmse.append(rmse)

    # Running sum of the per-fold test predictions; divided by n_splits after the loop.
    fold_test_pred = model.predict(X_test)
    test_preds = fold_test_pred if test_preds is None else test_preds + fold_test_pred

test_preds /= n_splits
print(f'Average KFold RMSE: {np.mean(kf_rmse):.5f}')

In [None]:
# Clamp the averaged predictions in place to the observed target range, then write the submission.
np.clip(test_preds, y_min, y_max, out=test_preds)
sample_submission['loss'] = test_preds
sample_submission.to_csv('submission.csv', index=False)
sample_submission