In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import catboost as cb
import optuna

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

In [3]:
from joblib import dump, load

In [4]:
# Training table from train.csv, indexed by its `id` column.
train = pd.read_csv(
    '../data/tabular-playground-series-aug-2021/train.csv',
    index_col='id',
)

In [5]:
# Test table from test.csv, indexed by its `id` column.
test = pd.read_csv(
    '../data/tabular-playground-series-aug-2021/test.csv',
    index_col='id',
)

In [6]:
# Split the training table: `loss` is the target, every other column is a feature.
y = train['loss']
X = train.drop(columns='loss')

In [7]:
def gradient_boost(boost):
    """Cross-validate an estimator on the notebook-level X / y, then fit it.

    Runs 5-fold cross-validation with the 'neg_root_mean_squared_error'
    scorer and prints the mean fold score. Because the scorer is negated,
    the printed number is the negative of the RMSE (closer to zero is better).

    Args:
        boost: Estimator accepted by ``cross_val_score`` that exposes
            ``fit(X, y)``.

    Returns:
        Whatever ``boost.fit(X, y)`` returns, after fitting on all of X and y.
    """
    fold_scores = cross_val_score(
        boost,
        X,
        y,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=10,
    )
    mean_score = fold_scores.mean()
    print(mean_score)

    # Final fit uses the full data set, not just the CV training folds.
    fitted = boost.fit(X, y)
    return fitted

In [8]:
def objective_cb(trial):
    """Optuna objective for tuning a CatBoost regressor on the notebook-level X / y.

    Samples one hyperparameter set from ``trial``, builds a
    ``cb.CatBoostRegressor`` with it and scores it with 3-fold
    cross-validation.

    Args:
        trial: Object providing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical`` (an Optuna trial).

    Returns:
        Mean of the fold scores from the 'neg_root_mean_squared_error'
        scorer, i.e. the NEGATED RMSE. The study must therefore be created
        with ``direction="maximize"``.
    """
    params = {
        'objective': 'RMSE',
        'eval_metric': 'RMSE',
        'silent': True,
        # NOTE(review): cross_val_score below also runs folds in parallel
        # (n_jobs=-1), so up to 3 x 4 CatBoost threads may run at once.
        'thread_count': 4,
        'num_trees': trial.suggest_int('num_trees', 1000, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0, 5.0),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'random_strength': trial.suggest_float('random_strength', 1.0, 20.0),
        # Depth 0 allows no splits at all (at best a constant model), so the
        # search starts at depth 1 instead of wasting trials on it.
        'max_depth': trial.suggest_int('max_depth', 1, 8),
        # CatBoost documents rsm as a fraction in (0, 1]; a lower bound of 0.0
        # is outside that range and near-zero values leave almost no features
        # per split, so the search starts at 0.05.
        'rsm': trial.suggest_float('rsm', 0.05, 1.0),
    }

    cat = cb.CatBoostRegressor(**params)

    score = cross_val_score(cat, X, y, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=3)
    # The scorer is negated, so this value is -RMSE (higher is better).
    neg_rmse = score.mean()
    return neg_rmse

In [9]:
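# Hyperparameter search, left disabled. objective_cb returns the negated RMSE
# (sklearn's 'neg_root_mean_squared_error' scorer), hence direction="maximize".
# study = optuna.create_study(direction="maximize")
# study.optimize(objective_cb, n_trials=100)
# print(study.best_trial)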

In [10]:
# study.best_params

In [11]:
# Fixed CatBoost settings for the final model. learning_rate is not set here,
# so it is left to CatBoost.
best_params = dict(
    objective='RMSE',
    eval_metric='RMSE',
    thread_count=4,
    silent=True,
    num_trees=10000,
    l2_leaf_reg=1.2,
    bootstrap_type='Bernoulli',
    random_strength=10.3,
    max_depth=5,
    rsm=0.2,
)

# gradient_boost prints the mean 5-fold CV score and returns the result of
# fitting the regressor on all of X, y.
cat = gradient_boost(cb.CatBoostRegressor(**best_params))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  6.5min remaining:  9.8min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  6.5min remaining:  4.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.6min finished


-7.843046668177912


In [12]:
# Serialize the fitted model held in `cat` to ../cat.joblib.
dump(value=cat, filename='../cat.joblib')

['../cat.joblib']