In [1]:
%%capture
!pip install --upgrade optuna_integration

In [2]:
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import optuna.integration.lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.base import clone

SEED = 2024

In [3]:
DATA_DIR = '/kaggle/input/playground-series-s4e4'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [4]:
TARGET = 'Rings'
# log-transforming the target to optimize for RMSE instead of RMSLE
train[TARGET] = np.log1p(train[TARGET])

In [5]:
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [6]:
features = [f for f in test.columns]
cat_features = ['Sex']

In [7]:
sex_mapping = {'I': 0, 'F': 1, 'M': 2}
train['Sex'] = train['Sex'].replace(sex_mapping).astype('int')
test['Sex'] = test['Sex'].replace(sex_mapping).astype('int')

In [8]:
# LightGBM dataset
dtrain = lgb.Dataset(
    data=train[features],
    label=train[TARGET],
    feature_name=features,
    categorical_feature=cat_features)

In [9]:
base_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'force_row_wise': True,
    'verbosity': -1,
    'n_jobs': -1,
    'deterministic': True,
    'random_state': SEED
}

In [10]:
early_stopping = lgb.early_stopping(
    stopping_rounds=100,
    first_metric_only=True,
    verbose=False,
    min_delta=1e-4)

In [11]:
BUDGET = 60 * 60 * 8
NUM_FOLDS = 10

tuner = lgb.LightGBMTunerCV(
    time_budget=BUDGET,
    optuna_seed=SEED,
    params=base_params,
    train_set=dtrain,
    num_boost_round=10000,
    folds=KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED),
    feature_name=features,
    categorical_feature=cat_features,
    callbacks=[early_stopping],
    seed=SEED)

[I 2024-04-20 03:45:20,333] A new study created in memory with name: no-name-1fa5b028-1d32-49de-baf3-a0be09614e45


In [12]:
%%time
tuner.run()

feature_fraction, val_score: 0.149170:  14%|#4        | 1/7 [02:29<14:58, 149.81s/it][I 2024-04-20 03:47:50,165] Trial 0 finished with value: 0.14916951080061475 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.14916951080061475.
feature_fraction, val_score: 0.149170:  29%|##8       | 2/7 [04:45<11:46, 141.33s/it][I 2024-04-20 03:50:05,566] Trial 1 finished with value: 0.14918590699465997 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.14916951080061475.
feature_fraction, val_score: 0.149132:  43%|####2     | 3/7 [06:55<09:04, 136.07s/it][I 2024-04-20 03:52:15,385] Trial 2 finished with value: 0.14913246158149931 and parameters: {'feature_fraction': 0.7}. Best is trial 2 with value: 0.14913246158149931.
feature_fraction, val_score: 0.149132:  57%|#####7    | 4/7 [09:04<06:40, 133.37s/it][I 2024-04-20 03:54:24,624] Trial 3 finished with value: 0.14913246158149931 and parameters: {'feature_fraction': 0.8}. Best is trial 2 wi

CPU times: user 6h 46min 24s, sys: 3h 20min 37s, total: 10h 7min 1s
Wall time: 5h 57min





In [13]:
print(f'Best score: {tuner.best_score:.5f}')
print(f'Best hyperparameters:')
for k, v in tuner.best_params.items():
    print(f'{k:20} - {v}')

Best score: 0.14795
Best hyperparameters:
objective            - regression
metric               - rmse
learning_rate        - 0.01
boosting_type        - gbdt
force_row_wise       - True
verbosity            - -1
n_jobs               - -1
deterministic        - True
random_state         - 2024
feature_pre_filter   - False
lambda_l1            - 0.27401117304604755
lambda_l2            - 0.0009257365563132775
num_leaves           - 192
feature_fraction     - 0.584
bagging_fraction     - 0.70342005528744
bagging_freq         - 2
min_child_samples    - 25


In [14]:
def comp_metric(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

def custom_cv(estimator, seed=SEED, verbose=True):
    X_test = test[features]
    
    oof_preds, test_preds = {}, {}
    scores = []

    cv = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train)):
        X_train, y_train = train[features].iloc[train_ids], train[TARGET].iloc[train_ids]
        X_val, y_val = train[features].iloc[val_ids], train[TARGET].iloc[val_ids]
        
        model = clone(estimator)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[early_stopping])

        val_preds = model.predict(X_val)
        oof_preds.update(dict(zip(val_ids, val_preds)))
        test_preds[f'fold{fold}'] = model.predict(X_test)

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>4} rounds)')
        _ = gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1) # mean of fold-wise predictions
    
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(train[TARGET], oof_preds):.5f}\n')
    
    return oof_preds, test_preds

In [15]:
%%time
model = LGBMRegressor(**tuner.best_params, n_estimators=10000)
op, tp = custom_cv(model)

Fold # 0: 0.15076 ( 606 rounds)
Fold # 1: 0.14926 ( 700 rounds)
Fold # 2: 0.14728 ( 792 rounds)
Fold # 3: 0.14658 ( 650 rounds)
Fold # 4: 0.14781 ( 654 rounds)
Fold # 5: 0.14676 ( 709 rounds)
Fold # 6: 0.14920 ( 585 rounds)
Fold # 7: 0.14812 ( 716 rounds)
Fold # 8: 0.14534 ( 581 rounds)
Fold # 9: 0.14843 ( 642 rounds)

Avg score: 0.14795 +/- 0.00148
OOF score: 0.14796

CPU times: user 5min 4s, sys: 0 ns, total: 5min 4s
Wall time: 5min 4s


In [16]:
def create_submission_files(preds, notebook='03'):
    for col in preds.columns:
        sub = sample_sub.copy()
        # inverse transform for log-transformed target and clipping based on train data range
        sub[TARGET] = np.expm1(preds[col]).clip(1, 29)
        sub.to_csv(f'nb{notebook}_{col}.csv', index=False)

In [17]:
create_submission_files(tp)

In [18]:
!head nb03_mean.csv

id,Rings
90615,9.840484045482214
90616,9.781911789575407
90617,9.847144738039194
90618,10.50497144216985
90619,7.579552720829909
90620,9.369105518499326
90621,10.737770948346359
90622,6.165948974136031
90623,7.926124609060306
