In [1]:
%%capture
!pip install --upgrade optuna_integration

In [2]:
# Imports and notebook-wide configuration.
import gc
gc.enable()  # make sure automatic garbage collection is on

import warnings
# NOTE(review): this silences *every* warning, including library deprecation
# warnings that may point at real problems; consider narrowing the filter.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)  # never truncate columns when displaying frames

# `lgb` is Optuna's LightGBM integration module; below it provides
# `Dataset`, `early_stopping` and `LightGBMTunerCV`.
import optuna.integration.lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.base import clone
from lightgbm import LGBMRegressor

# Single seed reused for the CV splitter, the tuner and the LightGBM parameters.
SEED = 2024

In [3]:
# Directory holding the competition files (train / test / sample submission).
DATA_DIR = '/kaggle/input/playground-series-s4e5'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Additional dataset that is later appended to `train`.
# NOTE(review): presumably the original data the competition set was derived from — confirm.
original = pd.read_csv('/kaggle/input/flood-prediction-factors/flood.csv')

In [4]:
# Name of the label column: used as the training label and as the prediction
# column written to the submission files.
TARGET = 'FloodProbability'

In [5]:
# Drop the row identifier so it is not picked up as a feature (the feature list
# is later taken from `test.columns`). `errors='ignore'` keeps this cell
# idempotent: re-running it after `id` is already gone is a no-op instead of
# raising KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [6]:
# Stack the rows of `original` underneath `train` and renumber the index from 0,
# giving a single frame to tune and cross-validate on.
extended = pd.concat((train, original), ignore_index=True)

In [7]:
# Sanity check: `extended` should have as many rows as `train` and `original`
# combined; in the recorded output `test` has one column fewer than the training
# frames (presumably the missing target column).
train.shape, original.shape, extended.shape, test.shape

((1117957, 21), (50000, 21), (1167957, 21), (745305, 20))

In [8]:
# Every column of `test` is a model input; no column is declared categorical.
features = test.columns.tolist()
cat_features = list()

In [9]:
# Wrap the extended training data (inputs first, label second) in a LightGBM
# Dataset; feature names and the (empty) categorical list are attached here so
# the tuner receives them together with the data.
dtrain = lgb.Dataset(
    extended[features],
    extended[TARGET],
    feature_name=features,
    categorical_feature=cat_features,
)

In [10]:
# Fixed LightGBM parameters handed to the tuner as its starting `params`.
base_params = dict(
    objective='cross_entropy',
    metric='cross_entropy',
    learning_rate=0.1,
    boosting_type='gbdt',
    force_row_wise=True,
    verbosity=-1,
    n_jobs=-1,
    deterministic=True,
    random_state=SEED,
)

# Early-stopping callback shared by the tuner and by `custom_cv`: training stops
# once the first metric has gone 100 rounds without improving by more than
# `min_delta`.
# NOTE(review): the recorded tuning trials differ from each other by roughly
# 1e-4 in validation score, which is below min_delta=2e-4 — confirm this
# threshold is not too coarse for the cross-entropy metric.
early_stopping = lgb.early_stopping(
    100,
    first_metric_only=True,
    verbose=False,
    min_delta=2e-4,
)

In [11]:
BUDGET = 60 * 60 * 11  # tuning time budget: 11 hours, in seconds
NUM_FOLDS = 10         # number of CV folds (also used by `custom_cv`)

# Cross-validated LightGBM tuner.
# `feature_name` / `categorical_feature` are deliberately NOT passed here: they
# are already attached to `dtrain`, and handing them through the CV/training
# call as well is redundant and deprecated in LightGBM 4.x (the blanket
# warnings filter at the top of the notebook would hide that warning).
tuner = lgb.LightGBMTunerCV(
    time_budget=BUDGET,
    optuna_seed=SEED,
    params=base_params,
    train_set=dtrain,
    num_boost_round=10000,  # upper bound only; early stopping picks the actual count
    folds=KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED),
    seed=SEED,
    callbacks=[early_stopping])

[I 2024-05-01 02:13:00,879] A new study created in memory with name: no-name-07bebb7a-2514-46d3-a988-5dd80e275a4c


In [12]:
%%time
# Run the hyperparameter search configured above. This is the expensive cell:
# the recorded run took about 6h22m of wall time.
tuner.run()

feature_fraction, val_score: 0.688737:  14%|#4        | 1/7 [05:09<30:57, 309.54s/it][I 2024-05-01 02:18:10,457] Trial 0 finished with value: 0.6887372164884943 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.6887372164884943.
feature_fraction, val_score: 0.688737:  29%|##8       | 2/7 [10:32<26:27, 317.55s/it][I 2024-05-01 02:23:33,614] Trial 1 finished with value: 0.6888695307031173 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.6887372164884943.
feature_fraction, val_score: 0.688737:  43%|####2     | 3/7 [16:51<23:01, 345.43s/it][I 2024-05-01 02:29:52,230] Trial 2 finished with value: 0.6887805311300156 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.6887372164884943.
feature_fraction, val_score: 0.688737:  57%|#####7    | 4/7 [22:07<16:42, 334.02s/it][I 2024-05-01 02:35:08,760] Trial 3 finished with value: 0.6889249496306895 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with valu

CPU times: user 20h 29min 49s, sys: 36min 35s, total: 21h 6min 25s
Wall time: 6h 22min 1s





In [13]:
def comp_metric(y_true, y_pred):
    """Return the R^2 (coefficient of determination) of `y_pred` against `y_true`."""
    score = r2_score(y_true=y_true, y_pred=y_pred)
    return score

def custom_cv(estimator, seed=SEED, verbose=True):
    """Cross-validate `estimator` on `extended` and predict `test` with every fold model.

    A fresh clone of `estimator` is fitted on each of the `NUM_FOLDS` shuffled
    K-fold splits, using the held-out part as the early-stopping evaluation set.

    Parameters
    ----------
    estimator : scikit-learn compatible LightGBM estimator
        Unfitted template; it is cloned for every fold and never fitted itself.
    seed : int
        Random state of the K-fold splitter.
    verbose : bool
        When true, print the score and best iteration of every fold. The
        summary lines (average and out-of-fold score) are always printed.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold prediction for every row of `extended`, in row order.
    test_preds : pd.DataFrame
        One column of test predictions per fold (`fold0`, `fold1`, ...) plus a
        `mean` column holding their row-wise average.
    """
    # Slice the inputs once instead of re-slicing the full frame in every fold.
    X, y = extended[features], extended[TARGET]
    X_test = test[features]

    # Preallocated buffer: each row is written exactly once, by the fold in
    # which it is held out. This avoids building a per-row Python dict.
    oof = np.full(len(extended), np.nan)
    test_preds = {}
    scores = []

    cv = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(X)):
        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]

        model = clone(estimator)
        # NOTE(review): the module-level `early_stopping` callback is reused for
        # every fold; this relies on LightGBM re-initialising the callback state
        # at the start of each fit — confirm for the installed version.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[early_stopping])

        val_preds = model.predict(X_val)
        oof[val_ids] = val_preds
        test_preds[f'fold{fold}'] = model.predict(X_test)

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>4} rounds)')
        _ = gc.collect()  # release the fold's temporaries before the next fit

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1) # mean of fold-wise predictions

    oof_preds = pd.Series(oof)
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(extended[TARGET], oof_preds):.5f}\n')

    return oof_preds, test_preds

In [14]:
%%time
model = LGBMRegressor(**tuner.best_params, n_estimators=10000)
op, tp = custom_cv(model)

Fold # 0: 0.83084 ( 182 rounds)
Fold # 1: 0.82782 ( 170 rounds)
Fold # 2: 0.82552 ( 170 rounds)
Fold # 3: 0.83751 ( 222 rounds)
Fold # 4: 0.80090 ( 124 rounds)
Fold # 5: 0.83460 ( 205 rounds)
Fold # 6: 0.81047 ( 137 rounds)
Fold # 7: 0.81410 ( 141 rounds)
Fold # 8: 0.82509 ( 166 rounds)
Fold # 9: 0.80768 ( 133 rounds)

Avg score: 0.82145 +/- 0.01173
OOF score: 0.82146

CPU times: user 15min 58s, sys: 2.05 s, total: 16min
Wall time: 16min


In [15]:
def create_submission_files(preds, notebook='02'):
    """Write one submission CSV per column of `preds`.

    For each column, the `TARGET` column of a copy of `sample_sub` is set to
    that column's values and the result is saved in the working directory as
    `nb<notebook>_<column>.csv`, without the index.
    """
    for col, values in preds.items():
        sub = sample_sub.assign(**{TARGET: values})
        sub.to_csv(f'nb{notebook}_{col}.csv', index=False)

In [16]:
# Write one CSV per fold plus one for the fold-averaged (`mean`) predictions.
create_submission_files(tp)

In [17]:
# Peek at the first lines of the fold-averaged submission file.
!head nb02_mean.csv

id,FloodProbability
1117957,0.5700424569158742
1117958,0.4606120742534915
1117959,0.46415030357303594
1117960,0.4689588728688152
1117961,0.4615332779893285
1117962,0.5053372728145092
1117963,0.539004799394006
1117964,0.5257604924359878
1117965,0.4698265405932124
