In [1]:
%%capture
!pip install --upgrade flaml[automl] ipywidgets

In [2]:
# Make sure automatic garbage collection is switched on.
import gc
gc.enable()

# Suppress every Python warning for the rest of the notebook.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from flaml import AutoML

# Single seed reused by the K-fold splitter and by the AutoML search settings.
SEED = 2024

In [3]:
# Directory holding the competition CSV files.
DATA_DIR = '/kaggle/input/playground-series-s4e5'

# Read the training and test tables (train first, then test).
train, test = (
    pd.read_csv(f'{DATA_DIR}/train.csv'),
    pd.read_csv(f'{DATA_DIR}/test.csv'),
)

In [4]:
# Remove the `id` column so it does not end up in the feature list, which is
# built from every column of `test`.
# `errors='ignore'` makes this cell idempotent: re-running it after `id` has
# already been dropped is a no-op instead of raising KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [5]:
# Column the model is trained to predict.
TARGET = 'FloodProbability'
# Every column present in the test frame is used as a model input.
features = list(test.columns)

In [6]:
NUM_FOLDS = 10               # number of cross-validation folds
TIME_BUDGET = 11 * 60 * 60   # search budget in seconds (11 hours)

# Keyword arguments forwarded to `AutoML.fit`.
automl_settings = dict(
    time_budget=TIME_BUDGET,
    task='regression',
    metric='r2',             # optimised metric
    ensemble=True,
    eval_method='cv',        # evaluate candidates with cross-validation
    # Shuffled K-fold splitter, seeded so the folds are reproducible.
    split_type=KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED),
    retrain_full=True,
    early_stop=True,
    seed=SEED,
)

In [7]:
%%time
automl = AutoML()
automl.fit(X_train=train[features], y_train=train[TARGET], **automl_settings)

[flaml.automl.logger: 05-01 04:46:50] {1680} INFO - task = regression
[flaml.automl.logger: 05-01 04:46:50] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 05-01 04:46:50] {1789} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 05-01 04:46:50] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 05-01 04:46:50] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 05-01 04:47:21] {2345} INFO - Estimated sufficient time budget=306846s. Estimated necessary time budget=2624s.
[flaml.automl.logger: 05-01 04:47:21] {2392} INFO -  at 35.4s,	estimator lgbm's best error=0.9631,	best estimator lgbm's best error=0.9631
[flaml.automl.logger: 05-01 04:47:21] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 05-01 04:47:55] {2392} INFO -  at 70.1s,	estimator lgbm's best error=0.9631,	best estimator lgbm's best error=0.9631
[flaml.automl.logger: 05-01 04:47:

In [8]:
%%time
# Predict the target for the test rows, using the same feature columns that were
# passed to `automl.fit`.
preds = automl.predict(test[features])

CPU times: user 1min 3s, sys: 28 ms, total: 1min 3s
Wall time: 1min 3s


In [9]:
# Start from the sample submission and overwrite its target column with the
# predictions, limited to the [0, 1] interval.
# NOTE(review): assumes the sample submission rows are in the same order as the
# test rows — `id` was dropped from `test`, so this is not checked here.
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
sub[TARGET] = np.clip(preds, 0, 1)

In [10]:
# Write the submission file without the DataFrame index column.
sub.to_csv('nb05.csv', index=False)

In [11]:
# Show the first lines of the written file as a quick sanity check.
!head nb05.csv

id,FloodProbability
1117957,0.5748599363928231
1117958,0.4550166805306189
1117959,0.45460316067748585
1117960,0.46600206360482055
1117961,0.4624786142811014
1117962,0.5050892358831723
1117963,0.5348989462359299
1117964,0.5282848940805784
1117965,0.4692253560551578
