In [1]:
%%capture
!pip install --upgrade autogluon ipywidgets

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

In [3]:
# Directory holding the competition CSV files.
DATA_DIR = '/kaggle/input/playground-series-s4e5'

# Read the three competition tables in one pass: training rows, test rows,
# and the submission template.
train, test, sample_sub = map(pd.read_csv, (
    f'{DATA_DIR}/train.csv',
    f'{DATA_DIR}/test.csv',
    f'{DATA_DIR}/sample_submission.csv',
))

In [4]:
# Remove the `id` column from both frames so it is not used as a feature.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [5]:
# Name of the label column to predict.
TARGET = 'FloodProbability'
# Feature names are taken from the test frame's columns as they stand here.
features = test.columns.tolist()

In [6]:
# Engineered feature: row-wise total of all columns listed in `features`.
train['sum_all'] = train.loc[:, features].sum(axis='columns')
test['sum_all'] = test.loc[:, features].sum(axis='columns')

In [7]:
# Training budget in seconds (6.5 hours) and the evaluation metric.
time_limit = 6.5 * 60 * 60
metric = 'r2'

# Regression predictor for TARGET; it is configured to save under
# /kaggle/working/models.
predictor = TabularPredictor(
    path='/kaggle/working/models',
    label=TARGET,
    eval_metric=metric,
    problem_type='regression',
)

# Fit on the full training frame with the 'best_quality' preset. Left as the
# last expression so the cell displays the returned predictor.
predictor.fit(train, presets='best_quality', time_limit=time_limit)

Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 23400.0 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: /kaggle/working/models/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 6155 seconds for the sub-fit(s) during dynamic stacking.
Time left for full fit of AutoGluon: 17245 

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f22245525c0>

In [8]:
%%time
# Predict the target for the test frame with the fitted predictor. %%time
# reports the cost of inference (about 35 minutes of wall time in the
# recorded run).
preds = predictor.predict(test)

INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.predict: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.predict: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU


CPU times: user 1h 48min 31s, sys: 15.4 s, total: 1h 48min 46s
Wall time: 34min 36s


In [9]:
def create_submission_files(preds, notebook='03'):
    """Write clipped predictions to ``nb<notebook>.csv`` in submission format.

    Copies the module-level ``sample_sub`` frame, fills its ``TARGET`` column
    with ``preds`` limited to the range [0, 1], and saves it without the index.

    Predictions are assigned by position rather than by index label. Assigning
    a Series aligns on labels, so a ``preds`` index that differs from
    ``sample_sub``'s would silently produce NaN rows in the submission.

    Args:
        preds: One prediction per row of ``sample_sub``, in the same row order
            (Series, array, or list).
        notebook: Tag embedded in the output file name.

    Raises:
        ValueError: If ``preds`` is not one-dimensional with exactly one value
            per row of ``sample_sub``.
    """
    sub = sample_sub.copy()
    # Drop any index so the assignment below is purely positional.
    values = np.asarray(preds)
    if values.shape != (len(sub),):
        raise ValueError(
            f'Expected {len(sub)} predictions, got shape {values.shape}')
    sub[TARGET] = np.clip(values, 0, 1)
    sub.to_csv(f'nb{notebook}.csv', index=False)

In [10]:
# Write the submission for this notebook; the tag is spelled out explicitly
# so the output file name (nb03.csv) is visible at the call site.
create_submission_files(preds, notebook='03')

In [11]:
# Sanity check: show the first lines of the submission file written above.
!head nb03.csv

id,FloodProbability
1117957,0.5773626
1117958,0.4512708
1117959,0.44890797
1117960,0.471793
1117961,0.47135243
1117962,0.50706774
1117963,0.5371877
1117964,0.5279045
1117965,0.4729153
