In [1]:
%%capture
!pip install --upgrade autogluon ipywidgets

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

In [3]:
# Folder holding the competition CSV files.
DATA_DIR = '/kaggle/input/bitgrit-crypto-price-prediction'

# Read the training set, the test set and the submission template in one pass.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'solution_format.csv')
)

In [4]:
# Add a 0/1 indicator telling whether feature_10_F is missing in each row
# (feature_10_F itself is left out of the model inputs further down).
train = train.assign(
    feature_10_F_missing=train['feature_10_F'].isna().astype('int'))
test = test.assign(
    feature_10_F_missing=test['feature_10_F'].isna().astype('int'))

In [5]:
# Name of the label column.
TARGET = 'Target'

# Model inputs: every test column except the row identifier and the raw
# feature_10_F column.
FEATURES = [
    column
    for column in test.columns
    if column != 'ID' and column != 'feature_10_F'
]

In [6]:
# Numbered feature families 1..9 with the _F and _A suffixes.
feats_F = [f'feature_{n}_F' for n in range(1, 10)]
feats_A = [f'feature_{n}_A' for n in range(1, 10)]

# Columns to be handled as categorical: the explicitly listed ones first,
# followed by the _A family and then the _F family.
CAT_FEATURES = [
    'TR_1_EventInd', 'TR_2_EventInd', 'TR_3_EventInd',
    'feature_X_A', 'feature_10_A', 'feature_10_F_missing', 'feature_10_G',
    *feats_A,
    *feats_F,
]

# Cast those columns to the pandas 'category' dtype in both frames.
train = train.astype({name: 'category' for name in CAT_FEATURES})
test = test.astype({name: 'category' for name in CAT_FEATURES})

In [7]:
%%time

hours = 11
time_limit = 60 * 60 * hours

problem_type = 'binary'
metric = 'f1'

predictor = TabularPredictor(
    label=TARGET,
    problem_type=problem_type,
    eval_metric=metric,
    path='/kaggle/working/models')

predictor.fit(
    train[FEATURES + [TARGET]],
    time_limit=time_limit,
    presets='best_quality')

Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 39600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: /kaggle/working/models/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 9975 seconds for the sub-fit(s) during dynamic stacking.
Time left for full fit of AutoGluon: 29625 se

CPU times: user 5min 20s, sys: 44.9 s, total: 6min 5s
Wall time: 4h 52min 51s


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7947d026aef0>

In [8]:
%%time
preds = predictor.predict(test[FEATURES])

INFO:sklearnex: sklearn.neighbors.KNeighborsClassifier.predict_proba: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsClassifier.predict_proba: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU


CPU times: user 5.87 s, sys: 167 ms, total: 6.03 s
Wall time: 3.94 s


In [9]:
def create_submission_files(preds, config, notebook='02'):
    """Write predictions to a submission CSV named ``nb{notebook}_{config}.csv``.

    A copy of the module-level ``sample_sub`` template is filled with the
    predictions in its ``TARGET`` column and saved, without the index, to the
    current working directory. ``sample_sub`` itself is not modified.

    Args:
        preds: Predictions, one per row of ``sample_sub``, in the same row
            order as the template.
        config: Label describing the run; becomes part of the file name.
        notebook: Notebook identifier used as the file-name prefix.

    Raises:
        ValueError: If ``preds`` is a pandas Series whose length differs from
            the number of rows in ``sample_sub``.
    """
    if isinstance(preds, pd.Series):
        if len(preds) != len(sample_sub):
            raise ValueError(
                f'Expected {len(sample_sub)} predictions, got {len(preds)}')
        # Column assignment from a Series aligns on index labels; drop the
        # index so rows are matched by position instead of silently turning
        # into NaN (or being reordered) when the indexes differ.
        preds = preds.to_numpy()

    sub = sample_sub.copy()
    sub[TARGET] = preds
    sub.to_csv(f'nb{notebook}_{config}.csv', index=False)

In [10]:
# Label for this run, built from the metric and time budget chosen for
# training so the file name cannot drift from the actual settings.
config = f'autogluon_{metric}_{hours}hrs'
create_submission_files(preds, config)

In [11]:
!head nb02_autogluon_f1_11hrs.csv

ID,Target
2661,1
2662,1
2663,1
2664,1
2665,1
2666,1
2667,1
2668,1
2669,1
