In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -U lightautoml

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole, CategoryRole
from lightautoml.tasks import Task
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
# Load the train, test and sample-submission tables in one pass.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/cat-in-the-dat/train.csv',
        '../input/cat-in-the-dat/test.csv',
        '../input/cat-in-the-dat/sample_submission.csv',
    )
)

In [None]:
# Run configuration shared by the cells below.

# Thread count handed to the model parameter dictionaries.
N_THREADS = 4

# Number of cross-validation folds given to the AutoML reader.
N_FOLDS = 5

# Seed used for numpy and for the AutoML reader.
RANDOM_STATE = 42

# Hold-out fraction intended for a metric check.
TEST_SIZE = 0.2

# AutoML time budget in seconds (8 hours).
TIMEOUT = 8 * 3600

# Name of the target column.
TARGET_NAME = 'target'

# Seed numpy's global random generator.
np.random.seed(RANDOM_STATE)

In [None]:
# Display the column index of the training frame as loaded from CSV.
train_df.columns

In [None]:
def preprocess(df):
    """Fold ``day`` and ``month`` into a single string ``time`` column.

    The ``day`` and ``month`` values are cast to day and month timedeltas,
    added to the fixed anchor date 2018-01-01, and the result is stored as
    strings in a ``time`` column. The ``id``, ``day`` and ``month`` columns
    are then dropped.

    Args:
        df: DataFrame containing ``id``, ``day`` and ``month`` columns.

    Returns:
        A new DataFrame. ``df`` itself is left unmodified.
    """
    anchor = np.datetime64('2018-01-01')
    # NOTE(review): casting integers to timedelta64[D] / timedelta64[M] is
    # handled differently across pandas versions (a month is not a fixed-length
    # unit) — confirm the resulting values against the installed pandas.
    time = (anchor + df['day'].astype(np.dtype('timedelta64[D]')) + df['month'].astype(np.dtype('timedelta64[M]'))).astype(str)
    # assign() returns a new frame, so the caller's frame does not gain a
    # 'time' column as a side effect of calling this function.
    return df.assign(time=time).drop(columns=['id', 'day', 'month'])
# Apply the same preprocessing to the training frame and then the test frame.
train, test = preprocess(train_df), preprocess(test_df)

In [None]:
# Preview the first rows of the preprocessed training frame.
train.head()

In [None]:
# Display the columns that remain after preprocessing.
train.columns

In [None]:
# Binary task definition passed to the AutoML preset.
task = Task('binary')

# Column roles: the target, the synthetic 'time' column as a datetime feature,
# bin_0..bin_4 and nom_0..nom_9 as non-ordinal categories, and ord_0..ord_5 as
# ordinal categories.
roles = {
    'target': TARGET_NAME,
    DatetimeRole(
        base_date=True,
        seasonality=('m', 'd', 'wd', 'hour'),
        base_feats=True,
    ): 'time',
    CategoryRole(ordinal=False): (
        [f'bin_{i}' for i in range(5)] + [f'nom_{i}' for i in range(10)]
    ),
    CategoryRole(ordinal=True): [f'ord_{i}' for i in range(6)],
}

In [None]:
# Configure the utilized AutoML preset with a TIMEOUT-second budget.
# NOTE(review): newer LightAutoML releases appear to take `verbose` in
# fit_predict() rather than in the constructor — confirm against the installed
# version.
automl = TabularUtilizedAutoML(
    task=task,
    verbose=2,
    timeout=TIMEOUT,
    general_params={
        'nested_cv': False,
        'use_algos': [['linear_l2', 'lgb', 'lgb_tuned', 'cb', 'cb_tuned']],
    },
    reader_params={'cv': N_FOLDS, 'random_state': RANDOM_STATE},
    tuning_params={'max_tuning_iter': 100},
    # Optional GPU switch left by the author: 'device': 'gpu'
    lgb_params={'default_params': {'num_threads': N_THREADS}},
    # Optional GPU switch left by the author: 'task_type': 'GPU'
    cb_params={'default_params': {'thread_count': N_THREADS}},
)

# Fit on the preprocessed training frame and keep the returned predictions.
oof_pred = automl.fit_predict(train, roles=roles)


In [None]:
# ROC AUC of the predictions returned by fit_predict against the training target.
print(
    roc_auc_score(
        y_true=train[TARGET_NAME].to_numpy().ravel(),
        y_score=oof_pred.data.ravel(),
    )
)

In [None]:
# Cut-off above which a predicted score is counted as class 1.
thres = 0.5

# Per-class report for the thresholded training predictions.
print(
    classification_report(
        y_true=train[TARGET_NAME].to_numpy().ravel(),
        y_pred=(oof_pred.data.ravel() > thres).astype(int),
        digits=6,
    )
)

In [None]:
# Predict on the preprocessed test frame with the AutoML object fitted above.
test_pred = automl.predict(test)

In [None]:
# Replace the 'target' column of the sample submission with the flattened test
# predictions, then preview the first rows.
submission_df['target'] = test_pred.data.ravel()
submission_df.head()

In [None]:
# Write the submission to submission.csv in the working directory, omitting the
# DataFrame index.
submission_df.to_csv('submission.csv', index=False)