In [None]:
!pip install -U lightautoml

In [None]:
# Standard python libraries
import logging
import os
import time
import requests
# Configure the root logger: INFO level, each record prefixed with timestamp and level name.
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [None]:
# Load the merged dataset. The first CSV column is used as the index and is
# then parsed as day-first dates, which gives the frame a datetime index that
# later cells slice with date strings.
df_merged = (
    pd.read_csv('/kaggle/input/df-merged/df_merged.csv', index_col=0)
    .pipe(lambda frame: frame.set_index(pd.to_datetime(frame.index, dayfirst=True)))
)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df_merged.info()

In [None]:
# NOTE(review): superseded variant of the train/test split, kept commented out.
# It drops only 'abs_diff', 'diff_3', 'abs_diff_3' and 'T_a'; the active split
# further down additionally drops 'T_kal', 'T_sher', 'T_sar' and 'three_mean'.
# train_data_automl = df_merged['2012':'2019-03']
# train_data_automl = train_data_automl.drop(columns=['abs_diff','diff_3','abs_diff_3', 'T_a'])

# test_data_auto_ml = df_merged['2019-11':]
# test_data_auto_ml = test_data_auto_ml.drop(columns=['abs_diff','diff_3','abs_diff_3', 'T_a'])

In [None]:
# Keep only the rows whose 'abs_diff' is strictly greater than 1.
# Note that this rebinds df_merged, so every later cell sees the filtered frame.
df_merged = df_merged.loc[df_merged['abs_diff'].gt(1)]

In [None]:
# Columns removed from both splits before modelling (same list for train and test).
excluded_columns = ['abs_diff', 'diff_3', 'abs_diff_3', 'T_a', 'T_kal', 'T_sher', 'T_sar', 'three_mean']

# Date-based hold-out: train on 2012 .. 2019-03, evaluate on 2019-11 onwards.
# Rows dated 2019-04 .. 2019-10 end up in neither split.
train_data_automl = df_merged.loc['2012':'2019-03'].drop(columns=excluded_columns)
test_data_auto_ml = df_merged.loc['2019-11':].drop(columns=excluded_columns)

In [None]:
N_THREADS = 4 # CPU budget: passed as cpu_limit to the AutoML preset and as n_jobs to its reader
N_FOLDS = 5 # number of CV folds, passed to the reader as 'cv'
RANDOM_STATE = 42 # seed passed to the reader as 'random_state'
TEST_SIZE = 0.2 # NOTE(review): not referenced by any cell visible here; the train/test split is date-based
TIMEOUT = 1200 # time budget for the AutoML run, in seconds
TARGET_NAME = 'diff' # name of the target column

In [None]:
# Regression task ('reg') that uses MAE both as the training loss and as the reported metric.
task = Task('reg', loss='mae', metric='mae')

In [None]:
# Column roles handed to fit_predict: only the target column is named explicitly.
roles = {'target': TARGET_NAME}

In [None]:
# TabularUtilizedAutoML is used in place of TabularAutoML so that the run makes
# use of the TIMEOUT budget. Only the 'lgb' and 'lgb_tuned' algorithms are enabled.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
    general_params={'use_algos': [['lgb', 'lgb_tuned']]},
)

In [None]:
# Fit the preset on the training frame and keep the predictions it returns
# (named oof_pred, i.e. out-of-fold predictions), then log them with their shape.
oof_pred = automl.fit_predict(train_data_automl, roles=roles)
logging.info(
    'oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape)
)

In [None]:
# 'fast' feature scores from the fitted AutoML, drawn as a bar chart
# with one bar per feature.
fast_fi = automl.get_feature_scores('fast')
fast_fi.set_index('Feature').loc[:, 'Importance'].plot(kind='bar', figsize=(20, 10), grid=True)

In [None]:
# 'accurate' feature scores, computed against the hold-out test frame and drawn
# as a bar chart with one bar per feature.
# NOTE(review): silent=False presumably enables progress output — confirm in the LightAutoML docs.
accurate_fi = automl.get_feature_scores('accurate', test_data_auto_ml, silent=False)
accurate_fi.set_index('Feature').loc[:, 'Importance'].plot(kind='bar', figsize=(20, 10), grid=True)

In [None]:
# Hold-out evaluation: predict the test period and display its mean absolute error.
# The ground truth is read from the test frame via TARGET_NAME (the target column
# is not among the dropped columns) rather than re-slicing df_merged with a
# hard-coded 'diff', and the arguments follow sklearn's (y_true, y_pred) order.
test_pred = automl.predict(test_data_auto_ml)
mean_absolute_error(test_data_auto_ml[TARGET_NAME].values, test_pred.data[:, 0])

In [None]:
# Predict the hold-out period (repeated here so the cell can be run on its own),
# then log the predictions and the out-of-fold MAE on the training data.
test_pred = automl.predict(test_data_auto_ml)
logging.info('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

logging.info('Check scores...')
# The training frame in this notebook is train_data_automl; the name `train_data`
# is never defined and raised NameError here.
logging.info('OOF score: {}'.format(mean_absolute_error(train_data_automl[TARGET_NAME].values, oof_pred.data[:, 0])))