# Step 0. Install LAMA

In [None]:
pip install lightautoml

# Step 0.1. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt
import pickle

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

#Import h2o automl
import h2o
from h2o.automl import H2OAutoML

# Step 0.2. Parameters 

In [None]:
# Compute and validation setup.
N_THREADS = 4           # number of threads for LightGBM and linear models
N_FOLDS = 3             # number of cross-validation folds used by AutoML
RANDOM_STATE = 42       # shared seed for every randomized step

# Experiment setup.
TEST_SIZE = 0.2         # fraction of rows held out for the metric check
TIMEOUT = 300           # AutoML time budget, in seconds
TARGET_NAME = 'TARGET'  # name of the target column

# Step 0.3. Fix torch number of threads and numpy seed 

In [None]:
# Cap the number of threads torch may use and seed numpy's global RNG.
# The two calls are independent of each other.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

# Step 0.4. Example data load 

In [None]:
%%time

data = pd.read_csv('../input/lama-datasets/sampled_app_train.csv')
data.head()

# Step 0.5. (Optional) Some user feature preparation 

The cell below shows some user feature preparation that makes the task more difficult (this block can be omitted if you don't want to change the initial data):

In [None]:
%%time

data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                    ).astype(str)

data['constant'] = 1
data['allnan'] = np.nan

data['report_dt'] = np.datetime64('2018-01-01')

data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

# Step 0.6. (Optional) Data splitting for train-test 

The block below can be omitted if you are only going to train the model or if you already have separate train and test files:

In [None]:
%%time

train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: train_data = {}, test_data = {}'
              .format(train_data.shape, test_data.shape))

In [None]:
# Preview the first five rows of the training part.
train_data.head(n=5)

#  ==== AutoML preset usage ====


## Step 1. Create Task

In [None]:
%%time

task = Task('binary', )

## Step 2. Setup columns roles

The roles setup here defines the target column and the base date, which is used to calculate date differences:

In [None]:
%%time

roles = {'target': TARGET_NAME,
         DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
         }

## Step 3. Create AutoML from preset

To create AutoML model here we use `TabularAutoML` preset.


All the parameters we set above can be passed to the preset to change its configuration:

In [None]:
%%time 
start = time.time()
automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       verbose=0)

RD = ReportDeco()
automl_rd = RD(automl)

oof_pred = automl_rd.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))
time_automl = time.time() - start

Save new config template:

In [None]:
# Save the config template to 'bb_config.yml'.
# NOTE(review): presumably this writes the preset's default config rather than the
# parameters passed above — confirm against the LightAutoML documentation.
automl.get_config(path='bb_config.yml')

In [None]:
# Serialize the fitted preset (the unwrapped `automl` object, not the report wrapper) to disk.
with open('automl.pickle', 'wb') as pickle_file:
    pickle.dump(automl, pickle_file)

## Step 4. Predict to test data and check scores

In [None]:
%%time

test_pred = automl_rd.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values,
                                           oof_pred.data[:, 0])))
test_automl = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
print('TEST score: {}'.format(test_automl))


## Step 5. Same Preset with less available time.

In [None]:
%%time 
start = time.time()
automl = TabularAutoML(task = task, 
                       timeout = 20,
                       cpu_limit = N_THREADS,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       verbose=0)


oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))
time_automl_fast = time.time() - start

The OOF predictions now contain NaNs because not all folds were calculated, so we omit the OOF score.

In [None]:
%%time

test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

print('Check scores...')
test_automl_fast = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
print('TEST score: {}'.format(test_automl_fast))

## Step 6. Create AutoML with time utilization

Below we create a specific AutoML preset for TIMEOUT utilization (it tries to use as much of the available time as possible):

In [None]:
%%time
start = time.time()
automl = TabularUtilizedAutoML(task = task,
                       timeout = TIMEOUT,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       verbose=0)
oof_pred = automl.fit_predict(train_data, roles = roles)
time_automl_utilized = time.time() - start

In [None]:
%%time

test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values,
                                           oof_pred.data[:, 0])))
test_automl_utilized = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
print('TEST score: {}'.format(test_automl_utilized))

Let's compare results:

In [None]:
# One (label, elapsed seconds, test ROC AUC) row per preset run, in plotting order.
runs = [
    ('fast', time_automl_fast, test_automl_fast),
    ('base', time_automl, test_automl),
    ('utilized', time_automl_utilized, test_automl_utilized),
]
labels = [label for label, _, _ in runs]
times = [elapsed for _, elapsed, _ in runs]
scores = [score for _, _, score in runs]

def plot_bar(labels, times, scores):
    """Plot scores as hatched bars and run times as a line on a twin y-axis.

    Args:
        labels: Names of the compared runs; used as x tick labels.
        times: Run time in seconds for each label, drawn on the right axis.
        scores: ROC AUC score for each label, drawn on the left axis.

    The three sequences are expected to have the same length. The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots(figsize=[7, 5])
    rects = ax.bar(x, scores, width, label='Score', hatch="///", edgecolor="#034569", color='none')

    ax2 = ax.twinx()  # second axes sharing the same x-axis, used for the time line
    color = '#FF8B00'
    ax2.plot(x, times, color=color, label='Time', marker='o')
    ax2.set_ylabel('Time, seconds', color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.grid(False)
    ax2.set_ylim(np.min(times) * 0.8, np.max(times) * 1.1)

    ax.set_ylabel('Score, ROC AUC')
    ax.set_title('Score by available time')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    # Narrow y-range around the observed scores so small differences stay visible.
    ax.set_ylim(np.min(scores) - 0.002, np.max(scores) + 0.002)
    ax.grid(False)
    ax.spines['top'].set_visible(False)
    ax2.spines['top'].set_visible(False)

    # Annotate every bar with its score rounded to three decimals.
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(np.round(height, 3)),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

    # A bare plt.legend() only sees the current (twin) axes and would list the
    # 'Time' line alone, so collect the entries of both axes into one legend.
    score_handles, score_labels = ax.get_legend_handles_labels()
    time_handles, time_labels = ax2.get_legend_handles_labels()
    ax2.legend(score_handles + time_handles, score_labels + time_labels)

    plt.tight_layout()
    plt.show()
    
# Draw the comparison of the three preset runs.
plot_bar(labels=labels, times=times, scores=scores)

## Step 7. H2O? 

In [None]:
# nthreads: number of threads when launching a new H2O server.
# max_mem_size: memory limit, in gigabytes.
# NOTE(review): nthreads=-1 is not tied to N_THREADS, so H2O may get a different
# CPU budget than the LightAutoML presets — confirm this is intended for the comparison.
h2o.init(nthreads=-1, max_mem_size=12)

In [None]:
# Manual wall-clock timer; it spans frame conversion and AutoML training.
start = time.time()

X_y_train_h = h2o.H2OFrame(train_data)
# Column types of the training frame; reused when the test frame is built for prediction.
types = X_y_train_h.types

# NOTE(review): the target column is not converted with .asfactor(), so H2O presumably
# treats it as a numeric target — confirm this is the intended setup.
aml = H2OAutoML(max_runtime_secs=TIMEOUT,  # same time budget as the LightAutoML presets
                max_models=None,  # no limit
                seed=RANDOM_STATE)
# Every column except the target is used as a feature; sorted for a stable order.
cols = sorted(set(train_data.columns) - {TARGET_NAME})
aml.train(x=cols, y=TARGET_NAME, training_frame=X_y_train_h)

time_h2o = time.time() - start

In [None]:
# Build the test frame with the column types captured from the training frame.
test_frame_h = h2o.H2OFrame(test_data, column_types=types)
h2o_predict = aml.predict(test_frame_h)

# The first column of the prediction frame is scored against the true target.
h2o_pred_values = h2o_predict.as_data_frame().values[:, 0]
test_h2o = roc_auc_score(test_data[TARGET_NAME].values, h2o_pred_values)


In [None]:
# Build new lists instead of appending in place, so that re-running this cell
# does not add a duplicate 'h2o' entry to the comparison.
labels_with_h2o = labels + ['h2o']
times_with_h2o = times + [time_h2o]
scores_with_h2o = scores + [test_h2o]

plot_bar(labels_with_h2o, times_with_h2o, scores_with_h2o)

## Step 8. Report

* Report for base TabularAutoML is [here](./lama_report/lama_interactive_report.html)