# Step 0.0. Install LightAutoML

In [None]:
pip install -U lightautoml

# Step 0.1. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler

# Step 0.2. Parameters 

In [None]:
# Number of threads for LightGBM and linear models
N_THREADS = 4
# Number of folds for AutoML
N_FOLDS = 5
# Fixed random state used for seeding and for the train/hold-out split
RANDOM_STATE = 42
# Fraction of the training data held out for the metric check
TEST_SIZE = 0.2
# Time budget for one AutoML run, in seconds (one hour)
TIMEOUT = 60 * 60

# Step 0.3. Fix torch number of threads and numpy seed 

In [None]:
# Cap the number of threads torch may use.
torch.set_num_threads(N_THREADS)
# Seed numpy's global random generator.
np.random.seed(RANDOM_STATE)

# Step 0.4. Data load 

In [None]:
%%time

# Training table of the competition (contains the `target` column).
train_data = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/train.csv'
)
train_data.head(n=5)

In [None]:
# Test table for which the final predictions are produced.
test_data = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/test.csv'
)
test_data.head(n=5)

In [None]:
# Sample submission; its `target` column is overwritten later with predictions.
submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/sample_submission.csv'
)
submission.head(n=5)

# Step 0.5. Extra features generation

In [None]:
def add_specific_feats(data):
    """Return a copy of ``data`` extended with pairwise ``cont*`` interaction features.

    Every new column combines two existing ``cont*`` columns with a single
    arithmetic operation; the operation is encoded in the column name
    (``mul`` = product, ``rat`` = ratio, ``min`` = difference, ``pls`` = sum).

    The input frame is NOT modified: a new frame is returned, so the raw data
    shown by earlier cells stays intact and re-running the cell is safe.

    Args:
        data: DataFrame containing the columns ``cont0``-``cont6`` and
            ``cont8``-``cont10``.

    Returns:
        A new DataFrame holding the original columns followed by the 18
        engineered columns, in the order listed in ``feature_specs``.

    Raises:
        KeyError: If one of the required ``cont*`` columns is missing.
    """
    # (new column name, left operand column, Series method, right operand column)
    feature_specs = [
        ('f1_mul_f2', 'cont1', 'mul', 'cont2'),
        ('f5_mul_f8', 'cont5', 'mul', 'cont8'),
        ('f0_mul_f2', 'cont0', 'mul', 'cont2'),
        ('f1_mul_f5', 'cont1', 'mul', 'cont5'),

        ('f3_mul_f4', 'cont3', 'mul', 'cont4'),
        ('f5_rat_f8', 'cont5', 'div', 'cont8'),
        ('f1_min_f2', 'cont1', 'sub', 'cont2'),
        ('f1_pls_f6', 'cont1', 'add', 'cont6'),

        ('f4_min_f8', 'cont4', 'sub', 'cont8'),
        ('f4_mul_f8', 'cont4', 'mul', 'cont8'),
        ('f2_mul_f4', 'cont2', 'mul', 'cont4'),
        ('f3_rat_f9', 'cont3', 'div', 'cont9'),

        ('f6_rat_f10', 'cont6', 'div', 'cont10'),
        ('f5_pls_f8', 'cont5', 'add', 'cont8'),
        ('f2_pls_f6', 'cont2', 'add', 'cont6'),
        ('f6_mul_f8', 'cont6', 'mul', 'cont8'),

        ('f6_min_f8', 'cont6', 'sub', 'cont8'),
        ('f0_pls_f10', 'cont0', 'add', 'cont10'),
    ]

    # All operands are original columns, so every feature can be computed from
    # the untouched input. Ratio features are deliberately left unguarded
    # (as before): a zero denominator yields inf/NaN per pandas semantics.
    new_columns = {
        name: getattr(data[left], method)(data[right])
        for name, left, method, right in feature_specs
    }

    # `assign` works on a copy, leaving the caller's frame unchanged.
    return data.assign(**new_columns)

# Run the same feature generation on both tables so they expose the same columns.
test_data = add_specific_feats(test_data)
train_data = add_specific_feats(train_data)

# Step 0.6. Data splitting for train-test 

In [None]:
# Stratified split on the target; TEST_SIZE of the rows go to the hold-out part.
tr_data, te_data = train_test_split(
    train_data,
    test_size=TEST_SIZE,
    stratify=train_data['target'],
    random_state=RANDOM_STATE,
)
print(
    'Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(
        tr_data.shape,
        te_data.shape,
    )
)

# ========= AutoML preset usage =========


## Step 1. Create Task

In [None]:
%%time

# Binary classification task; LightAutoML uses AUC as its default metric here.
task = Task('binary')

## Step 2. Setup columns roles

In [None]:
%%time

# Column roles: `target` is the label to predict, `id` is dropped from the features.
roles = dict(target='target', drop=['id'])

## Step 3. Create AutoML from preset and train on 80% of data

In [None]:
%%time 

# Build the tabular preset restricted to the listed algorithms.
# N_FOLDS and RANDOM_STATE are forwarded to the reader so that the values from
# the parameters cell actually drive the cross-validation split
# (previously they were declared but never used).
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
    reader_params={
        'n_jobs': N_THREADS,
        'cv': N_FOLDS,
        'random_state': RANDOM_STATE,
    },
)

# Fit on the 80% training part; the result is kept as out-of-fold predictions.
oof_pred = automl.fit_predict(tr_data, roles=roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

## Step 4. Predict on validation data and check scores

In [None]:
%%time

# Predict for the hold-out part that was excluded from fitting.
test_pred = automl.predict(te_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
# Column 0 of the prediction array is scored against the true labels.
oof_auc = roc_auc_score(tr_data['target'].values, oof_pred.data[:, 0])
print('OOF score: {}'.format(oof_auc))
holdout_auc = roc_auc_score(te_data['target'].values, test_pred.data[:, 0])
print('TEST score: {}'.format(holdout_auc))

## Step 5. Create AutoML with time utilization 

Below we create a specific AutoML preset for TIMEOUT utilization (it tries to spend as much of the time budget as possible):

In [None]:
%%time 

# Time-utilizing preset restricted to the listed algorithms.
# N_FOLDS and RANDOM_STATE are forwarded to the reader so that the values from
# the parameters cell are actually used (previously they were declared but
# never passed to AutoML).
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
    reader_params={
        'n_jobs': N_THREADS,
        'cv': N_FOLDS,
        'random_state': RANDOM_STATE,
    },
)

# Fit on the 80% training part; the result is kept as out-of-fold predictions.
oof_pred = automl.fit_predict(tr_data, roles=roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

## Step 6. Predict on validation data and check scores for utilized automl

In [None]:
%%time

# Predict for the hold-out part with the time-utilized model.
test_pred = automl.predict(te_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
# Column 0 of the prediction array is scored against the true labels.
oof_auc = roc_auc_score(tr_data['target'].values, oof_pred.data[:, 0])
print('OOF score: {}'.format(oof_auc))
holdout_auc = roc_auc_score(te_data['target'].values, test_pred.data[:, 0])
print('TEST score: {}'.format(holdout_auc))

## Step 7. Train on full data 

In [None]:
%%time 

# Final model: same time-utilizing preset, now fitted on the full training table.
# N_FOLDS and RANDOM_STATE are forwarded to the reader so that the values from
# the parameters cell are actually used (previously they were declared but
# never passed to AutoML).
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
    reader_params={
        'n_jobs': N_THREADS,
        'cv': N_FOLDS,
        'random_state': RANDOM_STATE,
    },
)

# Fit on all of train_data; the result is kept as out-of-fold predictions.
oof_pred = automl.fit_predict(train_data, roles=roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

## Step 8. Predict for test data and check OOF score

In [None]:
%%time

# Predict for the competition test table (no labels, so only OOF can be scored).
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
# Column 0 of the out-of-fold array is scored against the full training labels.
oof_auc = roc_auc_score(train_data['target'].values, oof_pred.data[:, 0])
print('OOF score: {}'.format(oof_auc))

## Step 9. Prepare submission

In [None]:
# Overwrite the placeholder target with column 0 of the test predictions
# (rows are assigned positionally), then write the file without the index.
predicted_target = test_pred.data[:, 0]
submission['target'] = predicted_target
submission.to_csv('automl_utilized_3600_with_feats.csv', index=False)

In [None]:
# Display the final submission frame as a visual sanity check.
submission