#### Please upvote if you find the notebook useful

# Step 0.0. Install LightAutoML

In [None]:
pip install -U lightautoml

# Step 0.1. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time
import joblib

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

# Step 0.2. Parameters 

In [None]:
# --- Compute budget ---
N_THREADS = 4           # number of threads for the LightGBM and linear models
TIMEOUT = 3600          # base AutoML time budget in seconds (training cells scale it)

# --- Validation / reproducibility settings ---
N_FOLDS = 5             # number of folds for AutoML
RANDOM_STATE = 42       # fixed random state
TEST_SIZE = 0.2         # test size for the metric check

# --- Data / run mode ---
TARGET_NAME = 'target'  # name of the label column
LOAD_PRETRAINED = True  # True: skip training and load precomputed predictions instead

# Step 0.3. Data load 

In [None]:
%%time

train_data = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
train_data[TARGET_NAME] = train_data[TARGET_NAME].str.slice(start=6).astype(int) - 1
train_data.head()

In [None]:
# Load the test table and preview its first rows.
test_data = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
test_data.head()

In [None]:
# Load the sample submission; every column after the first is overwritten with
# the blended predictions in the "Prepare submission" step.
submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')
submission.head()

# Step 0.5. Add new features

In [None]:
def create_gr_feats(data):
    """Placeholder for additional feature engineering on the combined frame.

    Currently a no-op: the body is empty, so ``data`` is left unchanged and
    ``None`` is returned. The caller discards the return value, so a real
    implementation is expected to add its columns to ``data`` in place.

    Args:
        data: DataFrame holding the train rows followed by the test rows.
    """
    pass


if not LOAD_PRETRAINED:
    n_train = len(train_data)
    # Remember whether the test frame already had a target column, so that only
    # a column created by the concat below gets removed again.
    test_had_target = TARGET_NAME in test_data.columns

    all_df = pd.concat([train_data, test_data]).reset_index(drop=True)
    create_gr_feats(all_df)

    # Split back into independent frames (not slices of all_df); the test index
    # is restarted from 0 instead of continuing after the train rows.
    train_data = all_df.iloc[:n_train].copy()
    test_data = all_df.iloc[n_train:].reset_index(drop=True)

    if not test_had_target:
        # The concat filled the missing target of the test rows with NaN, which
        # upcasts the whole column to float: restore integer labels on train and
        # drop the all-NaN column from test.
        train_data[TARGET_NAME] = train_data[TARGET_NAME].astype(int)
        test_data = test_data.drop(columns=[TARGET_NAME])

    print(train_data.shape, test_data.shape)

In [None]:
# Preview the training frame (the feature step above only runs when LOAD_PRETRAINED is False).
train_data.head()

# ========= AutoML preset usage =========


## Step 1. Create Task

In [None]:
%%time

task = Task('multiclass',)

## Step 2. Setup columns roles

In [None]:
%%time

roles = {
    'target': TARGET_NAME,
    'drop': ['id'],
}

## Step 3. Train on full data (variant 1)

In [None]:
%%time 
# Model training
if not LOAD_PRETRAINED:
    automl = TabularUtilizedAutoML(task = task, 
                                   timeout = 3 * TIMEOUT,
                                   cpu_limit = N_THREADS,
                                   reader_params = {'n_jobs': N_THREADS},
                                   configs_list=[
                                       '../input/lightautoml-configs/conf_0_sel_type_0.yml',
                                       '../input/lightautoml-configs/conf_1_sel_type_1.yml'
                                   ])
    oof_pred_1 = automl.fit_predict(train_data, roles = roles)
    print('oof_pred:\n{}\nShape = {}'.format(oof_pred_1[:10], oof_pred_1.shape))

In [None]:
%%time

# Fast feature importances calculation
if not LOAD_PRETRAINED:
    fast_fi_1 = automl.get_feature_scores('fast', silent = False)
    fast_fi_1.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

In [None]:
%%time
# Model prediction
if not LOAD_PRETRAINED:
    test_pred_1 = automl.predict(test_data)
    print('Prediction for test data:\n{}\nShape = {}'.format(test_pred_1[:10], test_pred_1.shape))

    print('Check scores...')
    print('OOF score: {}'.format(log_loss(train_data[TARGET_NAME].values, oof_pred_1.data)))

## Step 4. Train on full data (variant 2)

In [None]:
%%time 
# Model training
if not LOAD_PRETRAINED:
    automl = TabularUtilizedAutoML(task = task, 
                                   timeout = 5 * TIMEOUT,
                                   cpu_limit = N_THREADS,
                                   reader_params = {'n_jobs': N_THREADS},
                                   configs_list=[
                                       '../input/lightautoml-configs/conf_4_sel_type_0_no_int.yml',
                                       '../input/lightautoml-configs/conf_5_sel_type_1_tuning_full.yml',
                                       '../input/lightautoml-configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml'
                                   ])
    oof_pred_2 = automl.fit_predict(train_data, roles = roles)
    print('oof_pred:\n{}\nShape = {}'.format(oof_pred_2[:10], oof_pred_2.shape))

In [None]:
%%time

# Fast feature importances calculation
if not LOAD_PRETRAINED:
    fast_fi_2 = automl.get_feature_scores('fast', silent = False)
    fast_fi_2.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

In [None]:
%%time
# Model prediction
if not LOAD_PRETRAINED:
    test_pred_2 = automl.predict(test_data)
    print('Prediction for test data:\n{}\nShape = {}'.format(test_pred_2[:10], test_pred_2.shape))

    print('Check scores...')
    print('OOF score: {}'.format(log_loss(train_data[TARGET_NAME].values, oof_pred_2.data)))

## Step 5. Ensembling

In [None]:
if not LOAD_PRETRAINED:
    # Grid-search the blend weight w in [0, 1] (101 points, step 0.01) that
    # minimises the log-loss of  w * oof_pred_1 + (1 - w) * oof_pred_2
    # on the out-of-fold predictions.
    y_true = train_data[TARGET_NAME].values  # loop-invariant, computed once
    best_score = np.inf  # any finite log-loss improves on this
    best_w = None
    # linspace gives an exact point count and exact end points, unlike a
    # float-step arange whose length depends on rounding.
    for w in np.linspace(0, 1, 101):
        preds = w * oof_pred_1.data + (1 - w) * oof_pred_2.data
        sc = log_loss(y_true, preds)
        if sc < best_score:
            best_score = sc
            best_w = w
            # Trace every improvement found during the scan.
            print(best_w, best_score)

    print('=' * 30)
    print(best_w, best_score)

In [None]:
if not LOAD_PRETRAINED:
    # Blend the two models' test predictions using the weight selected on the OOF predictions.
    test_preds = best_w * test_pred_1.data + (1 - best_w) * test_pred_2.data

In [None]:
if LOAD_PRETRAINED:
    # Load the OOF and test predictions of both models computed in v10 of this kernel,
    # then blend the two LightAutoML test predictions as an equal-weight average.
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
    # only load archives from a trusted source.
    oof_pred_1, oof_pred_2, test_pred_1, test_pred_2 = joblib.load('../input/tps-may-computed-data/oof_and_test_v10.pkl')
    test_preds = 0.5 * test_pred_1.data + 0.5 * test_pred_2.data

## Step 6. Prepare submission

In [None]:
# Sanity check before the positional assignment: one prediction row per
# submission row and one prediction column per column after the first.
expected_shape = (len(submission), submission.shape[1] - 1)
assert test_preds.shape == expected_shape, (
    'test_preds has shape {}, expected {}'.format(test_preds.shape, expected_shape)
)

# Overwrite every column except the first with the blended predictions and save.
submission.iloc[:, 1:] = test_preds
submission.to_csv('lightautoml_2variants_ensemble.csv', index=False)

In [None]:
# Display the filled-in submission as a final visual check.
submission

In [None]:
if not LOAD_PRETRAINED:
    # Save the freshly computed OOF and test predictions of both models as one tuple
    # (presumably so a later run can load them instead of retraining — confirm).
    joblib.dump((oof_pred_1, oof_pred_2, test_pred_1, test_pred_2), 'saved_oof_and_preds_from_ensemble.pkl')