In [None]:
from IPython.core.display import display, HTML, Javascript

# ----- Notebook Theme -----
# Teal palette: the accent colour first, then shades running from lightest to darkest.
color_map = [
    '#16a085',
    '#e8f6f3', '#d0ece7', '#a2d9ce', '#73c6b6', '#45b39d',
    '#16a085', '#138d75', '#117a65', '#0e6655', '#0b5345',
]

# The first two palette entries drive most of the theme; the darkest shade is kept for prompts.
main_color, strong_main_color = color_map[:2]
prompt = color_map[len(color_map) - 1]
custom_colors = [strong_main_color, main_color]

# CSS template for the notebook theme. It holds eleven positional `%s`
# placeholders (one free-form rule for #notebook-container, then colours) that
# are filled in when the stylesheet is written to notebook.css.
# NOTE(review): the colour names inside the CSS comments ("light orange",
# "pink", ...) do not match the teal values of `color_map`; they look like
# leftovers from another theme. They are part of the string, so they are kept.
css_file = ''' 

div #notebook {
background-color: white;
line-height: 20px;
}

#notebook-container {
%s
margin-top: 2em;
padding-top: 2em;
border-top: 4px solid %s; /* light orange */
-webkit-box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
    box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
}

div .input {
margin-bottom: 1em;
}

.rendered_html h1, .rendered_html h2, .rendered_html h3, .rendered_html h4, .rendered_html h5, .rendered_html h6 {
color: %s; /* light orange */
font-weight: 600;
}

div.input_area {
border: none;
    background-color: %s; /* rgba(229, 143, 101, 0.1); light orange [exactly #E58F65] */
    border-top: 2px solid %s; /* light orange */
}

div.input_prompt {
color: %s; /* light blue */
}

div.output_prompt {
color: %s; /* strong orange */
}

div.cell.selected:before, div.cell.selected.jupyter-soft-selected:before {
background: %s; /* light orange */
}

div.cell.selected, div.cell.selected.jupyter-soft-selected {
    border-color: %s; /* light orange */
}

.edit_mode div.cell.selected:before {
background: %s; /* light orange */
}

.edit_mode div.cell.selected {
border-color: %s; /* light orange */

}
'''
def to_rgb(h):
    """Convert a six-digit hex colour string (without the leading '#') to an (r, g, b) tuple of ints."""
    red, green, blue = (int(h[start:start + 2], 16) for start in (0, 2, 4))
    return red, green, blue

# Translucent (alpha 0.1) variant of the main colour, used as the input-area background.
main_color_rgba = 'rgba(%s, %s, %s, 0.1)' % (to_rgb(main_color[1:]))

# Values for the `%s` placeholders of `css_file`, in order of appearance.
css_values = (
    'width: 95%;',    # extra rule injected into #notebook-container
    main_color,       # #notebook-container border-top
    main_color,       # heading colour
    main_color_rgba,  # div.input_area background
    main_color,       # div.input_area border-top
    main_color,       # div.input_prompt
    prompt,           # div.output_prompt
    main_color,       # selected cell marker
    main_color,       # selected cell border
    main_color,       # edit-mode selected cell marker
    main_color,       # edit-mode selected cell border
)

# Use a context manager so the file is flushed and closed before it is read back.
with open('notebook.css', 'w') as css_out:
    css_out.write(css_file % css_values)

def nb():
    """Return an ``HTML`` display object that wraps the contents of ``notebook.css`` in a ``<style>`` tag.

    The stylesheet is read from ``notebook.css`` in the current working
    directory; the file is closed as soon as it has been read.
    """
    with open("notebook.css", "r") as css_in:
        return HTML("<style>" + css_in.read() + "</style>")
# Last expression of the cell, so the returned HTML object is rendered and the style applied.
nb()

<img src="https://github.com/AILab-MLTools/LightAutoML/raw/master/imgs/LightAutoML_logo_big.png" alt="LightAutoML logo" style="width:70%;"/>

# LightAutoML baseline

Official LightAutoML github repository is [here](https://github.com/AILab-MLTools/LightAutoML). 

### If you like this notebook, do not forget to upvote it and to give the GitHub repo a ⭐️ - one click for you, great pleasure for us ☺️ 

In [None]:
# GitHub "star" button for the LightAutoML repository, embedded as an iframe served by ghbtns.com.
s = '<iframe src="https://ghbtns.com/github-btn.html?user=sb-ai-lab&repo=LightAutoML&type=star&count=true&size=large" frameborder="0" scrolling="0" width="170" height="30" title="LightAutoML GitHub"></iframe>'
# Last expression of the cell: the HTML object is rendered in the cell output.
HTML(s)

## This notebook is the updated copy of our [Tutorial_1 from the GIT repository](https://github.com/AILab-MLTools/LightAutoML/blob/master/examples/tutorials/Tutorial_1_basics.ipynb). Please check our [tutorials folder](https://github.com/AILab-MLTools/LightAutoML/blob/master/examples/tutorials) if you are interested in other examples of LightAutoML functionality.

## 0. Prerequisites

### 0.0. install LightAutoML

In [None]:
# %%capture
# !pip3 install -U lightautoml

# # QUICK WORKAROUND FOR PROBLEM WITH PANDAS
# !pip3 install -U pandas

### 0.1. Import libraries

Here we will import the libraries we use in this kernel:
- Standard python libraries for timing, working with OS etc.
- Essential python DS libraries like numpy, pandas, scikit-learn and torch (the last we will use in the next cell)
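- LightAutoML modules: `TabularAutoML` preset for AutoML model creation and `Task` class to set up what kind of ML problem we solve (binary/multiclass classification or regression); these imports are currently commented out in this version of the notebook
- Keras (TensorFlow) layers, optimizer, loss and callbacks used to build and train the neural networks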

In [None]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import torch

# LightAutoML presets, task and report generation
# from lightautoml.automl.presets.tabular_presets import TabularAutoML
# from lightautoml.tasks import Task

# For NN training
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.utils import plot_model

### 0.2. Constants

Here we set up the constants used in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `TIMEOUT` - limit in seconds for model to train

In [None]:
N_THREADS = 4  # passed to torch.set_num_threads in the setup cell
N_FOLDS = 5  # n_splits of the KFold used in the training loop
RANDOM_STATE = 42  # numpy seed and base seed of the KFold shuffles
TIMEOUT = 1800  # seconds; only referenced by the commented-out TabularAutoML cell

### 0.3. Imported models setup

For better reproducibility, fix the numpy random seed and limit the number of threads Torch may use (by default it usually tries to use all the threads on the server):

In [None]:
# Seed every random number generator the notebook relies on.
np.random.seed(RANDOM_STATE)
# The networks below are Keras/TensorFlow models, so TensorFlow's global seed
# has to be fixed as well; the numpy seed alone does not cover it.
tf.random.set_seed(RANDOM_STATE)
# Cap the number of threads Torch may use.
torch.set_num_threads(N_THREADS)

### 0.4. Data loading
Let's check the data we have:

In [None]:
# Directory that data.csv and sample_submission.csv are read from.
# NOTE(review): relative path that looks like the Kaggle input mount - adjust when running elsewhere.
INPUT_DIR = '../input/tabular-playground-series-jun-2022/'

In [None]:
# Column names used for the dtype mapping: `row_id` followed by four feature
# groups named F_<group>_<index>, with indices starting at 0.
_feature_groups = [('F_1', 15), ('F_2', 25), ('F_3', 25), ('F_4', 15)]
columns = ['row_id'] + [
    '{}_{}'.format(group, idx)
    for group, size in _feature_groups
    for idx in range(size)
]

# `row_id` and the F_2 group are read as int32, every other column as float32.
types_mapper = {
    name: 'int32' if (name == 'row_id' or name.startswith('F_2')) else 'float32'
    for name in columns
}

In [None]:
# Read the full data table with the compact dtypes declared in `types_mapper`,
# then report its size and preview the first rows.
data = pd.read_csv(os.path.join(INPUT_DIR, 'data.csv'), dtype=types_mapper)
print(data.shape)
data.head()

In [None]:
# Read the sample submission; its column names are reused later for the prediction frames.
submission = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))
print(submission.shape)
submission.head()

# LightAutoML training

In [None]:
# Collects one DataFrame of predictions per imputed column; the frames are
# concatenated into the submission at the end. The training loop appends to
# this list, so re-run this cell before re-running the loop to avoid duplicates.
FINAL_PREDS = []

In [None]:
# Every column with at least one missing value becomes an imputation target.
missing_per_column = data.isna().sum(axis=0)
cols_to_predict = missing_per_column.loc[missing_per_column > 0].index.values
N_targets = len(cols_to_predict)
cols_to_predict

In [None]:
# Names of the 25 F_2 columns, which the "hard" network feeds through embeddings.
CATs = ['F_2_%d' % idx for idx in range(25)]

def nn_model_hard(n_feats_num, n_feats_cat, n_cat_levels=20):
    """Build the two-input regression network used for the "hard" targets.

    Numeric features pass through a 512-unit dense layer; each categorical
    feature gets its own one-dimensional embedding. Both are concatenated and
    fed to a stack of dense layers (256 -> 128 -> 64 -> 1) in which every layer
    also sees the outputs of the layers before it.

    Args:
        n_feats_num: number of numeric input features.
        n_feats_cat: number of categorical input features.
        n_cat_levels: vocabulary size of every embedding (default 20, the value
            that used to be hard-coded). Categorical inputs are expected to be
            integer codes below this value.

    Returns:
        An uncompiled Keras ``Model`` taking ``[numeric, categorical]`` inputs
        and producing one linear output per row.
    """
    # `shape` must be a tuple: `(n)` is just the int `n`, which not every Keras
    # version accepts as a shape.
    inputs_num = Input(shape=(n_feats_num,))
    inputs_cat = Input(shape=(n_feats_cat,))

    activation = 'swish'
    hid_num = Dense(512, activation=activation)(inputs_num)

    # One embedding per categorical column; slicing column i yields a (batch,)
    # tensor, so each embedding contributes a (batch, 1) block.
    cat_layers = [hid_num]
    for i in range(n_feats_cat):
        cat_layers.append(Embedding(n_cat_levels, 1)(inputs_cat[:, i]))

    concated_hid = Concatenate()(cat_layers)
    hid2 = Dense(256, activation=activation)(concated_hid)
    # Later layers also receive the earlier representations.
    hid3 = Dense(128, activation=activation)(Concatenate()([concated_hid, hid2]))
    x = Dense(64, activation=activation)(Concatenate()([concated_hid, hid2, hid3]))
    x = Dense(1, activation='linear')(x)
    model = Model([inputs_num, inputs_cat], x)
    return model


def fit_model_hard(X_train, y_train, 
                  X_valid, y_valid, 
                  X_test, n_epochs = 50, batch_size = 4096):
    """Train the embedding-based network on one fold and predict validation and test rows.

    The columns listed in ``CATs`` are fed to the network unscaled as the
    categorical input; all remaining columns are standardised with a scaler
    fitted on the training fold only.

    Args:
        X_train, X_valid, X_test: DataFrames with identical columns, including
            every column in ``CATs``.
        y_train, y_valid: target values for the training and validation rows.
        n_epochs: maximum number of epochs (early stopping may end training sooner).
        batch_size: batch size for both training and prediction (default 4096).

    Returns:
        Tuple ``(y_valid_pred, y_test_pred)`` of 1-D prediction arrays.
    """
    not_cats = [col for col in X_train.columns if col not in CATs]
    X_tr_cat = X_train[CATs]
    X_va_cat = X_valid[CATs]
    X_te_cat = X_test[CATs]

    # Fit the scaler on the training fold only, then reuse it for validation and test rows.
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_train[not_cats])
    X_va = scaler.transform(X_valid[not_cats])
    X_te = scaler.transform(X_test[not_cats])
    valid_data = ([X_va, X_va_cat], y_valid)

    lr = ReduceLROnPlateau(monitor = "val_loss", factor = 0.75, 
                           patience = 5, verbose = 0)
    # restore_best_weights: predictions come from the best epoch, not the last one.
    es = EarlyStopping(monitor = "val_loss", patience = 10, 
                       verbose = 1, restore_best_weights = True)
    model = nn_model_hard(X_tr.shape[1], X_tr_cat.shape[1])
    model.compile(optimizer = Adam(learning_rate = 0.01),
                  loss = MeanSquaredError())

    model.fit([X_tr, X_tr_cat], y_train, 
              validation_data = valid_data, 
              epochs = n_epochs,
              verbose = 0,
              batch_size = batch_size,
              shuffle = True,
              callbacks = [lr, es])

    # Predict with the training batch size: the default of 32 needs far more
    # steps on large frames. The (n, 1) output is flattened to shape (n,).
    y_valid_pred = model.predict([X_va, X_va_cat], batch_size = batch_size).ravel()
    y_test_pred = model.predict([X_te, X_te_cat], batch_size = batch_size).ravel()

    return y_valid_pred, y_test_pred

def nn_model(n_feats):
    """Build the plain feed-forward regression network used for the "easy" targets.

    Architecture: dense layers of 512 -> 256 -> 128 -> 64 units with swish
    activations, followed by a single linear output unit.

    Args:
        n_feats: number of input features.

    Returns:
        An uncompiled Keras ``Model`` mapping ``(batch, n_feats)`` to ``(batch, 1)``.
    """
    # `shape` must be a tuple: `(n_feats)` is just the int `n_feats`, which not
    # every Keras version accepts as a shape.
    inputs = Input(shape=(n_feats,))
    activation = 'swish'
    x = Dense(512, activation=activation)(inputs)
    #x = Dropout(0.1)(x)
    x = Dense(256, activation=activation)(x)
    x = Dense(128, activation=activation)(x)
    #x = Dropout(0.1)(x)
    x = Dense(64, activation=activation)(x)
    x = Dense(1, activation='linear')(x)
    model = Model(inputs, x)
    return model


def fit_model(X_train, y_train, 
              X_valid, y_valid, 
              X_test, n_epochs = 50, batch_size = 4096):
    """Train the plain feed-forward network on one fold and predict validation and test rows.

    All features are standardised with a scaler fitted on the training fold only.

    Args:
        X_train, X_valid, X_test: feature tables with identical columns.
        y_train, y_valid: target values for the training and validation rows.
        n_epochs: maximum number of epochs (early stopping may end training sooner).
        batch_size: batch size for both training and prediction (default 4096).

    Returns:
        Tuple ``(y_valid_pred, y_test_pred)`` of 1-D prediction arrays.
    """
    # Fit the scaler on the training fold only, then reuse it for validation and test rows.
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_train)
    X_va = scaler.transform(X_valid)
    X_te = scaler.transform(X_test)
    valid_data = (X_va, y_valid)

    lr = ReduceLROnPlateau(monitor = "val_loss", factor = 0.75, 
                           patience = 5, verbose = 0)
    # restore_best_weights: predictions come from the best epoch, not the last one.
    es = EarlyStopping(monitor = "val_loss", patience = 10, 
                       verbose = 1, restore_best_weights = True)
    model = nn_model(X_tr.shape[1])
    model.compile(optimizer = Adam(learning_rate = 0.01),
                  loss = MeanSquaredError())

    model.fit(X_tr, y_train, 
              validation_data = valid_data, 
              epochs = n_epochs,
              verbose = 0,
              batch_size = batch_size,
              shuffle = True,
              callbacks = [lr, es])

    # Predict with the training batch size: the default of 32 needs far more
    # steps on large frames. The (n, 1) output is flattened to shape (n,).
    y_valid_pred = model.predict(X_va, batch_size = batch_size).ravel()
    y_test_pred = model.predict(X_te, batch_size = batch_size).ravel()

    return y_valid_pred, y_test_pred

In [None]:
%%time
import gc

# Targets trained with the embedding-based network (`fit_model_hard`); every
# other target uses the plain network (`fit_model`).
hard_targets = ['F_1_7', 'F_1_12', 'F_1_13', 'F_3_19', 'F_3_21'] + ['F_4_'+str(i) for i in range(15)]

# Number of repeated K-fold runs (each with a different shuffle seed) averaged per target.
N_START = 2


def rmse(y_true, y_pred):
    """Root mean squared error.

    Computed as sqrt(MSE) rather than with `mean_squared_error(squared=False)`,
    because the `squared` argument is deprecated in recent scikit-learn releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# The target counter has its own name so the inner repeat loop (`it`) no longer shadows it.
for target_no, TARGET_NAME in enumerate(cols_to_predict, start = 1):
    print('='*50)
    print('START COLUMN {}/{} {}'.format(target_no, N_targets, TARGET_NAME))
    # Rows where the target is known are used for training; rows where it is missing get predicted.
    target_is_missing = data[TARGET_NAME].isna()
    train_data = data[~target_is_missing]
    test_data = data[target_is_missing]
    print(train_data.shape, test_data.shape)

    features = [x for x in train_data.columns if x not in ['row_id', TARGET_NAME]]
    NN_OOF_PRED = np.zeros(len(train_data))
    NN_TEST_PRED = np.zeros(len(test_data))

    y = train_data[TARGET_NAME].values
    # Missing feature values on both sides are replaced by the training-row
    # column means, computed once and only over the feature columns.
    feature_means = train_data[features].mean()
    train_data = train_data[features].fillna(feature_means)
    X_test = test_data[features].fillna(feature_means)
    print(N_START)
    T0 = time.time()
    for it in range(N_START):
        T = time.time()
        skf = KFold(n_splits = N_FOLDS, random_state = RANDOM_STATE + it, shuffle = True)
        for fold, (train_idx, valid_idx) in enumerate(skf.split(y, y)):
            X_train = train_data.iloc[train_idx, :]
            X_valid = train_data.iloc[valid_idx, :]
            y_train = y[train_idx]
            y_valid = y[valid_idx]

            if TARGET_NAME in hard_targets:
                print('TRAIN HARD')
                val_pred, test_pred = fit_model_hard(X_train, y_train, 
                                                    X_valid, y_valid, 
                                                    X_test, 50)
            else:
                print('TRAIN EASY')
                val_pred, test_pred = fit_model(X_train, y_train, 
                                                X_valid, y_valid, 
                                                X_test, 50)

            print('ITER = {} FOLD {} score {:.5f}'.format(it, fold, rmse(y_valid, val_pred)))

            # Each row is out-of-fold once per repeat, so OOF predictions are averaged
            # over repeats; test predictions are averaged over folds and repeats.
            NN_OOF_PRED[valid_idx] += val_pred / N_START
            NN_TEST_PRED += test_pred / N_FOLDS / N_START
            del X_train, X_valid
            gc.collect()
        # Rescale the partial sum so the running score reflects the average of the repeats done so far.
        print('AFTER ITER {} NN OOF score {:.5f}, time {:.2f}'.format(it, 
                                                                  rmse(y, NN_OOF_PRED / (it + 1) * N_START), 
                                                                  time.time() - T))

    print('NN OOF score {:.5f}, time {:.2f}'.format(rmse(y, NN_OOF_PRED), time.time() - T0))
    # Submission ids have the form "<row_id>-<column name>".
    FINAL_PREDS.append(pd.DataFrame([[str(idx) + '-' + TARGET_NAME, val] for idx,val in zip(test_data['row_id'].values, NN_TEST_PRED)], 
                                    columns = submission.columns))

In [None]:
# train_data = data[~data[TARGET_NAME].isna()]
# test_data = data[data[TARGET_NAME].isna()]
# train_data['NN_preds'] = NN_OOF_PRED
# test_data['NN_preds'] = NN_TEST_PRED

In [None]:
# task = Task('reg',)

# roles = {
#     'target': TARGET_NAME,
#     'drop': ['row_id']
# }

# automl = TabularAutoML(
#     task = task, 
#     timeout = TIMEOUT,
#     cpu_limit = N_THREADS,
#     general_params = {'use_algos': [['cb']]},
#     reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
#     selection_params = {'mode': 0}
# )

# oof_pred = automl.fit_predict(train_data.sample(100000), roles = roles, verbose = 3)

# Create submission

We are now ready for submission file creation:

In [None]:
# Stack the per-column prediction frames. `ignore_index` gives the result a
# clean 0..n-1 index; otherwise every frame's own 0-based index is repeated.
submission = pd.concat(FINAL_PREDS, ignore_index = True)
submission.to_csv('nn_imputer.csv', index = False)
submission

# Additional materials

- [Official LightAutoML github repo](https://github.com/AILab-MLTools/LightAutoML)
- [LightAutoML documentation](https://lightautoml.readthedocs.io/en/latest)
- [LightAutoML tutorials](https://github.com/AILab-MLTools/LightAutoML/tree/master/examples/tutorials)
- LightAutoML course:
    - [Part 1 - general overview](https://ods.ai/tracks/automl-course-part1) 
    - [Part 2 - LightAutoML specific applications](https://ods.ai/tracks/automl-course-part2)
    - [Part 3 - LightAutoML customization](https://ods.ai/tracks/automl-course-part3)
- [OpenDataScience AutoML benchmark leaderboard](https://ods.ai/competitions/automl-benchmark/leaderboard)

### If you still like the notebook, do not forget to upvote it and to give the GitHub repo a ⭐️ using the button below - one click for you, great pleasure for us ☺️

In [None]:
# GitHub "star" button for the LightAutoML repository, embedded as an iframe served by ghbtns.com.
s = '<iframe src="https://ghbtns.com/github-btn.html?user=AILab-MLTools&repo=LightAutoML&type=star&count=true&size=large" frameborder="0" scrolling="0" width="170" height="30" title="LightAutoML GitHub"></iframe>'
# Last expression of the cell: the HTML object is rendered in the cell output.
HTML(s)