In [None]:
# Import from the public IPython.display API: importing `display` from
# IPython.core.display is deprecated and warns (or fails) on recent IPython.
from IPython.display import display, HTML, Javascript

# ----- Notebook Theme -----
# Palette used by the notebook CSS below, one hex color per entry.
color_map = [
    '#16a085',
    '#e8f6f3',
    '#d0ece7',
    '#a2d9ce',
    '#73c6b6',
    '#45b39d',
    '#16a085',
    '#138d75',
    '#117a65',
    '#0e6655',
    '#0b5345',
]

# The first two palette entries drive the theme; the last one colors the input prompt.
main_color, strong_main_color = color_map[:2]
prompt = color_map[len(color_map) - 1]
custom_colors = [strong_main_color, main_color]

# CSS template for the notebook theme. It contains 11 positional `%s`
# placeholders that are filled (in order of appearance) when the template is
# written to notebook.css further down in this cell.
# NOTE(review): the /* ... */ color names inside the CSS (orange, pink, blue)
# look like leftovers from another palette - the actual colors are whatever is
# substituted for the placeholders.
css_file = ''' 

div #notebook {
background-color: white;
line-height: 20px;
}

#notebook-container {
%s
margin-top: 2em;
padding-top: 2em;
border-top: 4px solid %s; /* light orange */
-webkit-box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
    box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
}

div .input {
margin-bottom: 1em;
}

.rendered_html h1, .rendered_html h2, .rendered_html h3, .rendered_html h4, .rendered_html h5, .rendered_html h6 {
color: %s; /* light orange */
font-weight: 600;
}

div.input_area {
border: none;
    background-color: %s; /* rgba(229, 143, 101, 0.1); light orange [exactly #E58F65] */
    border-top: 2px solid %s; /* light orange */
}

div.input_prompt {
color: %s; /* light blue */
}

div.output_prompt {
color: %s; /* strong orange */
}

div.cell.selected:before, div.cell.selected.jupyter-soft-selected:before {
background: %s; /* light orange */
}

div.cell.selected, div.cell.selected.jupyter-soft-selected {
    border-color: %s; /* light orange */
}

.edit_mode div.cell.selected:before {
background: %s; /* light orange */
}

.edit_mode div.cell.selected {
border-color: %s; /* light orange */

}
'''
def to_rgb(h):
    """Convert a 6-digit hex color string to an ``(r, g, b)`` tuple of ints.

    Args:
        h: Hex color such as ``'16a085'``. A single leading ``'#'``
            (``'#16a085'``) is accepted and ignored.

    Returns:
        Tuple of three ints in the range 0-255.

    Raises:
        ValueError: If any of the three two-character channels is not valid hex.
    """
    digits = h[1:] if h.startswith('#') else h
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))

# Main color with alpha 0.1, used as the input-area background.
main_color_rgba = 'rgba(%s, %s, %s, 0.1)' % (to_rgb(main_color[1:]))

# Values for the positional %s placeholders of css_file, in order of appearance.
css_values = (
    'width: 95%;',     # extra rule inside #notebook-container
    main_color,        # #notebook-container border-top
    main_color,        # heading color
    main_color_rgba,   # input area background
    main_color,        # input area border-top
    main_color,        # input prompt
    prompt,            # output prompt
    main_color,        # selected cell marker
    main_color,        # selected cell border
    main_color,        # edit-mode selected cell marker
    main_color,        # edit-mode selected cell border
)

# Use a context manager so the file is flushed and closed before it is read back.
with open('notebook.css', 'w') as css_out:
    css_out.write(css_file % css_values)

def nb():
    """Return an ``HTML`` object that injects notebook.css into the page.

    Reads ``notebook.css`` from the current working directory and wraps its
    content in a ``<style>`` tag.

    Raises:
        OSError: If ``notebook.css`` cannot be opened.
    """
    # Context manager closes the file handle instead of leaking it.
    with open("notebook.css", "r") as css_in:
        return HTML("<style>" + css_in.read() + "</style>")
nb()

<img src="https://github.com/AILab-MLTools/LightAutoML/raw/master/imgs/LightAutoML_logo_big.png" alt="LightAutoML logo" style="width:70%;"/>

# LightAutoML baseline

The official LightAutoML GitHub repository is [here](https://github.com/AILab-MLTools/LightAutoML).

### If you like this notebook, don't forget to upvote it and give the GitHub repo a ⭐️ - one click for you, great pleasure for us ☺️

In [None]:
# GitHub "star" button for the LightAutoML repository, embedded as an iframe.
s = (
    '<iframe src="https://ghbtns.com/github-btn.html?user=sb-ai-lab&repo=LightAutoML&type=star&count=true&size=large" '
    'frameborder="0" scrolling="0" width="170" height="30" title="LightAutoML GitHub"></iframe>'
)
HTML(s)

## 0. Prerequisites

### 0.0. install LightAutoML

In [None]:
%%capture
# Dont forget to add dataset "lama0216nn"
# Developers version with tabular neural networks available
!pip install ../input/lama0216nn/LightAutoML-0.2.16-py3-none-any.whl
!pip install pyyaml

### 0.1. Import libraries

Here we will import the libraries we use in this kernel:
- Standard python libraries for timing, working with OS etc.
- Essential python DS libraries like numpy, pandas, scikit-learn and torch (the last we will use in the next cell)
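- LightAutoML modules: the `Task` class to set up what kind of ML problem we solve (binary/multiclass classification or regression), plus the building blocks we assemble by hand instead of using a preset - the reader, the folds iterator, the torch features pipeline, `MLPipeline` and `TorchModel`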

In [None]:
# Standard python libraries
import os
import time
import site

# Essential DS libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import yaml
from sklearn.metrics import mean_squared_error
from torch.optim.lr_scheduler import ReduceLROnPlateau

# LightAutoML task, features and model
from lightautoml.tasks import Task
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.validation.np_iterators import FoldsIterator

from lightautoml.pipelines.features.torch_pipeline import TorchBaseFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.ml_algo.dl_model import TorchModel

# Logging
import logging
from lightautoml.utils.logging import set_stdout_level
from lightautoml.utils.logging import verbosity_to_loglevel


### 0.2. Constants

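Here we set up the constants used in the kernel:
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `N_EPOCHS` - number of nn epochs (reassigned together with the network params in section 0.3)
- `MODEL_NAME` - model name
- `device` - torch device the network is trained on
- `tabular_params` - tabular preset config loaded from the installed LightAutoML package (we use its reader params)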

In [None]:
N_FOLDS = 5
RANDOM_STATE = 42
N_EPOCHS = 50  # NOTE: reassigned in the network-params cell of section 0.3
MODEL_NAME = "dense"

# Train on the GPU when one is visible; fall back to the CPU instead of
# failing later on a session without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tabular preset config shipped inside the installed LightAutoML package.
tabular_config_path = os.path.join(site.getsitepackages()[0],
                                   'lightautoml', 'automl', 'presets', 'tabular_config.yml')
# Context manager closes the config file after parsing.
# FullLoader is kept as in the original; the file comes from the installed package, not from user input.
with open(tabular_config_path) as tabular_config_file:
    tabular_params = yaml.load(tabular_config_file, Loader=yaml.FullLoader)

### 0.3. Neural network params
#### Params description:

- `bs` - batch_size  
- `hidden_size_base` - hidden size dim used in all layers  
- `snap_params`:
  - `k` - number of best checkpoints to average
  - `swa` - stochastic weight averaging
  
  
Below we have 2 sets of params:
- The first one (commented out) is for the full run of the network inside LAMA; its calculation time is ~ 2 days
- The second one is a simplified version of the params that fits into the Kaggle kernel time limit

In [None]:
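# Full run, ~ 2 days of calculation (kept commented out; the simplified params in the next cell are used instead)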

# N_EPOCHS = 500

# snap_params_config_1 = {'k': 3, 'early_stopping': True, 'patience': 16, 'swa': True}
# torch_model_params = {'num_workers': 1,
#                       'verbose': 1,
#                       'optimization_search_space': None,
#                       'path_to_save': None}
# base_def_params = {
#     'num_init_features': 512,
#     'snap_params': snap_params_config_1,
#     'use_noise': False,
#     'use_bn': True,
#     'bs': 2048,
#     'emb_dropout': 0.1,
#     'emb_ratio': 3,
#     'opt': torch.optim.Adam,
#     'opt_params': {
#         'weight_decay': 0,
#         'lr': 0.01
#     },
#     'sch': ReduceLROnPlateau,
#     'scheduler_params': {
#         'patience': 4,
#         'factor': 0.7,
#         'min_lr': 1e-5
#     },
#     'act_fun': nn.SiLU,
#     'drop_rate_base': 0.1,
#     'init_bias': True
# }

# model_to_default_params = {
#     'dense': {
#         'num_blocks': 2,
#         'block_size_base': 2,
#         'growth_size': 256,
#         'bn_factor': 2,
#         'compression': 0.5,
#         'efficient': True,
#         **base_def_params
#     }
# }

# nn_names = ['dense']

# name_to_model_params = {}
# for nn_name in nn_names:
#     name_to_model_params[nn_name] = {**torch_model_params, **model_to_default_params[nn_name],
#                                      'model': nn_name, 'n_epochs': N_EPOCHS, 'device': device}

In [None]:
N_EPOCHS = 40

# Snapshot settings: `k` best checkpoints are averaged, `swa` toggles stochastic weight averaging.
snap_params_config_1 = dict(k=3, early_stopping=True, patience=5, swa=True)

# Options merged into every model config below.
torch_model_params = dict(
    num_workers=2,
    verbose=None,
    optimization_search_space=None,
    path_to_save=None,
)

# Architecture-independent network defaults (`bs` is the batch size).
base_def_params = dict(
    num_init_features=512,
    snap_params=snap_params_config_1,
    use_noise=False,
    use_bn=True,
    bs=4096,
    emb_dropout=0.0,
    emb_ratio=3,
    opt=torch.optim.Adam,
    opt_params=dict(weight_decay=0, lr=0.01),
    sch=ReduceLROnPlateau,
    scheduler_params=dict(patience=3, factor=0.7, min_lr=1e-5),
    act_fun=nn.SiLU,
    drop_rate_base=0.0,
    init_bias=True,
)

# Per-architecture defaults: architecture-specific keys first, then the shared ones.
model_to_default_params = dict(
    dense=dict(
        num_blocks=2,
        block_size_base=2,
        growth_size=256,
        bn_factor=2,
        compression=0.5,
        efficient=True,
        **base_def_params,
    ),
)

nn_names = ['dense']

# Final config per network name: torch options, then architecture defaults,
# then the run-specific entries (later values win on key clashes).
name_to_model_params = {}
for nn_name in nn_names:
    merged_params = dict(torch_model_params)
    merged_params.update(model_to_default_params[nn_name])
    merged_params.update(model=nn_name, n_epochs=N_EPOCHS, device=device)
    name_to_model_params[nn_name] = merged_params

### 0.4. Set logging format

In [None]:
# Configure stdout logging for verbosity 1 and report the resulting level.
logger = logging.getLogger()
level = verbosity_to_loglevel(1)
set_stdout_level(level)
# logging.getLevelName is the public replacement for the private logging._levelToName mapping.
logger.info(f"Stdout logging level is {logging.getLevelName(level)}.")

### 0.5. Data loading
Let's check the data we have:

In [None]:
# Directory the competition files (data.csv, sample_submission.csv) are read from.
INPUT_DIR = '../input/tabular-playground-series-jun-2022/'

In [None]:
# Load the competition table and show its size plus the first rows.
data = pd.read_csv(f'{INPUT_DIR}data.csv')
print(data.shape)
data.head()

In [None]:
# Load the sample submission; its column names are reused for the prediction frames.
submission = pd.read_csv(f'{INPUT_DIR}sample_submission.csv')
print(submission.shape)
submission.head()

# LightAutoML training

In [None]:
# Prediction frames (one per imputed column) appended by the cells below and
# concatenated into the submission at the end.
# NOTE: re-run this cell before re-running the filling cells, otherwise
# their results are appended a second time.
FINAL_PREDS = []

In [None]:
# Group the column names by prefix: F_1_*/F_3_* together, F_4_* on its own.
F1_3_features = [name for name in data.columns if name.startswith(('F_1_', 'F_3_'))]
F4_features = [name for name in data.columns if name.startswith('F_4_')]

In [None]:
# Impute every missing F_1_*/F_3_* value with the mean of its column.
id_col, value_col = submission.columns
for col in F1_3_features:
    print('Filling column = {}'.format(col))
    mean_val = data[col].mean()
    # Build the ids from the `row_id` column (as the F_4_* cell does) rather than
    # from the positional index, so both cells produce ids the same way.
    missing_row_ids = data.loc[data[col].isnull(), 'row_id']

    # Vectorised "<row_id>-<column>" ids; to_numpy() drops the original index so
    # the frame gets a fresh 0..k-1 index, like a frame built from a list of rows.
    FINAL_PREDS.append(pd.DataFrame({
        id_col: (missing_row_ids.astype(str) + '-' + col).to_numpy(),
        value_col: mean_val,
    }))

#### In the cell below we use LightAutoML as the constructor for the ML pipelines (you can create your own blocks and add them to pipeline as well):

In [None]:
# For every F_4_* column: train a network on the rows where the column is known
# (using the F_4_* columns as the data) and predict it for the rows where it is missing.
N_targets = len(F4_features)
model_columns = F4_features + ['row_id']
for it, TARGET_NAME in enumerate(F4_features):
    print('='*50)
    print('START COLUMN {}/{} {}'.format(it+1, N_targets, TARGET_NAME))
    # Compute the missing-target mask once and split the rows with it.
    target_is_missing = data[TARGET_NAME].isna()
    train_data = data.loc[~target_is_missing, model_columns]
    test_data = data.loc[target_is_missing, model_columns]
    print(train_data.shape, test_data.shape)

    # Task setup
    task = Task('reg')

    # Feature roles
    roles = {
        'target': TARGET_NAME,
        'drop': ['row_id']
    }

    # Pipeline creation
    reader_params = {**tabular_params['reader_params'],
                     "random_state": RANDOM_STATE,
                     "cv": N_FOLDS}
    reader = PandasToPandasReader(task, **reader_params)
    train_ds = reader.fit_read(train_data, roles=roles)
    test_ds = reader.read(test_data)

    pipe = TorchBaseFeatures()
    iterator = FoldsIterator(train=train_ds)

    model = TorchModel(name_to_model_params[MODEL_NAME])
    ml_pipe = MLPipeline([
        model
    ], pre_selection=None, features_pipeline=pipe, post_selection=None)

    # Fit_predict using the created pipeline
    oof_pred = ml_pipe.fit_predict(iterator)

    # Results calculation
    oof_values = oof_pred.data[:, 0]
    mask = ~np.isnan(oof_values)
    # Sanity check: prints True when every training row received an out-of-fold prediction.
    print(mask.sum() == len(train_data))
    # RMSE as sqrt(MSE): the `squared=False` argument is not available in newer scikit-learn releases.
    oof_rmse = np.sqrt(mean_squared_error(train_data[TARGET_NAME].values[mask], oof_values[mask]))
    print(f'{it+1}/{N_targets} {TARGET_NAME} TRAIN out-of-fold score: {oof_rmse}')
    
    # Test prediction using the trained pipeline
    test_pred = ml_pipe.predict(test_ds)
    # One "<row_id>-<column>" id per imputed cell, paired with its prediction.
    FINAL_PREDS.append(pd.DataFrame([[str(idx) + '-' + TARGET_NAME, val] for idx,val in zip(test_data['row_id'].values, test_pred.data[:, 0])], 
                                    columns = submission.columns))

# Create submission

We are now ready for submission file creation:

In [None]:
# Stack the per-column prediction frames. ignore_index=True gives the result a
# clean 0..n-1 index instead of repeating each frame's own 0..k-1 labels
# (the CSV is unaffected because it is written without the index).
submission = pd.concat(FINAL_PREDS, ignore_index=True)
submission.to_csv('lightautoml_densenet_imputer.csv', index = False)
submission

# Additional materials

- [Official LightAutoML github repo](https://github.com/AILab-MLTools/LightAutoML)
- [LightAutoML documentation](https://lightautoml.readthedocs.io/en/latest)
- [LightAutoML tutorials](https://github.com/AILab-MLTools/LightAutoML/tree/master/examples/tutorials)
- LightAutoML course:
    - [Part 1 - general overview](https://ods.ai/tracks/automl-course-part1) 
    - [Part 2 - LightAutoML specific applications](https://ods.ai/tracks/automl-course-part2)
    - [Part 3 - LightAutoML customization](https://ods.ai/tracks/automl-course-part3)
- [OpenDataScience AutoML benchmark leaderboard](https://ods.ai/competitions/automl-benchmark/leaderboard)

### If you still like the notebook, don't forget to upvote it and give the GitHub repo a ⭐️ using the button below - one click for you, great pleasure for us ☺️

In [None]:
# GitHub "star" button for the LightAutoML repository, embedded as an iframe.
s = (
    '<iframe src="https://ghbtns.com/github-btn.html?user=sb-ai-lab&repo=LightAutoML&type=star&count=true&size=large" '
    'frameborder="0" scrolling="0" width="170" height="30" title="LightAutoML GitHub"></iframe>'
)
HTML(s)