# Step 0. Install LAMA

In [None]:
pip install lightautoml

In [None]:
pip install transformers -U

In [None]:
pip install navec

# Step 0.1. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt
from navec import Navec

# Imports from our package
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task


# Step 0.2. Parameters 

In [None]:
N_THREADS = 4 # CPU threads: used for torch.set_num_threads and as cpu_limit of the AutoML presets
RANDOM_STATE = 42 # seed shared by numpy, train_test_split and DataFrame.sample
TEST_SIZE = 0.2 # hold-out fraction, used for both the test split and the validation split
TIMEOUT = 9000 # time budget in seconds, passed as timeout to each AutoML preset
TARGET_NAME = 'sentiment' # target column name

# Step 0.3. Fix torch number of threads and numpy seed 

In [None]:
# Seed NumPy and PyTorch alike: the LSTM/BERT text models used further down are
# torch-based, so the NumPy seed alone does not make their randomness repeatable.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
# Cap the number of CPU threads torch may use.
torch.set_num_threads(N_THREADS)

# Step 0.4. Example data load 

Dataset from https://github.com/sismetanin/rureviews

In [None]:
%%time

# Tab-separated reviews file; the path is relative to the notebook's working directory.
reviews_path = '../input/lama-datasets/rureviews.csv'
data = pd.read_csv(reviews_path, sep='\t')

# Preview the first rows.
data.head()

# Step 0.5. Some user feature preparation 

The cells below only inspect the data — the class balance of the target and the distribution of review lengths in words — and leave `data` unchanged, so this block can be omitted:

In [None]:
# Class balance of the target; uses TARGET_NAME so the column name is defined in one place.
data[TARGET_NAME].value_counts()

In [None]:
# Review length in space-separated tokens. `.str.len()` leaves a missing review as NaN
# (dropped by `hist`) instead of raising a TypeError the way `apply(len)` does on NaN.
review_lengths = data['review'].str.split(' ').str.len()

ax = review_lengths.hist(bins=100)
ax.set(title='Review length distribution',
       xlabel='Tokens per review',
       ylabel='Number of reviews')
plt.show()

# Step 0.6. (Optional) Data splitting for train-test 

The block below can be omitted if you only want to train the model or you already have separate train and test files:

In [None]:
%%time

# Stratified hold-out: split on the target column so both parts are stratified by class.
train_data, test_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE,
)

# Subsample the training part to at most 25 000 rows (presumably to bound training time).
# Capping at len(train_data) avoids the ValueError that `sample(n=...)` raises when asked
# for more rows than are available.
train_sample_size = min(25_000, len(train_data))
train_data = train_data.sample(n=train_sample_size, random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: train_data = {}, test_data = {}'
      .format(train_data.shape, test_data.shape))

In [None]:
# Carve a stratified validation part out of the (subsampled) training data.
train, valid = train_test_split(
    train_data,
    test_size=TEST_SIZE,
    stratify=train_data[TARGET_NAME],
    random_state=RANDOM_STATE,
)

split_report = 'Data splitted. Parts sizes: train = {}, valid = {}'
print(split_report.format(train.shape, valid.shape))

# Step 0.7. (Optional) Load RU text embeddings

In [None]:
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

In [None]:
# Archive fetched by the `wget` cell above (saved under its original file name).
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
# Load the Navec embeddings; the object is later passed to the preset as `embedding_model`.
navec = Navec.load(path)

#  ==== AutoML preset usage ====


## Step 1. Create Task

In [None]:
def f1_macro(y_true, y_pred):
    """Macro-averaged F1 computed from per-class scores.

    Args:
        y_true: True class labels, one per sample.
        y_pred: Array of per-class scores with one row per sample; the
            predicted label of a row is the column index of its maximum.

    Returns:
        The result of ``f1_score(..., average='macro')`` on the true labels
        and the arg-max predictions.
    """
    predicted_labels = np.argmax(y_pred, axis=1)
    return f1_score(y_true, predicted_labels, average='macro')

In [None]:
%%time

# Multiclass task scored with the custom `f1_macro` metric defined above.
task = Task('multiclass', metric=f1_macro)

## Step 2. Setup columns roles

In [None]:
%%time

# Column roles: the target column and the list of raw-text columns.
roles = {
    'target': TARGET_NAME,
    'text': ['review'],
}

## Step 3. Create AutoML from preset

To create the AutoML model, we use the `TabularNLPAutoML` preset here.


All the parameters set above can be passed to the preset to change its configuration:

In [None]:
%%time
      
start = time.time()

# Preset with two algorithms ('linear_l2' and 'lgb') and nested CV switched off.
automl = TabularNLPAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={
        'nested_cv': False,
        'use_algos': [['linear_l2', 'lgb']],
    },
    # Text features per pipeline: TF-IDF for the linear one, embeddings for the GBM one.
    linear_pipeline_params={'text_features': "tfidf"},
    gbm_pipeline_params={'text_features': 'embed'},
    text_params={
        'lang': 'ru',
        'bert_model': 'DeepPavlov/rubert-base-cased-conversational',
    },
    # Embedding extractor settings: 'random_lstm' with the Navec vectors loaded earlier.
    autonlp_params={
        'model_name': 'random_lstm',
        'embedding_model': navec,
        'transformer_params': {
            'dataset_params': {
                'max_length': 150,
                'embed_size': 300,
            },
        },
    },
    tfidf_params={
        'svd': True,
        'tfidf_params': {'ngram_range': (1, 1)},
    },
)

# Fit on `train`, using `valid` as the explicit validation set.
oof_pred = automl.fit_predict(train, valid_data=valid, roles=roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))
time_automl = time.time() - start

## Step 4. Predict to test data and check scores

In [None]:
%%time

test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

print('Check scores...')

# Map the raw labels through the reader's class mapping before scoring
# (presumably so they match the column order of the predicted scores).
valid_labels = valid[TARGET_NAME].map(automl.reader.class_mapping).values
print('VALID score: {}'.format(f1_macro(valid_labels, oof_pred.data)))

test_labels = test_data[TARGET_NAME].map(automl.reader.class_mapping).values
test_automl = f1_macro(test_labels, test_pred.data)
print('TEST score: {}'.format(test_automl))


## Step 5. Same Preset with Bert.

In [None]:
%%time 
start = time.time()

# Preset restricted to the single 'nn' algorithm, nested CV switched off.
automl = TabularNLPAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={
        'nested_cv': False,
        'use_algos': [['nn']],
    },
    text_params={
        'lang': 'ru',
        'bert_model': 'DeepPavlov/rubert-base-cased-conversational',
    },
    nn_params={
        'opt_params': {'lr': 1e-5},
        'max_length': 150,
        'bs': 32,
        'epoch': 1,
    },
)

# Fit on `train`, using `valid` as the explicit validation set.
oof_pred = automl.fit_predict(train, valid_data=valid, roles=roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))
time_automl_sbert = time.time() - start

In [None]:
# Show the `levels` attribute of the preset fitted in the previous cell.
automl.levels

In [None]:
%%time

# Score the held-out test part with the BERT-based preset.
test_pred = automl.predict(test_data)

prediction_report = 'Prediction for test data:\n{}\nShape = {}'
print(prediction_report.format(test_pred, test_pred.shape))

In [None]:
print('Check scores...')

# Map the raw labels through the reader's class mapping before scoring
# (presumably so they match the column order of the predicted scores).
valid_labels = valid[TARGET_NAME].map(automl.reader.class_mapping).values
print('VALID score: {}'.format(f1_macro(valid_labels, oof_pred.data)))

test_labels = test_data[TARGET_NAME].map(automl.reader.class_mapping).values
test_automl_sbert = f1_macro(test_labels, test_pred.data)
print('TEST score: {}'.format(test_automl_sbert))