### Import libraries and load dataset

In [None]:
# import usual libraries
import time
import os
import gc
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch
import transformers

# Only let transformers emit log messages at error level.
transformers.logging.set_verbosity_error()
# Switch off parallelism inside the HuggingFace tokenizers library.
# NOTE(review): presumably to avoid fork-related warnings/deadlocks - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Device tag handed to Task(...) when the task is defined further down.
device = 'gpu'

import cudf

# import lightautoml
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.automl.presets.gpu.text_gpu_presets import TabularNLPAutoMLGPU
from lightautoml.tasks import Task
from lightautoml.dataset.utils import roles_parser

In [None]:
# Notebook-wide settings.
TARGET_NAME = 'is_good'  # label column
TEST_SIZE = 0.2          # share of rows held out for testing
RANDOM_STATE = 42        # seed for the train/test split
N_FOLDS = 5
N_THREADS = 4
TIMEOUT = 300

# Limit torch CPU threads, then release any cached CUDA memory.
torch.set_num_threads(N_THREADS)
torch.cuda.empty_cache()

In [None]:
# load bankiru dataset
DATASET_FULLNAME = 'data/bankiru_isgood.csv'

# Only the first 1000 samples are used to keep the run short (for a detailed
# check, use a larger number: 100k-500k).
# `usecols`/`nrows` make pandas parse just the needed columns and rows instead
# of loading the whole file and slicing afterwards; the explicit column
# selection keeps the original column order.
use_columns = ["message", "title", TARGET_NAME]
data = pd.read_csv(DATASET_FULLNAME, usecols=use_columns, nrows=1000)[use_columns].fillna("")

In [None]:
# split data (stratified on the target so both parts keep the class balance)
tr_data, te_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE,
)
print(data.head())

# Give both parts a fresh 0..n-1 index while keeping the rows chosen by the
# split. Rebuilding the frames from `data` with a new index (as was done
# before) re-selected the first rows of `data`, which discarded the split and
# made the test part a subset of the train part.
tr_data = tr_data.reset_index(drop=True)
te_data = te_data.reset_index(drop=True)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

In [None]:
# Binary classification task bound to the configured device.
task = Task('binary', device=device)

# Column roles: two free-text feature columns plus the label column.
roles = dict(text=['message', 'title'], target=TARGET_NAME)
print(roles_parser(roles))

In [None]:
def run_automl(automl, tr_data, te_data):
    """Fit ``automl`` on the train part, predict the test part and print results.

    Args:
        automl: preset object exposing ``fit_predict`` and ``predict``.
        tr_data: training frame containing the ``TARGET_NAME`` column.
        te_data: hold-out frame containing the ``TARGET_NAME`` column.

    Prints the wall-clock time of both stages and the ROC AUC of the
    out-of-fold and test predictions. Nothing is returned.
    """
    train_started = time.time()
    oof_pred = automl.fit_predict(tr_data, roles=roles, verbose=1)
    train_elapsed = time.time() - train_started
    print('Elapsed time (train): {}'.format(train_elapsed))

    test_started = time.time()
    te_pred = automl.predict(te_data)
    test_elapsed = time.time() - test_started
    print('Elapsed time (test): {}'.format(test_elapsed))

    # Score only rows that have at least one non-NaN out-of-fold prediction.
    has_prediction = np.any(~np.isnan(oof_pred.data), axis=1)
    oof_target = tr_data[TARGET_NAME].values[has_prediction]
    oof_score = roc_auc_score(oof_target, oof_pred.data[has_prediction][:, 0])
    print(f'OOF score: {oof_score}')

    test_score = roc_auc_score(te_data[TARGET_NAME].values, te_pred.data[:, 0])
    print(f'TEST score: {test_score}')

### linear_l2 model with different text features

#### tfidf text features

In [None]:
# tf-idf settings; later tf-idf based cells reuse these names.
n_components = 100
n_oversample = 0
ngram = (1, 1)

# Single linear_l2 model fed with tf-idf text features.
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    text_params=dict(lang='ru', verbose=False, use_stem=False),
    tfidf_params=dict(
        n_components=n_components,
        n_oversample=n_oversample,
        tfidf_params=dict(ngram_range=ngram),
    ),
    linear_pipeline_params=dict(text_features="tfidf"),
)

In [None]:
# Fit and evaluate the pipeline configured above (timings and scores are
# printed), then release cached CUDA memory and run the garbage collector
# before the next experiment.
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

#### tfidf_subword features

The following __text_params__ work only with __tfidf_subword__ text features:   
__vocab_path__ - path to vocabulary .txt file,  
__data_path__ - .txt file (saved pd.Series) for the tokenizer to be trained on (if vocab is not specified)  
__is_hash__ - True means vocab is not raw vocab but was transformed with hash_vocab function from cudf,  
__max_length__ - max number of tokens to leave in one text (exceeding ones would be truncated)  
__tokenizer__ - ["bpe" or "wordpiece"] if vocab is None. Type of tokenizer to be trained  
__vocab_size__ - vocabulary size for trained tokenizer  
__save_path__ - path where trained vocabulary would be saved to  

Overall, there are 3 possible scenarios to run tfidf_subword text features:  
1) __vocab_path__ is defined, __is_hash__ = True. It means that __vocab_path__ contains path to a hashed version of vocabulary. No additional transformation is needed. This is the optimal usage (all vocabulary pre-processing was done in advance).
2) __vocab_path__ is defined, __is_hash__ = False. __vocab_path__ contains path to a vocabulary with raw words, it needs to be transformed to a hash version. This is the second fastest option.
3) __vocab_path__ is not defined, __data_path__ is defined (with additional parameters __tokenizer__, __vocab_size__ and __save_path__). Only a .txt file of the text data is available. Note that this scenario works not with the dataframe itself but with its .txt version. One should be careful with the tokenizer settings: the recommended way is to study the dataset in advance, tweak the tokenizer settings and create the vocabulary outside of the LAMA pipeline. The quality of __tfidf_subword__ text features highly depends on the quality of the tokenizer used.

Prepare data for all scenarios. Imagine that only pd.Series of text data is available.

In [None]:
# Note: how to create a .txt file from a dataframe (only the text corpus is saved)
# This is an example, it is not necessary to run it

# Step 1. Choose your representative text data and save it to a .txt file.
# Here only one text column of the dataset is taken, but sometimes it might be
# a good idea to concatenate all text columns instead of choosing one.
data_text = data['message']
file_data_text = 'bankiru_isgood_test.txt'
# The corpus is Russian text, so the encoding is fixed to UTF-8 instead of
# relying on the platform default; one sample is written per line.
with open(file_data_text, 'w', encoding='utf-8') as f:
    f.writelines(f'{text}\n' for text in data_text)

In [None]:
# Note: how to use huggingface tokenizer to create vocabulary from .txt dataframe
# This is an example, it is not necessary to run it

# Step 2. Having a text data file, train token vocabulary.
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer, WordPieceTrainer

tokenizer_type = 'bpe'  # or 'wordpiece'
vocab_size = 30000
data_path = file_data_text  # path to the .txt dump of the text pd.Series
vocab_save_path = f"{tokenizer_type}_{vocab_size // 1000}k_test.txt"
special_tokens = ["[UNK]", "[SEP]", "[CLS]"]

# Build the tokenizer model and its matching trainer.
if tokenizer_type == "bpe":
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
else:
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

tokenizer.train([data_path], trainer)  # train the tokenizer on our .txt text data
trained_vocab = tokenizer.get_vocab()  # mapping token -> id

# Save the trained vocabulary to a .txt file, one token per line.
# Tokens are written in id order so the file is the same on every run, and the
# encoding is fixed to UTF-8 because the tokens are non-ASCII.
with open(vocab_save_path, 'w', encoding='utf-8') as f:
    f.writelines(f'{token}\n' for token in sorted(trained_vocab, key=trained_vocab.get))

In [None]:
# Note: how to create hash vocabulary from word .txt vocabulary
# This is an example, it is not necessary to run it

# Step 3. Having .txt vocabulary file, create a hashed version of it which would be used by 
# cudf.SubwordTokenizer
from cudf.utils.hash_vocab_utils import hash_vocab

# Derive "<name>_hash.txt" next to the raw vocabulary. os.path.splitext strips
# only the final extension, so directories or file names containing other dots
# (e.g. "./vocab/bpe.txt") are handled correctly.
vocab_save_path_hash = os.path.splitext(vocab_save_path)[0] + '_hash.txt'
hash_vocab(vocab_save_path, vocab_save_path_hash)

In [None]:
# Alternative Step 1-2. Download an existing vocabulary (one could use data from huggingface models).
# The `!wget` lines are IPython shell escapes, so this cell runs only inside a
# notebook / IPython session with wget available.

# Download standard bert English vocabulary
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
bert_vocab_en_path = 'bert-base-uncased-vocab.txt'
# Download bert Russian vocabulary
!wget https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/rubert-base-cased/vocab.txt
bert_vocab_ru_path = 'vocab.txt'

In [None]:
# True  -> use the files generated earlier in this notebook,
# False -> use the files shipped in the zip dataset archive.
use_test_data = True

# Dataset description; only the three file locations depend on the switch.
bankiru_info = {
    'path': 'data/bankiru_isgood.csv',
    'text_roles': ['message', 'title'],
    'target': 'is_good',
    'task': 'binary',
    'lang': 'ru',
    'csv2text': (
        file_data_text if use_test_data
        else 'data/csv2text/bankiru_isgood.txt'
    ),
    'vocab_path': (
        vocab_save_path if use_test_data
        else 'data/vocab/bankiru_isgood_vocab.txt'
    ),
    'vocab_hash_path': (
        vocab_save_path_hash if use_test_data
        else 'data/vocab_hash/bankiru_isgood_vocab_hash.txt'
    ),
}

In [None]:
# scenario 1: a ready-made hashed vocabulary is supplied
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    # data_path / tokenizer / vocab_size are left unset in this scenario
    text_params=dict(
        lang='ru',
        verbose=False,
        use_stem=False,
        vocab_path=bankiru_info['vocab_hash_path'],
        is_hash=True,
    ),
    tfidf_params=dict(
        n_components=n_components,
        n_oversample=n_oversample,
        tfidf_params=dict(ngram_range=ngram),
    ),
    linear_pipeline_params=dict(text_features='tfidf_subword'),
)

In [None]:
# Fit and evaluate the scenario 1 pipeline, then release cached CUDA memory
# and run the garbage collector before the next experiment.
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
# scenario 2: a raw (not yet hashed) vocabulary is supplied
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    # data_path / tokenizer / vocab_size are left unset in this scenario
    text_params=dict(
        lang='ru',
        verbose=False,
        use_stem=False,
        vocab_path=bankiru_info['vocab_path'],
        is_hash=False,
    ),
    tfidf_params=dict(
        n_components=n_components,
        n_oversample=n_oversample,
        tfidf_params=dict(ngram_range=ngram),
    ),
    linear_pipeline_params=dict(text_features='tfidf_subword'),
)

In [None]:
# Fit and evaluate the scenario 2 pipeline, then release cached CUDA memory
# and run the garbage collector before the next experiment.
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
# scenario 3: no vocabulary is given, only the .txt text dump plus tokenizer settings
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    text_params=dict(
        lang='ru',
        verbose=False,
        use_stem=False,
        vocab_path=None,
        data_path=bankiru_info['csv2text'],
        tokenizer="bpe",
        vocab_size=30000,
    ),
    tfidf_params=dict(
        n_components=n_components,
        n_oversample=n_oversample,
        tfidf_params=dict(ngram_range=ngram),
    ),
    linear_pipeline_params=dict(text_features='tfidf_subword'),
)

In [None]:
# Fit and evaluate the scenario 3 pipeline, then release cached CUDA memory
# and run the garbage collector before the next experiment.
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

#### embed text features

In [None]:
# Note: the gensim package was removed, so only torchnlp embeddings of fixed
# dimensionality are available now.
model_name = 'random_lstm'

# linear_l2 on sentence embeddings produced by the selected model
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    text_params=dict(lang='ru', verbose=False, use_stem=False),
    autonlp_params=dict(
        model_name=model_name,
        sent_scaler='l1',
        embedding_model='fasttext',  # author's note: this now has a different meaning
        cache_dir=None,
    ),
    linear_pipeline_params=dict(text_features='embed'),
)
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
model_name = 'borep'

# linear_l2 on sentence embeddings produced by the selected model (no sentence scaling)
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    text_params=dict(lang='ru', verbose=False, use_stem=False),
    autonlp_params=dict(
        model_name=model_name,
        sent_scaler=None,
        embedding_model='fasttext',  # author's note: this now has a different meaning
        cache_dir=None,
    ),
    linear_pipeline_params=dict(text_features='embed'),
)

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
model_name = 'random_lstm_bert'

# linear_l2 on sentence embeddings produced by the selected model (l2 sentence scaling)
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    text_params=dict(lang='ru', verbose=False, use_stem=False),
    autonlp_params=dict(
        model_name=model_name,
        sent_scaler='l2',
        embedding_model='fasttext',  # author's note: this now has a different meaning
        cache_dir=None,
    ),
    linear_pipeline_params=dict(text_features='embed'),
)

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
model_name = 'pooled_bert'

# linear_l2 on sentence embeddings produced by the selected model (l2 sentence scaling)
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    text_params=dict(lang='ru', verbose=False, use_stem=False),
    autonlp_params=dict(
        model_name=model_name,
        sent_scaler='l2',
        embedding_model='fasttext',  # author's note: this now has a different meaning
        cache_dir=None,
    ),
    linear_pipeline_params=dict(text_features='embed'),
)

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
model_name = 'wat'

# linear_l2 on sentence embeddings produced by the selected model (no sentence scaling)
automl = TabularNLPAutoMLGPU(
    task=task,
    timeout=600,
    cpu_limit=1,
    gpu_ids='0',
    client=None,
    general_params=dict(nested_cv=False, use_algos=[['linear_l2']]),
    reader_params=dict(npartitions=2),
    text_params=dict(lang='ru', verbose=False, use_stem=False),
    autonlp_params=dict(
        model_name=model_name,
        sent_scaler=None,
        embedding_model='fasttext',  # author's note: this now has a different meaning
        cache_dir=None,
    ),
    linear_pipeline_params=dict(text_features='embed'),
)

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

### catboost and xgb algos with tfidf text_features

In [None]:
# catboost on tf-idf text features
n_components = 100
n_oversample = 0
ngram = (1, 1)

automl = TabularNLPAutoMLGPU(task=task,
            timeout=600,
            cpu_limit=1,
            gpu_ids='0',
            client=None,
            general_params={
                'nested_cv': False,
                'use_algos': [['cb']]
            },
            reader_params={
                'npartitions': 2
            },
            text_params={
                'lang': 'ru',
                'verbose': False,
                'use_stem': False,
            },
            tfidf_params={
                'n_components': n_components,
                'n_oversample': n_oversample,
                'tfidf_params': {'ngram_range': ngram}
            },
            # Boosting models take their text feature type from
            # gbm_pipeline_params; linear_pipeline_params only affects
            # linear_l2, which is not in use_algos here.
            # NOTE(review): based on the LightAutoML NLP preset - confirm the
            # GPU preset mirrors it.
            gbm_pipeline_params={
                'text_features': "tfidf"
            },
            # Kept so that adding 'linear_l2' to use_algos stays consistent.
            linear_pipeline_params={
                'text_features': "tfidf"
            }
            )

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
# xgboost on tf-idf text features
n_components = 100
n_oversample = 0
ngram = (1, 1)

automl = TabularNLPAutoMLGPU(task=task,
            timeout=600,
            cpu_limit=1,
            gpu_ids='0',
            client=None,
            general_params={
                'nested_cv': False,
                'use_algos': [['xgb']]
            },
            reader_params={
                'npartitions': 2
            },
            text_params={
                'lang': 'ru',
                'verbose': False,
                'use_stem': False,
            },
            tfidf_params={
                'n_components': n_components,
                'n_oversample': n_oversample,
                'tfidf_params': {'ngram_range': ngram}
            },
            # Boosting models take their text feature type from
            # gbm_pipeline_params; linear_pipeline_params only affects
            # linear_l2, which is not in use_algos here.
            # NOTE(review): based on the LightAutoML NLP preset - confirm the
            # GPU preset mirrors it.
            gbm_pipeline_params={
                'text_features': "tfidf"
            },
            # Kept so that adding 'linear_l2' to use_algos stays consistent.
            linear_pipeline_params={
                'text_features': "tfidf"
            }
            )

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()