# Step 0. Install LightAutoML (LAMA)

In [None]:
pip install -U lightautoml

In [None]:
pip install -U transformers

# Step 0.1. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt

# Imports from our package
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task


# Step 0.2. Parameters 

In [None]:
# Name of the label column
TARGET_NAME = 'target'

# Reproducibility / validation settings
RANDOM_STATE = 42  # fixed random seed
TEST_SIZE = 0.2    # test share for a metric check

# Compute budget
N_THREADS = 4          # thread count for the lgbm and linear models
TIMEOUT = 6 * 60 * 60  # time limit for the AutoML run, in seconds

# Step 0.3. Fix torch number of threads and numpy seed 

In [None]:
# Cap the number of threads torch may use.
torch.set_num_threads(N_THREADS)
# Seed numpy's global random generator.
np.random.seed(RANDOM_STATE)

# Step 0.4. Example data load 

In [None]:
%%time

# Read the training CSV (it carries the `target` column used later) and
# preview the first rows.
train_data = pd.read_csv('../input/nlp-getting-started/train.csv')
train_data.head()

In [None]:
# Read the test CSV to be scored by the fitted model and preview the first rows.
test_data = pd.read_csv('../input/nlp-getting-started/test.csv')
test_data.head()

In [None]:
# Read the sample submission; its `target` column is overwritten with the
# model predictions before the file is saved.
submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
submission.head()

In [None]:
# (rows, columns) of the train, test and sample-submission frames, in that order
tuple(frame.shape for frame in (train_data, test_data, submission))

# Step 0.5. Some EDA

In [None]:
# Class balance: number of training rows per target value.
train_data.target.value_counts()

In [None]:
# Frequency of each keyword; dropna=False also counts the missing values.
train_data['keyword'].value_counts(dropna = False)

In [None]:
# Frequency of each location; dropna=False also counts the missing values.
train_data['location'].value_counts(dropna = False)

# Step 0.6. Data cleaning and preparation

In [None]:
def clean_text(text):
    """Return ``text`` unchanged.

    Placeholder hook for text cleaning: no transformation is applied yet, so
    the input object is handed back as-is.
    """
    return text

In [None]:
# Stack train (without the target) and test so that the same preprocessing is
# applied to both parts; ignore_index gives a fresh 0..n-1 index.
all_data = pd.concat(
    [train_data.drop(columns=TARGET_NAME), test_data],
    ignore_index=True,
)

# Collapse locations that occur fewer than 5 times into one bucket.
# value_counts() skips NaN, so missing locations map to a NaN count, fail the
# "< 5" comparison and stay missing. This avoids converting NaN to the string
# 'nan' and back, which could mislabel missing values as "RARE_VALUE".
location_counts = all_data['location'].map(all_data['location'].value_counts())
all_data.loc[location_counts < 5, 'location'] = "RARE_VALUE"

# Apply the text-cleaning hook to every tweet.
all_data['text'] = all_data['text'].map(clean_text)

all_data

In [None]:
# Split the combined frame back into its train and test parts.
# The train length is captured once, before `train_data` is rebound.
n_train = len(train_data)
y_train = train_data[TARGET_NAME].values

# .copy() yields independent frames, so adding the target column does not
# write into a slice of `all_data` (which triggers SettingWithCopyWarning).
train_data = all_data.iloc[:n_train].copy()
train_data[TARGET_NAME] = y_train
test_data = all_data.iloc[n_train:].copy()

#  ==== AutoML preset usage ====


## Step 1. Create Task

In [None]:
%%time

task = Task('binary', )

## Step 2. Setup columns roles

In [None]:
%%time

roles = {'target': TARGET_NAME, 
         'text': ['text'],
        'drop': ['id']}

## Step 3. Create AutoML from preset

To create the AutoML model we use the `TabularNLPAutoML` preset.


All the parameters we set above can be passed to the preset to change its configuration:

In [None]:
%%time 

automl = TabularNLPAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       reader_params = {'cv': 5},
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'nn']]},
                       text_params = {'lang': 'en'},
                       nn_params = {'lang': 'en', 
                                    'bert_name': 'vinai/bertweet-base', 
                                    'opt_params': { 'lr': 1e-5},
                                    'max_length': 300, 'bs': 13,
                                    'n_epoch': 5
                                    },
                       )

oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

In [None]:
# Presumably lists the input features the fitted models actually used -
# TODO confirm against the LightAutoML documentation.
automl.collect_used_feats()

## Step 4. Predict to test data

In [None]:
# Score the test rows with the fitted preset, then print the predictions
# together with their shape.
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

## Step 5. Select best threshold to optimize F1 score

In [None]:
def select_threshold_f1(y_true, y_pred):
    """Pick the decision threshold that maximises F1 on the given predictions.

    Thresholds from 0.00 to 1.00 in steps of 0.01 are tried; a prediction is
    labelled 1 when it is strictly greater than the threshold. The best score
    and threshold are printed.

    Args:
        y_true: true binary labels.
        y_pred: predicted scores, compared element-wise with each threshold.

    Returns:
        The threshold with the highest F1 score (the lowest one on ties).
    """
    candidates = np.arange(0, 1.01, 0.01)
    f1_by_thr = [
        f1_score(y_true, (y_pred > cut).astype(int))
        for cut in candidates
    ]

    # np.argmax returns the first maximum, i.e. the lowest threshold on ties.
    top = int(np.argmax(f1_by_thr))
    best_score = f1_by_thr[top]
    best_thr = candidates[top]

    print('Best score: {}\nBest selected threshold: {:.2f}'.format(best_score, best_thr))
    return best_thr

# Tune the threshold on the training labels against the first column of
# `oof_pred` (presumably the out-of-fold positive-class scores - confirm).
best_thr = select_threshold_f1(train_data[TARGET_NAME], oof_pred.data[:, 0])

## Step 6. Generate submission file

In [None]:
# Binarise the first test-prediction column with the tuned threshold, using
# the same strict ">" comparison as the threshold search.
# NOTE(review): assumes the rows of `submission` are in the same order as
# `test_data` - confirm.
submission['target'] = (test_pred.data[:, 0] > best_thr).astype(int)
submission

In [None]:
# Sanity check: how many test rows were assigned to each class.
submission['target'].value_counts()

In [None]:
# Write the submission to CSV in the working directory, without the DataFrame index.
submission.to_csv('LightAutoML_preds_without_id.csv', index = False)