In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard


# Imports

In [None]:
import os, sys
import re
import random
import pickle
import logging
import typing as T
from logging import getLogger
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Register tqdm's progress-bar hooks (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
sys.path.append("..")
from src import (
    save_pickle, load_pickle,
    CustomBertForSequenceClassification,
    preprocess_text, LSTrainer
)

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

import transformers
from transformers import (
    Trainer, TrainingArguments, EvalPrediction, BertModel,
    AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
)
from datasets import Dataset, DatasetDict

In [None]:
# Sanity check: the cell output is True when PyTorch can see a CUDA device.
torch.cuda.is_available()

# Settings

In [None]:
# Project layout, expressed relative to the notebook's working directory.
PROJ_PATH = ".."
DATA_PATH, SRC_PATH, MODELS_PATH = (
    os.path.join(PROJ_PATH, sub_dir) for sub_dir in ("data", "src", "model")
)
# Task-1 submission artefacts live in their own sub-folder.
SUB_PATH = os.path.join(PROJ_PATH, "submission", "task1")

In [None]:
# Input data: the task-1 training table stored as parquet.
TRAIN_DATA_PATH = os.path.join(DATA_PATH, 'task1', "data_fusion_train.parquet")
# Local checkpoint directory from which the config, tokenizer and model weights are loaded.
MODEL_PATH = os.path.join(MODELS_PATH, "rubert_original_vocab_lowercase")

In [None]:
# Notebook-level logger, plus a root configuration that emits INFO and above
# together with the call site (file, function, line) and a timestamp.
logger = getLogger(name=__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(filename)s - %(funcName)s()[LINE:%(lineno)d] # [%(levelname)-8s] [%(asctime)s]  %(message)s',
)

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with one value so runs are repeatable.
seed_val = 42

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed_val)

# Methods

In [None]:
class TokenizeFunction:
    """Callable that tokenizes one text column of a batch of examples.

    Args:
        tokenizer: Callable tokenizer; it is invoked with the column values plus
            ``truncation``, ``padding`` and ``max_length`` keyword arguments.
        sent1_name: Key of the text column inside each ``examples`` batch.
        padding: Forwarded unchanged as the tokenizer's ``padding`` argument.
        max_seq_length: Forwarded as ``max_length``; truncation is always enabled.
    """

    def __init__(
        self,
        tokenizer,
        sent1_name: str,
        padding: T.Union[bool, str] = False,
        max_seq_length: int = 64,
    ):
        self._tokenizer = tokenizer
        self._sent1_name = sent1_name
        self._padding = padding
        self._max_seq_length = max_seq_length

    def __call__(self, examples):
        """Tokenize ``examples[sent1_name]`` and return the tokenizer's output unchanged."""
        texts = examples[self._sent1_name]
        tokenizer_kwargs = dict(
            truncation=True,
            padding=self._padding,
            max_length=self._max_seq_length,
            # return_special_tokens_mask=True,
        )
        return self._tokenizer(texts, **tokenizer_kwargs)

In [None]:
class MetricsCalculator:
    """Compute class-weighted F1 / precision / recall from evaluation predictions.

    Args:
        class_weights: Mapping from class id to its weight. Every class id that
            occurs in the evaluation labels must be present as a key, otherwise
            the lookup raises ``KeyError``.
    """

    def __init__(self, class_weights):
        self.class_weights = class_weights
        self.softmax = torch.nn.Softmax(dim=1)

    def __call__(self, data: EvalPrediction):
        """Return the weighted metrics for one evaluation pass.

        Args:
            data: Evaluation result exposing ``predictions`` (logits, or a tuple
                whose first element is the logits) and ``label_ids``.

        Returns:
            dict with ``f1_score_weighted``, ``f1_score_weighted_sklearn``,
            ``precision_score_weighted`` and ``recall_score_weighted``.
        """
        predictions = data.predictions
        # When the model returns extra outputs (e.g. hidden states), the predictions
        # may arrive as a tuple; the logits are its first element.
        if isinstance(predictions, tuple):
            predictions = predictions[0]

        logits = torch.from_numpy(predictions.astype(np.float32))
        probs = self.softmax(logits).cpu().numpy()
        pred = probs.argmax(axis=1)
        target = data.label_ids

        resulted_f1 = []
        resulted_pres = []
        resulted_rec = []
        # One-vs-rest score per class present in the labels, scaled by that class's weight.
        for c in np.unique(target):
            weight = self.class_weights[c]
            c_targ = (target == c).astype(int)
            c_pred = (pred == c).astype(int)
            resulted_f1.append(f1_score(c_targ, c_pred) * weight)
            resulted_pres.append(precision_score(c_targ, c_pred) * weight)
            resulted_rec.append(recall_score(c_targ, c_pred) * weight)

        metrics = dict(
            f1_score_weighted = np.sum(resulted_f1),
            # sklearn's support-weighted F1, reported alongside for comparison.
            f1_score_weighted_sklearn = f1_score(target, pred, average='weighted'),
            precision_score_weighted = np.sum(resulted_pres),
            recall_score_weighted = np.sum(resulted_rec)
        )
        return metrics
        

# Get data

In [None]:
%%time
# Load the raw training table from parquet; the trailing expression shows (rows, columns).
df = pd.read_parquet(TRAIN_DATA_PATH)
df.shape

In [None]:
# Discard rows whose category_id is -1 (presumably the "no label" marker — confirm)
# and renumber the index from zero.
df = df.loc[df['category_id'].ne(-1)].reset_index(drop=True)
df.shape

# Get unique df

In [None]:
# `train` is an alias of `df`, so the helper column below is added to `df` as well.
train = df
train['weight'] = 1
# One row per distinct item name: keep its most frequent category and count
# how many times the name occurred (sum of the per-row weight of 1).
train_unique = (
    train
    .groupby('item_name')
    .agg({
        'category_id': lambda categories: categories.value_counts().index[0],
        'weight': 'sum',
    })
    .reset_index()
)

In [None]:
# Remove the row whose item name is the empty string, then renumber the index.
train_unique = train_unique.loc[train_unique['item_name'].ne("")].reset_index(drop=True)
train_unique.shape

# Preprocess

In [None]:
# train_unique['item_name_prep'] = preprocess_text(train_unique['item_name'])


# Label encode target

In [None]:
# Fit the encoder on the raw category ids (fit() returns the encoder itself).
le = LabelEncoder().fit(train_unique['category_id'])
# le = load_pickle(os.path.join(SUB_PATH, "label_encoder.pickle"))

In [None]:
# Replace the raw category ids with the encoder's labels.
# NOTE: this overwrites the column it reads from, so the cell is not safe to re-run on its own.
train_unique['category_id'] = le.transform(train_unique['category_id'])

# Get weights

In [None]:
# Reset every row's weight to 1, discarding the occurrence counts summed during
# de-duplication; the class weights are therefore based on unique item names per class.
train_unique['weight'] = 1

In [None]:
# Share of rows per class: count per category, normalise so the shares sum to 1,
# and list the largest classes first.
class_weights = (
    train_unique
    .groupby('category_id')
    .agg({'weight': 'sum'})
    .pipe(lambda totals: totals / totals.sum())
    .sort_values('weight', ascending=False)
)
class_weights.head(2)


In [None]:
# Turn the single-column frame into a plain {category_id: weight} dict.
class_weights = class_weights['weight'].to_dict()

# Split

In [None]:
# Keep only the text and target columns; the target is renamed to `labels`.
df_train_unique = (
    train_unique[['item_name', 'category_id']]
    .rename(columns={'category_id': 'labels'})
)

In [None]:
# Safety net: `train_unique` was already filtered for empty item names earlier,
# so this normally removes nothing.
df_train_unique = df_train_unique.loc[df_train_unique['item_name'].ne("")].reset_index(drop=True)

In [None]:
# Stratified 85/15 split on the label column; both parts get a fresh 0..n-1 index.
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df_train_unique,
        test_size=0.15,
        random_state=42,
        stratify=df_train_unique['labels'],
    )
)
df_train.shape, df_val.shape

In [None]:
# Persist both splits so the exact train/validation partition can be reused later.
for _file_name, _frame in {'df_train.pickle': df_train, 'df_val.pickle': df_val}.items():
    save_pickle(_frame, os.path.join(DATA_PATH, 'task1', 'classification_split', _file_name))

# Download models

In [None]:
# Load the model configuration and the fast, lower-casing tokenizer from the local checkpoint directory.
# NOTE(review): `config` is not passed to the model constructor further down — confirm it is still needed.
config = AutoConfig.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, do_lower_case=True, use_fast=True)



In [None]:
# Size of the classification head: number of distinct labels present in the training split.
num_labels = df_train['labels'].nunique()
# num_labels = 96

In [None]:
# Build the classifier from the local checkpoint with `num_labels` output classes;
# hidden states are requested in the model outputs (output_hidden_states=True).
model = CustomBertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=num_labels, output_hidden_states=True)


# Prepare dataset

In [None]:
# Wrap the pandas splits as datasets, keeping only the text and label columns,
# and bundle them under the "train" / "validation" keys.
train_dataset = Dataset.from_pandas(df_train[['item_name', 'labels']])
val_dataset = Dataset.from_pandas(df_val[['item_name', 'labels']])
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

In [None]:
# Tokenizer wrapper for the `item_name` column. No padding is applied here (padding=False);
# the maximum length stays at the class default of 64.
# NOTE(review): presumably batches are padded later by the trainer's collator — confirm.
tokenize_function = TokenizeFunction(
    tokenizer=tokenizer, sent1_name='item_name', padding=False
    )

In [None]:
# Tokenize both splits in batches and drop the raw text column from the result.
tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=None,
        remove_columns=['item_name']
    )

In [None]:
# The in-place `remove_columns_` is deprecated and absent from newer `datasets`
# releases; use the non-mutating `remove_columns` and rebind the result.
# Only drop columns that are actually present, so the cell can be re-run and also
# works with tokenizers that do not emit `token_type_ids`.
columns_to_drop = [
    column for column in ['token_type_ids']
    if column in tokenized_datasets['train'].column_names
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

# Train

In [None]:
# TensorBoard run directory: <project>/reports/task1/<model>__<experiment tag>__<timestamp>.
runs_dir = os.path.join(PROJ_PATH, 'reports', 'task1')
# No MODEL_NAME constant is defined in the settings cells, which made this cell fail
# with NameError on a fresh kernel; derive it from the checkpoint directory name.
MODEL_NAME = os.path.basename(os.path.normpath(MODEL_PATH))
logging_dir = os.path.join(runs_dir, f"{MODEL_NAME}__unique_extend_mlm_labelsmooth_mdrop_lowercase__{datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}")

In [None]:
gradient_accumulation_steps = 1
# Evaluation, logging and checkpointing share one cadence, scaled down when
# gradients are accumulated over several batches.
steps_per_report = 2000 // gradient_accumulation_steps

training_args = TrainingArguments(
    # What to run and where checkpoints go.
    output_dir='./model',
    do_train=True,
    do_eval=True,
    label_names=['labels'],
    # Batching and precision.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=True,
    # Optimisation schedule.
    num_train_epochs=40,
    learning_rate=3e-5,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    lr_scheduler_type='linear',
    warmup_steps=500,
    label_smoothing_factor=0.1,
    # Evaluation, logging and checkpoint cadence.
    evaluation_strategy='steps',
    eval_steps=steps_per_report,
    logging_steps=steps_per_report,
    save_steps=steps_per_report,
    save_total_limit=7,
    logging_dir=logging_dir,
    disable_tqdm=False,
)

In [None]:
# Assemble the trainer: model, arguments, the tokenized splits (each passed only when the
# matching do_train / do_eval flag is set), the tokenizer and the class-weighted metrics callback.
trainer = LSTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        compute_metrics=MetricsCalculator(class_weights=class_weights)
    )

In [None]:
# Run training with the trainer assembled in the previous cell.
trainer.train()

In [None]:
# Evaluate on the trainer's eval dataset; the returned value is shown as the cell output.
trainer.evaluate()

# Save

In [None]:
# Persist the trained model under MODELS_PATH/rubert_classification.
trainer.save_model(os.path.join(MODELS_PATH, "rubert_classification"))

In [None]:
# Store the fitted label encoder in the submission folder
# (presumably needed to map predicted labels back to category ids — confirm).
save_pickle(le, os.path.join(SUB_PATH, "label_encoder.pickle"))