In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

# Imports

In [None]:
import os, sys
import re
import random
import pickle
import logging
import typing as T
from logging import getLogger
from datetime import datetime

import numpy as np
import pandas as pd
import scipy as sp
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

tqdm.pandas()

In [None]:
sys.path.append("..")
from src import (
    save_pickle, load_pickle,
    CustomBertForSequenceClassification,
    preprocess_text, LSTrainer
)

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

import transformers
from transformers import (
    Trainer, TrainingArguments, EvalPrediction, BertModel,
    AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
)
from datasets import Dataset, DatasetDict

In [None]:
torch.cuda.is_available()

# Settings

In [None]:
# Project layout: every location below is resolved relative to PROJ_PATH.
PROJ_PATH = ".."
DATA_PATH, SRC_PATH, MODELS_PATH = (
    os.path.join(PROJ_PATH, folder) for folder in ("data", "src", "model")
)
SUB_PATH = os.path.join(PROJ_PATH, "submission", "task1")

In [None]:
# Parquet file read by the "Get data" section.
TRAIN_DATA_PATH = os.path.join(
    os.path.join(DATA_PATH, "task1"), "data_fusion_train.parquet"
)

# Checkpoint directory passed to every `from_pretrained` call until it is reassigned.
MODEL_PATH = os.path.join(MODELS_PATH, "rubert_classification")


In [None]:
# Configure the root logger first (format + INFO threshold), then grab a
# logger named after the current module for use in this notebook.
logging.basicConfig(
    level=logging.INFO,
    format=u'%(filename)s - %(funcName)s()[LINE:%(lineno)d] # [%(levelname)-8s] [%(asctime)s]  %(message)s',
)
logger = getLogger(name=__name__)

In [None]:
# Seed every random number generator used here (Python, NumPy, torch CPU and
# all CUDA devices) with the same value so runs are repeatable.
seed_val = 42

for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed_val)

# Methods

In [None]:
class TokenizeFunction:
    """Callable that tokenizes a single text column of a batch.

    The instance stores the tokenizer together with the tokenization options,
    so it can be handed to ``Dataset.map`` as a plain one-argument function.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=...,
            padding=..., max_length=...)``.
        sent1_name: Key of the text column inside ``examples``.
        padding: Forwarded unchanged as the tokenizer's ``padding`` argument.
        max_seq_length: Forwarded as the tokenizer's ``max_length`` argument.
    """

    def __init__(
        self,
        tokenizer,
        sent1_name: str,
        padding: T.Union[bool, str] = False,
        max_seq_length: int = 64,
    ):
        self._tokenizer = tokenizer
        self._sent1_name = sent1_name
        self._padding = padding
        self._max_seq_length = max_seq_length

    def __call__(self, examples):
        """Tokenize ``examples[sent1_name]`` and return the tokenizer's output."""
        texts = examples[self._sent1_name]
        options = {
            "truncation": True,  # always truncate to max_length
            "padding": self._padding,
            "max_length": self._max_seq_length,
        }
        return self._tokenizer(texts, **options)

In [None]:
class MetricsCalculator:
    """Metric callback that reports the weighted F1 score of the predictions.

    The instance is called with an ``EvalPrediction`` and returns a dict with
    the single key ``f1_score_weighted_sklearn``.
    """

    def __init__(self):
        # Softmax over dim 1 of the logits passed in by __call__.
        self.softmax = torch.nn.Softmax(dim=1)

    def __call__(self, data: EvalPrediction):
        """Turn logits into class predictions and score them against the labels."""
        logits = torch.from_numpy(data.predictions.astype(np.float32))
        class_probs = self.softmax(logits).cpu().numpy()
        predicted = class_probs.argmax(axis=1)
        weighted_f1 = f1_score(data.label_ids, predicted, average='weighted')
        return {"f1_score_weighted_sklearn": weighted_f1}
        

# Get data

In [None]:
%%time
df = pd.read_parquet(TRAIN_DATA_PATH)
df_train = load_pickle(os.path.join(DATA_PATH, 'task1', 'classification_split', 'df_train.pickle'))
df_val = load_pickle(os.path.join(DATA_PATH, 'task1', 'classification_split', 'df_val.pickle'))
df.shape, df_train.shape, df_val.shape

In [None]:
# Rows with category_id == -1 are treated as unlabelled here; keep one row per
# distinct item_name.
is_unlabelled = df['category_id'].eq(-1)
df_no_label = (
    df.loc[is_unlabelled]
    .drop_duplicates(subset='item_name')
    .reset_index(drop=True)
)
df_no_label.shape

In [None]:
# Keep only the rows whose category_id differs from -1.
has_label = df['category_id'].ne(-1)
df = df.loc[has_label].reset_index(drop=True)
df.shape

# Load rubert model

In [None]:
MODEL_PATH

In [None]:
# Tokenizer and config are both read from the checkpoint directory in MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    use_fast=True,
    do_lower_case=True,
)
config = AutoConfig.from_pretrained(MODEL_PATH)



In [None]:
# Number of distinct values in the training split's `labels` column; passed to
# the model as `num_labels`.
num_labels = df_train['labels'].nunique(dropna=True)

In [None]:
model = CustomBertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=num_labels, output_hidden_states=True)


# Prepare train val datasets

In [None]:
# Wrap the text and label columns of both frames into a DatasetDict with
# "train" and "validation" splits.
text_and_label = ['item_name', 'labels']
train_dataset, val_dataset = (
    Dataset.from_pandas(frame[text_and_label]) for frame in (df_train, df_val)
)
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

In [None]:
# Tokenizer wrapper for the `item_name` column; padding is disabled at
# tokenization time.
tokenize_function = TokenizeFunction(tokenizer, 'item_name', padding=False)

In [None]:
# Tokenize every split in batches; the raw text column is dropped afterwards.
tokenized_datasets = datasets.map(
    tokenize_function,
    remove_columns=['item_name'],
    batched=True,
    num_proc=None,
)

In [None]:
# The in-place `remove_columns_` is deprecated in `datasets` and absent from
# current releases; `remove_columns` returns a new object, so rebind the name.
tokenized_datasets = tokenized_datasets.remove_columns(['token_type_ids'])
tokenized_datasets

# Prepare dataset for pseudo labeling

In [None]:
%%time
df_train['item_name'] = df_train['item_name'].apply(lambda x: x.lower())
df_val['item_name'] = df_val['item_name'].apply(lambda x: x.lower())
df_no_label['item_name'] = df_no_label['item_name'].apply(lambda x: x.lower())

In [None]:
# Drop unlabelled rows whose item_name already appears in the train or the
# validation frame.
in_train = df_no_label['item_name'].isin(df_train['item_name'])
in_val = df_no_label['item_name'].isin(df_val['item_name'])
df_no_label = df_no_label[~(in_train | in_val)].reset_index(drop=True)
df_no_label.shape

In [None]:
no_label_dataset = Dataset.from_pandas(df_no_label[['item_name']])


In [None]:
# Tokenize the unlabelled items in batches; the raw text column is dropped afterwards.
tokenized_no_label_dataset = no_label_dataset.map(
    tokenize_function,
    remove_columns=['item_name'],
    batched=True,
    num_proc=None,
)

In [None]:
# The in-place `remove_columns_` is deprecated in `datasets` and absent from
# current releases; `remove_columns` returns a new dataset, so rebind the name.
tokenized_no_label_dataset = tokenized_no_label_dataset.remove_columns(['token_type_ids'])
tokenized_no_label_dataset

# Create predictor

In [None]:
# Arguments for the prediction-only trainer: training is switched off and
# evaluation uses fp16 with batches of 32 per device.
training_args = TrainingArguments(
    output_dir='./model',
    label_names=['labels'],
    do_train=False,
    do_eval=True,
    evaluation_strategy='steps',
    per_device_eval_batch_size=32,
    fp16=True,
    disable_tqdm=False,
)

In [None]:
# A split is only attached when the matching do_train / do_eval flag is set.
train_split = tokenized_datasets["train"] if training_args.do_train else None
eval_split = tokenized_datasets["validation"] if training_args.do_eval else None

pred_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=MetricsCalculator(),
)

# Predict pseudo labels

In [None]:
pseudo_predicts = pred_trainer.predict(test_dataset=tokenized_no_label_dataset).predictions

In [None]:
# Convert the raw predictions to per-row probabilities, then take the most
# likely class and its probability for every row.
pseudo_predicts_softmax = sp.special.softmax(pseudo_predicts, axis=1)
pseudo_predicts_categories = np.argmax(pseudo_predicts_softmax, axis=1)
pseudo_predicts_probas = np.max(pseudo_predicts_softmax, axis=1)


In [None]:
# Attach the predicted class and its probability to every unlabelled row,
# then keep only the rows predicted with probability strictly above the cut-off.
PSEUDO_LABEL_MIN_PROBA = 0.915

df_no_label['labels'] = pseudo_predicts_categories
df_no_label['proba'] = pseudo_predicts_probas

is_confident = df_no_label['proba'].gt(PSEUDO_LABEL_MIN_PROBA)
df_pseudo_label = df_no_label.loc[is_confident].reset_index(drop=True)
df_pseudo_label.shape


In [None]:
# Reduce the pseudo-labelled frame to text + label, append the original
# training rows and shuffle the combined frame.
df_pseudo_label = df_pseudo_label.loc[:, ['item_name', 'labels']]
whole_train = (
    pd.concat([df_pseudo_label, df_train])
    .sample(frac=1.0)
    .reset_index(drop=True)
)


In [None]:
save_pickle(df_pseudo_label, os.path.join(DATA_PATH, 'task1', 'classification_split', 'df_pseudo_label_rubert.pickle'))


# Create new datasets

In [None]:
# Reload the pseudo-labelled frame and the train / validation frames from the
# classification_split folder.
split_dir = os.path.join(DATA_PATH, 'task1', 'classification_split')
df_pseudo_label = load_pickle(os.path.join(split_dir, 'df_pseudo_label_rubert.pickle'))
df_train = load_pickle(os.path.join(split_dir, 'df_train.pickle'))
df_val = load_pickle(os.path.join(split_dir, 'df_val.pickle'))

In [None]:
whole_train = pd.concat((df_pseudo_label, df_train)).sample(frac=1.).reset_index(drop=True)


In [None]:
# Training split = pseudo-labelled + original rows (whole_train); the
# validation split is built from df_val.
text_and_label = ['item_name', 'labels']
train_dataset, val_dataset = (
    Dataset.from_pandas(frame[text_and_label]) for frame in (whole_train, df_val)
)
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

In [None]:
# Tokenizer wrapper for the `item_name` column; padding is disabled at
# tokenization time.
tokenize_function = TokenizeFunction(tokenizer, 'item_name', padding=False)

In [None]:
# Tokenize every split in batches; the raw text column is dropped afterwards.
tokenized_datasets = datasets.map(
    tokenize_function,
    remove_columns=['item_name'],
    batched=True,
    num_proc=None,
)

In [None]:
# The in-place `remove_columns_` is deprecated in `datasets` and absent from
# current releases; `remove_columns` returns a new object, so rebind the name.
tokenized_datasets = tokenized_datasets.remove_columns(['token_type_ids'])
tokenized_datasets

## Create distilbert model

In [None]:
num_labels = 96

In [None]:
# MODEL_PATH is repointed at the distilbert checkpoint directory, and `model`
# is replaced by a classifier loaded from it.
MODEL_PATH = os.path.join(MODELS_PATH, "distilbert_ru_original_vocab_lowercase_240000")
model = CustomBertForSequenceClassification.from_pretrained(
    MODEL_PATH,
    output_hidden_states=True,
    num_labels=num_labels,
)


In [None]:
model.num_labels

# Train

In [None]:
# MODEL_NAME is not defined in any visible cell of this notebook, so building
# logging_dir raised a NameError on a fresh kernel. Derive it from the
# checkpoint directory currently stored in MODEL_PATH.
MODEL_NAME = os.path.basename(os.path.normpath(MODEL_PATH))

runs_dir = os.path.join(PROJ_PATH, 'reports', 'task1')
# The timestamp suffix gives every launch its own logging directory.
logging_dir = os.path.join(runs_dir, f"{MODEL_NAME}__rubert_pseudolabel_extend_mlm_labelsmooth_mdrop_lowercase__{datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}")

In [None]:
gradient_accumulation_steps = 1
# Evaluation, logging and checkpointing share one cadence, scaled down by the
# accumulation factor.
steps_per_round = 8000 // gradient_accumulation_steps

training_args = TrainingArguments(
    # Output locations.
    output_dir='./model',
    logging_dir=logging_dir,
    # What to run and how often to evaluate / log / save.
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=steps_per_round,
    logging_steps=steps_per_round,
    save_steps=steps_per_round,
    save_total_limit=7,
    # Batching.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Optimisation.
    num_train_epochs=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    lr_scheduler_type='linear',
    warmup_steps=3000,
    label_smoothing_factor=0.1,
    fp16=True,
    # Miscellaneous.
    label_names=['labels'],
    disable_tqdm=False,
)

In [None]:
# A split is only attached when the matching do_train / do_eval flag is set.
train_split = tokenized_datasets["train"] if training_args.do_train else None
eval_split = tokenized_datasets["validation"] if training_args.do_eval else None

trainer = LSTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=MetricsCalculator(),
)

In [None]:
trainer.train()

# Finetune head

## Create small datasets

In [None]:
# Rebuild the splits from df_train / df_val only (whole_train is not used here).
text_and_label = ['item_name', 'labels']
train_dataset, val_dataset = (
    Dataset.from_pandas(frame[text_and_label]) for frame in (df_train, df_val)
)
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

In [None]:
# Tokenizer wrapper for the `item_name` column; padding is disabled at
# tokenization time.
tokenize_function = TokenizeFunction(tokenizer, 'item_name', padding=False)

In [None]:
# Tokenize every split in batches; the raw text column is dropped afterwards.
tokenized_datasets = datasets.map(
    tokenize_function,
    remove_columns=['item_name'],
    batched=True,
    num_proc=None,
)

In [None]:
# The in-place `remove_columns_` is deprecated in `datasets` and absent from
# current releases; `remove_columns` returns a new object, so rebind the name.
tokenized_datasets = tokenized_datasets.remove_columns(['token_type_ids'])
tokenized_datasets

## Freeze bert

In [None]:
# NOTE(review): the surrounding headings read "Finetune head" / "Freeze bert",
# but requires_grad_(True) enables gradients on model.bert rather than
# disabling them (assuming model.bert is a torch.nn.Module) - confirm whether
# requires_grad_(False) was intended.
# The result is bound to `_` so the returned object is not kept under a real name.
_ = model.bert.requires_grad_(True)
# Replace the model's `high_dropout` attribute with a Dropout(0.2) layer.
model.high_dropout = nn.Dropout(0.2)

## Train

In [None]:
# MODEL_NAME is not defined in any visible cell of this notebook, so building
# logging_dir raised a NameError on a fresh kernel. Derive it from the
# checkpoint directory currently stored in MODEL_PATH.
MODEL_NAME = os.path.basename(os.path.normpath(MODEL_PATH))

runs_dir = os.path.join(PROJ_PATH, 'reports', 'task1')
# The timestamp suffix gives every launch its own logging directory.
logging_dir = os.path.join(runs_dir, f"{MODEL_NAME}__rubert_pseudolabel_extend_mlm_labelsmooth_mdrop_lowercase__{datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}")

In [None]:
gradient_accumulation_steps = 1
# Evaluation, logging and checkpointing share one cadence, scaled down by the
# accumulation factor.
steps_per_round = 2000 // gradient_accumulation_steps

training_args = TrainingArguments(
    # Output locations.
    output_dir='./model',
    logging_dir=logging_dir,
    # What to run and how often to evaluate / log / save.
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=steps_per_round,
    logging_steps=steps_per_round,
    save_steps=steps_per_round,
    save_total_limit=7,
    # Batching.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Optimisation.
    num_train_epochs=10,
    learning_rate=1e-7,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    lr_scheduler_type='linear',
    warmup_steps=200,
    label_smoothing_factor=0.1,
    fp16=True,
    # Miscellaneous.
    label_names=['labels'],
    disable_tqdm=False,
)

In [None]:
# A split is only attached when the matching do_train / do_eval flag is set.
train_split = tokenized_datasets["train"] if training_args.do_train else None
eval_split = tokenized_datasets["validation"] if training_args.do_eval else None

trainer = LSTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=MetricsCalculator(),
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

# Save

In [None]:
trainer.save_model(os.path.join(SUB_PATH, "model"))

# Create Submission

In [None]:
import zipfile 
# Archive written by the cells below, and the compression method applied to
# every entry added to it.
submission_name = os.path.join(SUB_PATH, "submission.zip")
compression = zipfile.ZIP_DEFLATED

In [None]:
def zip_dir(dir: str, target_dir: str, compression: int, zipObj):
    """Add the contents of ``dir`` to an open zip archive under ``target_dir``.

    The directory is walked recursively, so files inside sub-directories are
    archived as well (a plain ``os.listdir`` loop only records a bare directory
    entry and silently omits what is inside it). Entries are added in sorted
    order so the archive layout does not depend on file-system listing order.

    Args:
        dir: Directory on disk whose contents are archived.
        target_dir: Path prefix used for the entries inside the archive.
        compression: ``zipfile`` compression constant applied to every entry.
        zipObj: Open, writable ``zipfile.ZipFile`` that receives the entries.

    Raises:
        OSError: If ``dir`` (or one of its sub-directories) cannot be listed,
            e.g. because it does not exist.
    """
    def _reraise(error: OSError):
        # os.walk ignores listing errors by default; surface them so a missing
        # directory still fails loudly, as it did with os.listdir.
        raise error

    for root, subdirs, filenames in os.walk(dir, onerror=_reraise):
        subdirs.sort()  # in-place: also fixes the order in which os.walk descends
        relative_root = os.path.relpath(root, dir)
        if relative_root == os.curdir:
            archive_root = target_dir
        else:
            archive_root = os.path.join(target_dir, relative_root)
        # Directory entries are written too, matching what was produced for
        # first-level sub-directories before.
        for entry in subdirs + sorted(filenames):
            zipObj.write(
                os.path.join(root, entry),
                arcname=os.path.join(archive_root, entry),
                compress_type=compression
            )

In [None]:
# Build the submission archive: two files from SUB_PATH at the archive root,
# followed by the contents of SUB_PATH/model under "model".
with zipfile.ZipFile(submission_name, 'w') as zipObj:
    for filename in ('label_encoder.pickle', 'script.py'):
        source_file = os.path.join(SUB_PATH, filename)
        zipObj.write(source_file, arcname=filename, compress_type=compression)

    model_dir = os.path.join(SUB_PATH, "model")
    zip_dir(dir=model_dir, target_dir="model", compression=compression, zipObj=zipObj)

    # Show what ended up in the archive.
    print(zipObj.namelist())