In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Connect Google Drive and install dependencies (Colab only)

In [None]:
try:
  import google.colab
  IN_COLAB = True
  print('RUNNING IN COLAB')
except:
  IN_COLAB = False

if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/gdrive', force_remount=True)
  !pip install transformers==4.3.2
  !pip install datasets==1.2.1

# Imports

In [None]:
import os, sys
import random
import pickle
import logging
import shutil
import json
import typing as T
from logging import getLogger
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

tqdm.pandas()

In [None]:
# Add the parent of the current working directory to the import path so that
# the project-local `src` package can be imported.
sys.path.append("..")
from src import (
    save_pickle, load_pickle
)

In [None]:
import torch
import torch.nn.functional as F

import transformers
from transformers import (
    Trainer, TrainingArguments, EvalPrediction,
    AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
)
from datasets import Dataset, DatasetDict

In [None]:
# Display whether PyTorch can see a CUDA device.
torch.cuda.is_available()

# Settings

In [None]:
# Project layout, expressed relative to the project root.
PROJ_PATH = ".."
DATA_PATH, SRC_PATH, MODELS_PATH = (
    os.path.join(PROJ_PATH, subdir) for subdir in ("data", "src", "model")
)

In [None]:
# Parquet file with the task-1 training data.
TRAIN_DATA_PATH = os.path.join(DATA_PATH, "task1", "data_fusion_train.parquet")

# Both RuBERT and DistilBERT need to be pre-trained on the MLM task;
# exactly one checkpoint name is active at a time.
# MODEL_NAME = "Geotrend/bert-base-ru-cased"  # ru part of multilingual DistilBERT
MODEL_NAME = "DeepPavlov/rubert-base-cased"


In [None]:
# Configure root logging at INFO level with a format that shows where each
# record was emitted, then create this notebook's own logger.
logging.basicConfig(
    level=logging.INFO,
    format=(
        '%(filename)s - %(funcName)s()[LINE:%(lineno)d] '
        '# [%(levelname)-8s] [%(asctime)s]  %(message)s'
    ),
)
logger = getLogger(__name__)

In [None]:
# Seed every random number generator used here (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value so that runs are reproducible.
seed_val = 42

for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed_val)

# Methods

In [None]:
class TokenizeFunction:
    """Callable that tokenizes one text field of a batch of examples.

    Args:
        tokenizer: Callable tokenizer; it is invoked with the texts plus the
            keyword arguments ``padding``, ``truncation``, ``max_length`` and
            ``return_special_tokens_mask``.
        sent1_name: Key under which the texts are stored in ``examples``.
        padding: Value forwarded to the tokenizer's ``padding`` argument.
        max_seq_length: Value forwarded as ``max_length``; truncation is
            always enabled.
    """

    def __init__(
        self, tokenizer, sent1_name: str, padding: bool, max_seq_length: int = 128
    ):
        self._tokenizer = tokenizer
        self._sent1_name = sent1_name
        self._padding = padding
        self._max_seq_length = max_seq_length

    def __call__(self, examples):
        """Tokenize ``examples[sent1_name]`` and return the tokenizer's output."""
        texts = examples[self._sent1_name]
        tokenizer_kwargs = {
            "padding": self._padding,
            "truncation": True,
            "max_length": self._max_seq_length,
            # The special-tokens mask is requested because
            # DataCollatorForLanguageModeling is more efficient when it
            # receives the `special_tokens_mask`.
            "return_special_tokens_mask": True,
        }
        return self._tokenizer(texts, **tokenizer_kwargs)

# Get data

In [None]:
%%time

# Read the training parquet file into a DataFrame and display its shape.
# (`%%time` has to stay on the first line of the cell.)
df = pd.read_parquet(TRAIN_DATA_PATH)
df.shape

In [None]:
# Keep a single row per distinct item name, drop rows whose name is the empty
# string, and renumber the index.
df = (
    df
    .drop_duplicates(subset='item_name')
    .loc[lambda frame: frame['item_name'] != ""]
    .reset_index(drop=True)
)
df.shape

# Split

In [None]:
# Shuffle all rows before splitting. The seed is passed explicitly so the
# resulting order (and therefore the train/validation split) does not depend on
# how much of the global NumPy random state was consumed before this cell ran.
df = df.sample(frac=1, random_state=seed_val).reset_index(drop=True)

In [None]:
# Number of (shuffled) rows assigned to the training split; all remaining rows
# form the validation split.
TRAIN_SIZE = 3_000_000

# Only the item names are kept; they are exposed under the column name 'text'.
item_texts = df[['item_name']].rename(columns={'item_name': 'text'})
df_train = item_texts.iloc[:TRAIN_SIZE].reset_index(drop=True)
df_val = item_texts.iloc[TRAIN_SIZE:].reset_index(drop=True)

# Fail here, not later during evaluation, if the data set is too small to leave
# any rows for validation.
assert len(df_val) > 0, (
    f"Validation split is empty: only {len(df)} rows available, "
    f"but TRAIN_SIZE={TRAIN_SIZE}"
)

df_train.shape, df_val.shape


# Download model config and tokenizer

In [None]:
# Load the pretrained configuration and the fast tokenizer for MODEL_NAME.
# The tokenizer is asked to lower-case its input.
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    do_lower_case=True,
)

In [None]:
# Display the number of entries in the tokenizer's vocabulary.
len(tokenizer.get_vocab())

# Prepare dataset

In [None]:
# Wrap both pandas splits as `Dataset` objects.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val)
)


In [None]:
# Bundle both splits under the keys "train" and "validation".
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
    }
)


In [None]:
# Tokenizer callable for the 'text' column; padding is disabled here.
tokenize_function = TokenizeFunction(
    tokenizer=tokenizer,
    sent1_name='text',
    padding=False,
)

In [None]:
# Tokenize every split in batches and drop the raw 'text' column afterwards.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
    num_proc=None,
)

In [None]:
# Persist the tokenized splits under the data directory; the same path is later
# handed to the training script via `tokenized_dataset_dict_path`.
tokenized_dataset_path = os.path.join(DATA_PATH, "tokenized_dataset")
tokenized_datasets.save_to_disk(tokenized_dataset_path)


# Train

In [None]:
# Directory that collects the logs of all MLM runs.
runs_dir = os.path.join(PROJ_PATH, 'reports', 'mlm')

# Build a single, filesystem-safe directory name for this run:
#  * '/' in the model name is replaced so no unintended nested directory is created;
#  * the timestamp uses '-' instead of ':' because ':' is not allowed in Windows paths.
run_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
run_name = f"{MODEL_NAME.replace('/', '_')}__basic_tokenizer__{run_timestamp}"
logging_dir = os.path.join(runs_dir, run_name)

In [None]:
# For Colab: place a copy of the training script in the working directory,
# from where it is launched at the end of the notebook.
shutil.copyfile(
    os.path.join(SRC_PATH, "run_mlm_checkpoint.py"),
    "./run_mlm_checkpoint.py",
)

In [None]:
# Directory passed to the training script as `output_dir`.
# Alternative target kept for the DistilBERT variant:
# output_dir = os.path.join(MODELS_PATH, "distilbert_ru_original_vocab_lowercase_240000")

output_dir = os.path.join(MODELS_PATH, "rubert_original_vocab_lowercase")


In [None]:
# Arguments for the training script, grouped by purpose. The three groups are
# merged and written to a JSON file further down.

# Where the tokenized dataset was saved.
data_args_dict = {
    "tokenized_dataset_dict_path": tokenized_dataset_path,
}

# Which pretrained checkpoint and tokenizer to start from.
model_args_dict = {
    "model_name_or_path": MODEL_NAME,
    "tokenizer_name": MODEL_NAME,
}

# Training / evaluation schedule and optimisation settings.
training_args_dict = {
    "do_train": True,
    "do_eval": True,
    # "max_seq_length": 128,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "learning_rate": 3e-5,
    "eval_steps": 10000,
    "save_steps": 5000,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "save_total_limit": 1,
    "output_dir": output_dir,
    "logging_dir": logging_dir,
    "fp16": True,
}

In [None]:
# Merge the three argument groups (later groups win on duplicate keys) and
# write them to args.json for the training script.
args_dict = {}
args_dict.update(data_args_dict)
args_dict.update(model_args_dict)
args_dict.update(training_args_dict)

with open('args.json', 'w') as f:
    json.dump(args_dict, f)

In [None]:
# Launch the training script as a shell command, passing the JSON file with
# the arguments written above.
!python run_mlm_checkpoint.py args.json
