In [1]:
import transformers
from src.ru import rsg, mokoron
import datasets
from src.data import dict_map, unpack_splits, make_features
from src.trf import MultitaskModel, MultitaskTrainer, NLPDataCollator, Task

In [2]:
# Checkpoint name shared by the tokenizer, every per-task config and (in a later
# cell) the multitask model.
base_model_name = "DeepPavlov/rubert-base-cased"
tokenizer = transformers.AutoTokenizer.from_pretrained(base_model_name)

# Keyword arguments given to every feature converter below.
# NOTE(review): the run recorded in this notebook logs a tokenizer warning that
# `max_length` was supplied without `truncation=True`. If InputLabelConv forwards
# its keyword arguments to the tokenizer, adding `truncation=True` here would make
# the truncation explicit — confirm against InputLabelConv before changing.
conv_params = dict(pad_to_max_length=True, max_length=512)

# Dataset name passed to `datasets.load_dataset`, and the display-name prefix used
# for the tasks that come from it.
path = "russian_super_glue"
rsg_name = "Russian SuperGLUE"

# Task registry keyed by a short task id. Every task uses a sequence-classification
# model class with a two-label config; each gets its own config and converter
# instance.
tasks = dict(
    danetqa=Task(
        name=f"{rsg_name}: DaNetQA",
        cls=transformers.AutoModelForSequenceClassification,
        config=transformers.AutoConfig.from_pretrained(base_model_name, num_labels=2),
        data=dict_map(
            datasets.load_dataset(path, "danetqa"),
            rsg.preprocess_danetqa,
            name="danetqa",
        ),
        converter_to_features=rsg.InputLabelConv(tokenizer, **conv_params),
    ),
    mokoron=Task(
        name="RuTwitter Sentiment",
        cls=transformers.AutoModelForSequenceClassification,
        config=transformers.AutoConfig.from_pretrained(base_model_name, num_labels=2),
        data=mokoron.load(),
        converter_to_features=rsg.InputLabelConv(tokenizer, **conv_params),
    ),
    parus=Task(
        name=f"{rsg_name}: PARus",
        cls=transformers.AutoModelForSequenceClassification,
        config=transformers.AutoConfig.from_pretrained(base_model_name, num_labels=2),
        data=dict_map(
            datasets.load_dataset(path, "parus"),
            rsg.preprocess_parus,
            name="parus",
        ),
        converter_to_features=rsg.InputLabelConv(tokenizer, **conv_params),
    ),
)

Reusing dataset russian_super_glue (/Users/s1m00n/.cache/huggingface/datasets/russian_super_glue/danetqa/0.0.1/6fcadbfc1d8f0298b2f01ff277093772efe9e1b98f3c0df8ab5f511b3b9e13c9)


  0%|          | 0/3 [00:00<?, ?it/s]

map(preprocess_danetqa, danetqa):
    map(preprocess_danetqa, danetqa[train])
    map(preprocess_danetqa, danetqa[validation])
    map(preprocess_danetqa, danetqa[test])


Reusing dataset russian_super_glue (/Users/s1m00n/.cache/huggingface/datasets/russian_super_glue/parus/0.0.1/6fcadbfc1d8f0298b2f01ff277093772efe9e1b98f3c0df8ab5f511b3b9e13c9)


  0%|          | 0/3 [00:00<?, ?it/s]

map(preprocess_parus, parus):
    map(preprocess_parus, parus[train])
    map(preprocess_parus, parus[validation])
    map(preprocess_parus, parus[test])


In [3]:
# Build features for every registered task.
# NOTE(review): `batched` and `load_from_cache_file` are presumably forwarded to
# the underlying `datasets` map call (the progress bars count batches), so
# features would be recomputed on every run instead of read from cache — confirm
# in make_features.
features = make_features(
    tasks,
    batched=True,
    load_from_cache_file=False,
)

# Keep only the two splits that the training cell consumes.
train, validation = unpack_splits(
    features,
    "train",
    "validation",
)

  0%|          | 0/2 [00:00<?, ?ba/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/182 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Seed the Python/NumPy/PyTorch RNGs *before* the model is built, so that any
# random initialisation performed while creating it is reproducible across
# Restart & Run All. The Trainer seeds on its own construction, but that happens
# only after the model already exists.
seed = 42
transformers.set_seed(seed)

model = MultitaskModel.create(base_model_name, tasks)

# NOTE(review): `eval_steps` is set but no evaluation strategy is. Per the
# TrainingArguments documentation `eval_steps` only applies when evaluation runs
# on a "steps" schedule, so presumably no periodic evaluation happens with these
# arguments — confirm that MultitaskTrainer supports evaluation before enabling it.
training_args = transformers.TrainingArguments(
    output_dir="test",
    overwrite_output_dir=True,
    seed=seed,  # same value as the explicit set_seed call above
    learning_rate=1e-5,
    logging_steps=100,
    eval_steps=500,
    do_train=True,
    num_train_epochs=1,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=128,
    save_steps=5000,
)

trainer = MultitaskTrainer(
    model=model,
    args=training_args,
    data_collator=NLPDataCollator(),
    train_dataset=train,
    eval_dataset=validation,
)
trainer.train()