In [2]:
import os
import random
from collections import defaultdict

import evaluate
import numpy as np
import torch
from tqdm import tqdm
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset

In [3]:
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
os.environ["PYTHONHASHSEED"] = str(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

In [4]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
DATASET_NAME = "sberquad"
CHECKPOINT_NAME = "ai-forever/ruBert-base"

## Скачаем датасет, сделаем предобработку

Используем датасет squad, потому что так сказано в задании

In [6]:
dataset = load_dataset(DATASET_NAME)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 23936
    })
})

Датасет слишком большой, поэтому от train и validation подвыборки отрежем данных в таком количестве, чтобы они соотносились друг к другу в отношении 70/30, а test все равно пустой

In [7]:
for subset, subset_size in zip(["train", "validation"], [7000, 3000]):
    dataset[subset] = dataset[subset].select(range(subset_size))

In [8]:
dataset.pop("test")  # он все равно пустой

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 23936
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_NAME)

In [10]:
max_length = 384
stride = 128

Токенизируем датасет

In [11]:
def preprocess_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start = answer["answer_start"][0]
        end = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        ctx_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        ctx_end = idx - 1

        if offset[ctx_start][0] > end or offset[ctx_end][1] < start:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = ctx_start
            while idx <= ctx_end and offset[idx][0] <= start:
                idx += 1
            start_positions.append(idx - 1)

            idx = ctx_end
            while idx >= ctx_start and offset[idx][1] >= end:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [12]:
preprocessed_dataset = dataset.map(preprocess_examples, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [13]:
preprocessed_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
        num_rows: 3000
    })
})

## Обучим QA на основе модели ruBert

В библиотеке evaluate уже есть метрики для похожего датасета,  поэтому используем их

In [14]:
squad_metric = evaluate.load("squad")

In [15]:
n_best = 15
max_answer_length = 40

In [16]:
def compute_metrics(eval_preds):
    y_pred = np.argmax(eval_preds[0], -1).T

    f1_score = 0
    exact_match = 0
    for data, pred in zip(preprocessed_dataset["validation"], y_pred):
        start = data["offset_mapping"][pred[0]][0]
        end = data["offset_mapping"][pred[1]][1]

        predictions = [
            {"prediction_text": data["context"][start:end], "id": str(data["id"])}
        ]
        gt = [{"answers": data["answers"], "id": str(data["id"])}]
        results = squad_metric.compute(predictions=predictions, references=gt)
        f1_score += results["f1"] / 100
        exact_match += results["exact_match"] / 100
    f1_score /= len(preprocessed_dataset["validation"])
    exact_match /= len(preprocessed_dataset["validation"])
    return {"f1": f1_score, "exact_match": exact_match}

In [17]:
model = AutoModelForQuestionAnswering.from_pretrained(CHECKPOINT_NAME).to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Для обучения будем использовать встроенный Trainer из библиотеки transformers

In [18]:
args = TrainingArguments(
    "my_custom_bert_squad",
    do_eval=True,
    save_strategy="steps",
    optim="adamw_hf",
    eval_steps=50,
    logging_steps=50,
    learning_rate=1e-5,
    num_train_epochs=10,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
)

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=preprocessed_dataset["train"],
    eval_dataset=preprocessed_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Прежде чем обучать, давайте замерим изначальные метрики

In [20]:
predictions = trainer.predict(preprocessed_dataset["validation"])
print(f"Test metric before training: {compute_metrics(predictions)}")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Test metric before training: {'f1': 0.0690604238411928, 'exact_match': 0.0}


In [21]:
trainer.train()



Step,Training Loss
50,4.9059
100,3.0992
150,2.5328
200,2.258
250,2.108
300,1.9944
350,1.8948
400,1.86
450,1.7885
500,1.759


TrainOutput(global_step=540, training_loss=2.3715108094392, metrics={'train_runtime': 1657.595, 'train_samples_per_second': 42.23, 'train_steps_per_second': 0.326, 'total_flos': 1.3531513844330496e+16, 'train_loss': 2.3715108094392, 'epoch': 9.86})

In [22]:
predictions = trainer.predict(preprocessed_dataset["validation"])
print(f"Test metric before training: {compute_metrics(predictions)}")

Test metric before training: {'f1': 0.7340265674022828, 'exact_match': 0.5206666666666667}


Вывод: метрики модели после обучения стали лучше