In [1]:
import os
if not os.path.exists("ua_squad_dataset.json"):
    !wget https://github.com/fido-ai/ua-datasets/releases/download/v0.0.1/ua_squad_dataset.json

In [2]:
!mkdir ukr_squad

mkdir: cannot create directory ‘ukr_squad’: File exists


In [3]:
import json

with open("ua_squad_dataset.json") as file:
    squad = json.load(file)

with open("ukr_squad/output.json", mode="w") as file:
    for item in squad:
        file.write(json.dumps(item, ensure_ascii=False))
        file.write("\n")


In [4]:
from datasets import load_dataset, Dataset

ukr_squad = load_dataset("ukr_squad")
ukr_squad = ukr_squad.shuffle(seed=42)
ukr_squad

Using custom data configuration ukr_squad-feb2caf182c7b742


Downloading and preparing dataset json/ukr_squad to /home/robinhad/.cache/huggingface/datasets/json/ukr_squad-feb2caf182c7b742/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/robinhad/.cache/huggingface/datasets/json/ukr_squad-feb2caf182c7b742/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Question', 'Context', 'Answer'],
        num_rows: 13859
    })
})

In [5]:
ukr_squad = ukr_squad.filter(lambda x: len(x["Answer"]) > 0)

  0%|          | 0/14 [00:00<?, ?ba/s]

In [6]:
split_point = int(0.05*ukr_squad.num_rows["train"]) # split at 10%
ukr_squad["validation"] = Dataset.from_dict(ukr_squad["train"][0:split_point]) 
ukr_squad["train"] =  Dataset.from_dict(ukr_squad["train"][split_point::])
ukr_squad

DatasetDict({
    train: Dataset({
        features: ['Question', 'Context', 'Answer'],
        num_rows: 10386
    })
    validation: Dataset({
        features: ['Question', 'Context', 'Answer'],
        num_rows: 546
    })
})

In [7]:
ukr_squad["train"][1]

{'Question': 'Який рік був використаний для оцінок у звіті за 2015 рік?',
 'Context': 'Звіт про людський розвиток за 2015 рік підготований Програмою розвитку Організації Об\'єднаних Націй був опублікований 14 грудня 2015 року, він використовує значення ІРЛ на основі оцінок 2014 року. Нижче наведено перелік країн "дуже високого людського розвитку":',
 'Answer': '2014'}

In [8]:
from transformers import AutoTokenizer, AlbertTokenizerFast

model_name = "ukr-models/xlm-roberta-base-uk"

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
def preprocess_function(examples):
    #print(examples)
    questions = [q.strip() for q in examples["Question"]]
    inputs = tokenizer(
        questions,
        examples["Context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["Answer"]
    contexts = examples["Context"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        
        if answer is None:
            print(questions[i], answer, contexts[i])
        start_char = contexts[i].find(answer)
        end_char = start_char + len(answer)
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
tokenized_ukr_squad = ukr_squad.map(preprocess_function, batched=True, remove_columns=ukr_squad["train"].column_names)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [12]:
from transformers import AutoModelForQuestionAnswering, GPT2LMHeadModel, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at ukr-models/xlm-roberta-base-uk were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at ukr-models/xlm-roberta-base-uk and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream t

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    save_total_limit=3,
    save_strategy="epoch",
    #fp16=True,
    #gradient_checkpointing=True,
    #gradient_accumulation_steps=4,
    report_to="tensorboard"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ukr_squad["train"],
    eval_dataset=tokenized_ukr_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

***** Running training *****
  Num examples = 10386
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3900


Epoch,Training Loss,Validation Loss
1,2.4526,1.363105
2,1.3317,1.222852
3,1.0693,1.218412
4,0.6851,1.317087
5,0.5594,1.38929
6,0.4954,1.477818


***** Running Evaluation *****
  Num examples = 546
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-650
Configuration saved in ./results/checkpoint-650/config.json
Model weights saved in ./results/checkpoint-650/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-650/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-650/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 546
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1300
Configuration saved in ./results/checkpoint-1300/config.json
Model weights saved in ./results/checkpoint-1300/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1300/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1300/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *

TrainOutput(global_step=3900, training_loss=1.0019937759790665, metrics={'train_runtime': 1103.4027, 'train_samples_per_second': 56.476, 'train_steps_per_second': 3.535, 'total_flos': 1.2212226519570432e+16, 'train_loss': 1.0019937759790665, 'epoch': 6.0})

In [14]:
trainer.create_model_card()

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Question Answering', 'type': 'question-answering'}}


In [22]:
from transformers import pipeline

qa_model = pipeline("question-answering", model=model.to("cpu"), tokenizer=tokenizer)
question = "Що відправлять для ЗСУ?"
context = "Про це повідомив міністр оборони Арвідас Анушаускас. Уряд Литви не має наміру зупинятися у військово-технічній допомозі Україні. Збройні сили отримають антидрони, тепловізори та ударний безпілотник. «Незабаром Литва передасть Україні не лише обіцяні бронетехніку, вантажівки та позашляховики, але також нову партію антидронів та тепловізорів. І, звичайно, Байрактар, який придбають на зібрані литовцями гроші», - написав глава Міноборони."
qa_model(question = question, context = context)

{'score': 0.28905344009399414,
 'start': 151,
 'end': 198,
 'answer': ' антидрони, тепловізори та ударний безпілотник.'}