### Import necessary modules

In [200]:
from pathlib import Path

DATASET = Path("../data/dataset-0.5.json")

MODEL = "ufal/robeczech-base"
BATCH_SIZE = 16

### Load data

Load dataset with errored sentences

In [201]:
from datasets import load_dataset

ds = load_dataset("json", data_files=str(DATASET))
ds_train = ds['train'].select(range(len(ds['train']) // 7))

dataset = ds_train.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'error', 'labels'],
        num_rows: 77513
    })
    test: Dataset({
        features: ['sentence', 'error', 'labels'],
        num_rows: 19379
    })
})

In [202]:
label_list = [0, 1]

### Load RoBERTa model tokenizer

Load Czech RoBERTa model tokenizer from huggingface

In [203]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL, add_prefix_space=True)

### Examples how the data will be tokenized

In [204]:
example = dataset["train"][4]
example

{'sentence': ['Další',
  'větší',
  'skupina',
  'Čechů',
  'se',
  'do',
  'Paraguaye',
  'dostala',
  'mezi',
  'lety',
  '1950',
  'a',
  '1952',
  'z',
  'utečeneckých',
  'táborů',
  'v',
  'Německu',
  's',
  'pasem',
  'IRO',
  '.'],
 'error': ['Další',
  'větší',
  'skupina',
  'Čechů',
  'se',
  'do8',
  'Paraguaye',
  'dostala',
  'mei',
  'let',
  '1950',
  'a',
  '1952r',
  'z',
  'utečeneckých',
  'táborů',
  'wv',
  'Německu',
  's',
  'pasem',
  'IRO',
  '.'],
 'labels': [0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]}

In [205]:
tokenized_input = tokenizer(example["error"],  is_split_into_words=True)
tokenized_input

{'input_ids': [0, 5002, 549, 1344, 6302, 8, 15, 724, 14746, 32219, 7439, 1756, 2642, 280, 74, 16300, 6, 23053, 219, 13, 47372, 2852, 639, 19086, 2953, 307, 2087, 14, 42320, 285, 2395, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [206]:
tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

['[CLS]',
 'ĠDalÅ¡ÃŃ',
 'ĠvÄĽtÅ¡ÃŃ',
 'Ġskupina',
 'ĠÄĮechÅ¯',
 'Ġse',
 'Ġdo',
 '8',
 'ĠPara',
 'gua',
 'ye',
 'Ġdostala',
 'Ġme',
 'i',
 'Ġlet',
 'Ġ1950',
 'Ġa',
 'Ġ1952',
 'r',
 'Ġz',
 'ĠuteÄį',
 'ene',
 'ckÃ½ch',
 'ĠtÃ¡borÅ¯',
 'Ġw',
 'v',
 'ĠNÄĽmecku',
 'Ġs',
 'Ġpasem',
 'ĠI',
 'RO',
 'Ġ.',
 '[SEP]']

In [207]:
tokenized_input.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 5,
 6,
 6,
 6,
 7,
 8,
 8,
 9,
 10,
 11,
 12,
 12,
 13,
 14,
 14,
 14,
 15,
 16,
 16,
 17,
 18,
 19,
 20,
 20,
 21,
 None]

### Process dataset

Here we set the labels of all special tokens to -100 (the index that is ignored by PyTorch) and the labels of all other tokens to the label of the word they come from.

In [208]:
label_all_tokens = True

Define function that will preprocess our samples. We feed them to the tokenizer with the argument truncation=True (to truncate texts that are bigger than the maximum size allowed by the model) and is_split_into_words=True (as seen above). Then we align the labels with the token ids.

In [209]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["error"], is_split_into_words=True, truncation=True, padding="max_length", max_length=128)

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [210]:
tokenize_and_align_labels(dataset['train'][:5])

{'input_ids': [[0, 19173, 2179, 775, 74, 4, 2915, 5, 98, 2143, 11757, 701, 230, 20984, 130, 2294, 376, 1145, 9731, 545, 6720, 7105, 1214, 5243, 230, 5, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 5002, 1762, 10243, 512, 195, 6252, 1688, 6018, 51, 42384, 2275, 26766, 51605, 7183, 805, 1009, 942, 376, 981, 5238, 13, 80, 3162, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 72, 81, 282, 22912, 31, 569, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [211]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, num_proc=4)

Map (num_proc=4): 100%|██████████| 77513/77513 [00:02<00:00, 27039.67 examples/s]
Map (num_proc=4): 100%|██████████| 19379/19379 [00:00<00:00, 25056.34 examples/s]


### Fine-tuning the model

Now that our data is ready, we can download pretrained model and fine-tune it.

In [212]:
from transformers import RobertaForTokenClassification

# num labels should be 2 (valid and error)
model = RobertaForTokenClassification.from_pretrained(MODEL, num_labels=2)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ufal/robeczech-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We need to specify training arguments. It requires one folder name, which will be used to save the checkpoints of the model, others are optional.

In [213]:
from transformers import TrainingArguments

args = TrainingArguments(
    f"{MODEL}-finetuned-error-detection",
    evaluation_strategy = "epoch",
    optim="adamw_torch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
)

We will need a data collator that will batch our processed examples together while applying padding to make them all the same size (each pad will be padded to the length of its longest example). There is a data collator for this task in the Transformers library, that not only pads the inputs, but also the labels.

In [214]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

The last thing to define for our Trainer is how to compute the metrics from the predictions. Here we will load the `seqeval` metric (which is commonly used to evaluate results on the CONLL dataset) via the Datasets library.

In [215]:
from datasets import load_metric
metric = load_metric("seqeval")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [216]:
labels = [1, 1]
metric.compute(predictions=[labels], references=[[0, 1]])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.5}

So we will need to do a bit of post-processing on our predictions:

- select the predicted index (with the maximum logit) for each token
- convert it to its string label
- ignore everywhere we set a label of -100

In [217]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]  # noqa: E741
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (_, l) in zip(prediction, label) if l != -100]  # noqa: E741
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

We drop the precision/recall/f1 computed for each category and only focus on the overall precision/recall/f1/accuracy.

We just need to pass all of this along with our datasets to the `Trainer`.

In [218]:
from transformers import Trainer

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [219]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0586,0.049234,0.0,0.0,0.0,0.984451
2,0.039,0.050001,0.0,0.0,0.0,0.985711
3,0.0289,0.053651,0.0,0.0,0.0,0.985986


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


TrainOutput(global_step=14535, training_loss=0.045978135713236744, metrics={'train_runtime': 1139.504, 'train_samples_per_second': 204.07, 'train_steps_per_second': 12.756, 'total_flos': 1.5190421628658176e+16, 'train_loss': 0.045978135713236744, 'epoch': 3.0})

In [220]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'eval_loss': 0.053650856018066406,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.9859863742805869,
 'eval_runtime': 24.8299,
 'eval_samples_per_second': 780.47,
 'eval_steps_per_second': 48.812,
 'epoch': 3.0}

In [222]:
model.save_pretrained("./model/roberta-error-detection", from_pt=True)
tokenizer.save_pretrained("./model/roberta-error-detection", from_pt=True)

('./model/roberta-error-detection/tokenizer_config.json',
 './model/roberta-error-detection/special_tokens_map.json',
 './model/roberta-error-detection/vocab.json',
 './model/roberta-error-detection/merges.txt',
 './model/roberta-error-detection/added_tokens.json',
 './model/roberta-error-detection/tokenizer.json')