Model Trainer for the new HuggingFace training pipeline

Written by Maksym and Artem

Last updated by Artem on Aug 15th  *(edit this line if you change something)*

**Required Parameters**

In [None]:
# Folder with the preprocessed input data; later cells read `train.json` and
# `dev.json` from it. Expected format is outlined in errorifier-tagger.
data_folder = "/content/drive/MyDrive/datasets/preprocessed/borshch4-grammar-07-08"

# Name of the model: used as the output sub-folder under
# /content/drive/MyDrive/model-directories/ and written into metadata.txt.
model_name = "chornobaivka-25"

# Internals

### Setup

In [None]:
!pip install transformers &> /dev/null
!pip install datasets &> /dev/null

In [None]:
# all the imports we will need
import gc
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback
from transformers import Trainer
from transformers import TrainingArguments

In [None]:
# model path from name
model_path = f'/content/drive/MyDrive/model-directories/{model_name}'

# Create the output folder together with any missing parent folders; this is a
# no-op when the folder already exists (e.g. when fine-tuning an existing model).
os.makedirs(model_path, exist_ok=True)

# Train metadata: model name, data source and the time this run was started.
train_started = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
message = (
    f"My name is {model_name}\n"
    f"data: {data_folder}\n"
    f"Train datetime: {train_started}\n"
    "\n"
)

# Append instead of overwrite, so that fine-tuning an existing model adds a new
# entry below the earlier ones.
with open(os.path.join(model_path, "metadata.txt"), 'a', encoding="utf-8") as metadata_file:
    metadata_file.write(message)

### Read data

In [None]:
def _load_split(split_name):
    """Read one data split from `data_folder`.

    Args:
        split_name: file stem of the split, e.g. "train" for `train.json`.

    Returns:
        Tuple `(raw_json, dataframe, dataset)`: the parsed JSON, the pandas
        DataFrame built from it with its columns named 'tokens' and 'labels',
        and the HuggingFace Dataset created from that DataFrame.

    Raises:
        ValueError: if the parsed data does not yield exactly two columns.
    """
    # Explicit UTF-8 so that reading does not depend on the locale's default encoding.
    with open(os.path.join(data_folder, f"{split_name}.json"), 'r', encoding="utf-8") as f:
        raw_json = json.load(f)

    # converting it to pandas dataframe first, then assigning the column names
    # NOTE(review): columns are named by position — assumes the first column
    # holds the tokens and the second the labels; confirm against errorifier-tagger.
    frame = pd.DataFrame(raw_json)
    frame.columns = ['tokens', 'labels']

    # converting the set in the required format
    return raw_json, frame, Dataset.from_pandas(frame)


# reading the input data
train_json, train_df, train_dataset = _load_split("train")
dev_json, dev_df, dev_dataset = _load_split("dev")

In [None]:
"Generate label vocab from the dataset"
# Collect every label that occurs in the training set.
all_labels = [label for sequence in train_df['labels'] for label in sequence]

# Sort the unique labels so that every run assigns the same id to the same
# label; the iteration order of a plain set of strings is not guaranteed to be
# the same from one interpreter session to the next.
label_list = sorted(set(all_labels))

# generate label_encoding_dict (label -> integer id) for use in tokenization
label_encoding_dict = {label: idx for idx, label in enumerate(label_list)}

# save label_encodings for later use in the model folder, one "<label> <id>" per line
lines = [f'{label} {idx}\n' for label, idx in label_encoding_dict.items()]
with open(f'{model_path}/label_encoding.txt', 'w', encoding='utf-8') as label_file:
    label_file.writelines(lines)

### Train preparation

Usable models
- https://huggingface.co/ukr-models/xlm-roberta-base-uk
- https://huggingface.co/xlm-roberta-base
- https://huggingface.co/youscan/ukr-roberta-base


In [None]:
"Initialize tokenizer"
# Load the tokenizer of the `youscan/ukr-roberta-base` checkpoint; the model
# weights are loaded from the same checkpoint name further down.
# NOTE(review): add_prefix_space=True is presumably required because the
# tokenizer is later called on pre-split words (is_split_into_words=True) —
# confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained("youscan/ukr-roberta-base", add_prefix_space=True)

Downloading tokenizer_config.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
"takes in Dataset hugging face object, tokenizes words into wordpieces, aligns lables with tokenized input"
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words into word pieces and align the labels with them.

    Relies on the module-level `tokenizer` and `label_encoding_dict`.

    Args:
        examples: batch with a "tokens" entry (one list of words per example)
            and a "labels" entry (one label per word of that example).
        label_all_tokens: when True (the default, and the behaviour that used
            to be hard-coded) every word piece receives the id of its word's
            label; when False only the first piece of each word is labelled
            and the remaining pieces receive -100.

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds one list of label ids per example, one id per word piece.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Pieces that belong to no word get -100; compute_metrics skips
                # every position carrying that value.
                label_ids.append(-100)
            elif word_labels[word_idx] == '0':
                # NOTE(review): the literal label '0' is mapped straight to id 0,
                # bypassing label_encoding_dict and the label_all_tokens switch —
                # confirm this special case is still wanted.
                label_ids.append(0)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word, or a continuation piece when all
                # pieces are to be labelled.
                label_ids.append(label_encoding_dict[word_labels[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply tokenize_and_align_labels to both splits in batched mode.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
dev_tokenized_datasets = dev_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/826 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

In [None]:
"initialize model"
# Load the pretrained checkpoint with a token-classification head that has one
# output per label. The label names are stored in the model config as well, so
# that a saved checkpoint carries its own id <-> label mapping.
model = AutoModelForTokenClassification.from_pretrained(
    "youscan/ukr-roberta-base",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)

Downloading pytorch_model.bin:   0%|          | 0.00/483M [00:00<?, ?B/s]

Some weights of the model checkpoint at youscan/ukr-roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at youscan/ukr-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably T

In [None]:
"Define metrics to be computed. Adapted from old eval"
def compute_metrics(p):
    """Compute token-level metrics for the Trainer. Adapted from old eval.

    `$KEEP` is treated as the negative class and every other label as a
    positive one. Each scored position falls into exactly one bucket:

    * TN: gold `$KEEP`, predicted `$KEEP`
    * FN: gold is not `$KEEP`, predicted `$KEEP`
    * TP: gold is not `$KEEP`, predicted exactly the gold label
    * FP: gold `$KEEP` but something else predicted ("extra error"), or gold
      is not `$KEEP` and a different non-`$KEEP` label predicted ("bad guess")

    Positions whose gold id is -100 are not scored. Label ids are turned into
    label names through the module-level `label_list`. The raw counts are
    printed on every call.

    Args:
        p: pair `(predictions, labels)`. `predictions` holds per-label scores
            indexed as [sentence, token, label] (the argmax is taken over
            axis 2); `labels` holds the gold label ids indexed as
            [sentence, token].

    Returns:
        dict with the keys "accuracy", "precision", "recall", "f1" and "f0.5".
        A value is the string 'n/a' when its denominator is empty (nothing
        scored, nothing predicted as an error, or no gold error); f1 and f0.5
        are 'n/a' when precision or recall is, and 0.0 when both are 0.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    TP = 0
    TN = 0
    FN = 0

    # types of FP
    # extra_error + bad_guess = FP
    extra_error = 0
    bad_guess = 0

    # Walk predictions and gold ids side by side, position by position, so
    # that every position with a real gold label is counted exactly once.
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue

            model_label = label_list[predicted_id]
            real_label = label_list[gold_id]

            if real_label == '$KEEP':
                if model_label == '$KEEP':
                    TN += 1
                else:
                    extra_error += 1
            elif model_label == '$KEEP':
                FN += 1
            elif model_label == real_label:
                TP += 1
            else:
                bad_guess += 1

    FP = extra_error + bad_guess
    total = FP + TP + FN + TN

    print(f'TP:{TP}')
    print(f'FP:{FP}')
    print(f'TN:{TN}')
    print(f'FN:{FN}')
    print(f'total:{total}')
    print()

    # Calculating metrics; 'n/a' marks a metric whose denominator is zero.
    accuracy = (TP + TN) / total if total != 0 else 'n/a'
    precision = TP / (TP + FP) if (TP + FP) != 0 else 'n/a'
    recall = TP / (TP + FN) if (TP + FN) != 0 else 'n/a'

    # F scores
    if recall == 'n/a' or precision == 'n/a':
        f1 = 'n/a'
        fhalf = 'n/a'
    elif precision + recall == 0:
        # Both are defined but there is no true positive at all: the harmonic
        # means would divide by zero, so report 0.0.
        f1 = 0.0
        fhalf = 0.0
    else:
        f1 = 2*(recall*precision)/(recall+precision)
        fhalf = (1.25*precision*recall)/(0.25*precision + recall)

    # The extra-error / bad-guess proportions are not reported for now; they
    # are meant to be included in the next iteration of compute_metrics
    # (together with label_data).
    return {"accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "f0.5": fhalf}

In [None]:
"initialize data collator"
# Data collator for token classification, built around the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# The early-stopping callback is intentionally not created here: the Train cell
# builds its own instance right before constructing the Trainer.

# Training arguments. Checkpoints go under `model_path`; evaluation and
# checkpointing both happen once per epoch and the best checkpoint is reloaded
# when training ends. A per-device batch of 16 with 4 gradient-accumulation
# steps gives 64 examples per optimizer step on one device.
args = TrainingArguments(
    model_path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=1e-5,
)

In [None]:
# cleaning the memory: force a garbage-collection pass before training starts
gc.collect()

0

### Train

In [None]:
# Trainer and EarlyStoppingCallback come from the imports cell at the top of
# the notebook.

# A fresh early-stopping callback for this run, with a patience of one evaluation.
callback = EarlyStoppingCallback(early_stopping_patience=1)

trainer = Trainer(
    model,
    args,
    callbacks=[callback],
    train_dataset=train_tokenized_datasets,
    eval_dataset=dev_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()

# NOTE(review): 'model' is a relative path, so the final model is written to
# the current working directory and not under `model_path` (where the epoch
# checkpoints, metadata.txt and label_encoding.txt live) — confirm that this
# location is intended.
trainer.save_model('model')

The following columns in the training set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 825422
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 128970


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F0.5
0,0.257,0.237846,0.922127,0.825756,0.895784,0.859346,0.838872
1,0.2069,0.215123,0.929813,0.83889,0.913859,0.874771,0.852883
2,0.1693,0.21002,0.933236,0.846581,0.918963,0.881288,0.860131
3,0.1366,0.212027,0.934827,0.849228,0.92229,0.884253,0.8629


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100000
  Batch size = 16


TP:547022
FP:115428
TN:1573395
FN:63641
total:2299486



Saving model checkpoint to /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-12897
Configuration saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-12897/config.json
Model weights saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-12897/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-12897/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-12897/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100000
  Batch size = 16


TP:563699
FP:108259
TN:1574393
FN:53135
total:2299486



Saving model checkpoint to /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-25794
Configuration saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-25794/config.json
Model weights saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-25794/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-25794/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-25794/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100000
  Batch size = 16


TP:569860
FP:103271
TN:1576103
FN:50252
total:2299486



Saving model checkpoint to /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-38691
Configuration saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-38691/config.json
Model weights saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-38691/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-38691/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-38691/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100000
  Batch size = 16


TP:572447
FP:101632
TN:1577174
FN:48233
total:2299486



Saving model checkpoint to /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-51588
Configuration saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-51588/config.json
Model weights saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-51588/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-51588/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-51588/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /content/drive/MyDrive/model-directories/chornobaivka-25/checkpoint-38691 (score: 0.21002022922039032).
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassif

Saving model checkpoint to model
Configuration saved in model/config.json


TP:569860
FP:103271
TN:1576103
FN:50252
total:2299486



Model weights saved in model/pytorch_model.bin
tokenizer config file saved in model/tokenizer_config.json
Special tokens file saved in model/special_tokens_map.json
