In [1]:
!pip install --quiet numpy scipy scikit-learn lightning transformers datasets

In [2]:
import csv
import logging

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoConfig, AutoModelForSequenceClassification
from typing import Dict, List, Optional, Tuple

In [3]:
# Module-level logger, named after the current module.
logger = logging.getLogger(__name__)

# Some of the training inputs have fields larger than the csv module's field
# size limit, so the limit is set to 256 << 10 (262144) here.
# This is a stop-gap: the proper fix is to limit the column size in the file itself.
# More info here: https://stackoverflow.com/questions/54042406/error-field-larger-than-field-limit-131072
csv.field_size_limit(256<<10)

131072

In [4]:
# Path to the tab-separated relations file loaded by the dataset class below.
DATASET = '../data/casimedicos/dev_relations.tsv'
# Pretrained checkpoint name, used for both the tokenizer and the model.
MODEL = 'xlm-roberta-base'

# PyTorch Dataset for Sequence Classification

The following is a prototype of a dataset class for Sequence Classification. I am not quite convinced that the tokenizer should be part of it, but this way we do not have to store the whole padded data at once (unlike the version shown in [Lightning's documentation](https://lightning.ai/docs/pytorch/stable/notebooks/lightning_examples/text-transformers.html)).

As a drawback, in this version we cannot really benefit from the **Fast Tokenizers**: the library warns that tokenizing each example with a Fast Tokenizer and padding afterwards with a collator function is slower than processing the whole batch with a single tokenizer call. However, that faster approach would require tokenizing and padding everything in advance, which can be memory-consuming.

In [5]:
class SentenceClassificationDataset(Dataset):
    """
    Map-style dataset of labelled text pairs for sequence classification.

    Every record has three fields, in this order: ``label``, ``text`` and
    ``text_pair``. The label may carry a leading ``__label__`` marker, which
    is removed before the label vocabulary is built. Records come either from
    a delimited file (``path_to_dataset``) or from an in-memory list
    (``dataset``).

    Items are tokenized lazily in ``__getitem__`` and returned unpadded, so
    the batch has to be padded by the collate function (e.g.
    ``DataCollatorWithPadding``).

    Parameters
    ----------
    tokenizer_model_or_path:
        Name or path handed to ``AutoTokenizer.from_pretrained``.
    path_to_dataset:
        Delimited text file to read with the ``csv`` module. Ignored (with a
        warning) when ``dataset`` is also given.
    dataset:
        Already-parsed records as ``(label, text, text_pair)`` tuples.
    labels:
        Mapping from label name to integer id. When omitted, it is built from
        the sorted set of labels found in the data.
    delimiter, quotechar:
        Forwarded to ``csv.reader`` when reading ``path_to_dataset``.

    Raises
    ------
    ValueError
        If neither ``path_to_dataset`` nor ``dataset`` is provided.
    KeyError
        If a record's label is missing from the provided ``labels`` mapping.
    """
    def __init__(self,
                 tokenizer_model_or_path: str,
                 path_to_dataset: Optional[str] = None,
                 dataset: Optional[List[Tuple[str, str, str]]] = None,
                 labels: Optional[Dict[str, int]] = None,
                 delimiter: str = '\t',
                 quotechar: str = '"'):
        if path_to_dataset is not None and dataset is not None:
            logger.warning("Both path and dataset were provided. Ignoring the path, using the parsed dataset.")
            path_to_dataset = None
        elif path_to_dataset is None and dataset is None:
            raise ValueError("Provide either path to a file or a dataset as a list of tuples")

        if path_to_dataset is not None:
            # newline='' lets the csv module do its own newline handling, which
            # is needed for newlines embedded inside quoted fields.
            with open(path_to_dataset, "rt", newline='') as fh:
                csv_reader = csv.reader(fh, delimiter=delimiter, quotechar=quotechar)
                # csv.reader yields [] for blank lines (e.g. a trailing empty
                # line); skip them instead of failing on row[0] below.
                dataset = [row for row in csv_reader if row]

        # Only an actual '__label__' prefix is removed, so labels that do not
        # carry the marker are left untouched.
        target = [self._strip_label_prefix(d[0]) for d in dataset]

        # TODO: This is a problem if we have 3 different splits and not all labels appear in all splits
        self.labels = labels if labels is not None else {lbl: idx for idx, lbl in enumerate(sorted(set(target)))}
        self.dataset = [
            {
                "text": d[1],
                "text_pair": d[2]
            }
            for d in dataset
        ]
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_or_path, use_fast=False)
        self.target = [self.labels[t] for t in target]

    @staticmethod
    def _strip_label_prefix(raw_label: str, prefix: str = '__label__') -> str:
        """Return ``raw_label`` without a leading ``prefix`` (unchanged if absent).

        ``str.lstrip(prefix)`` must not be used for this: it treats its
        argument as a set of characters and would also eat the beginning of
        labels such as ``below`` or ``label_a``.
        """
        if raw_label.startswith(prefix):
            return raw_label[len(prefix):]
        return raw_label

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` (with truncation) and attach its integer label under ``'label'``."""
        data = self.dataset[idx]

        # WARNING: This does not work with a range of values (if idx is a range instead of a single value)
        tokenized_data = self.tokenizer(**data, truncation=True)
        tokenized_data['label'] = self.target[idx]

        return tokenized_data

As you can see, we can now use the `SentenceClassificationDataset` class as is in the `DataLoader`, along with a `DataCollatorWithPadding`: the `DataLoader` will pad each batch and give us a batch that can be fed directly to the Transformer model.

In [6]:
# Build the dataset from the file at DATASET, using MODEL's tokenizer.
dataset = SentenceClassificationDataset(MODEL, DATASET)
loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=False,
    # The collator pads each batch using the dataset's own tokenizer.
    collate_fn=DataCollatorWithPadding(dataset.tokenizer)
)
# Take the first batch and show it as the cell output.
batch_input = next(iter(loader))
batch_input

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[    0,    62, 46667,  ...,     1,     1,     1],
        [    0,    62, 46667,  ...,     1,     1,     1],
        [    0,    62, 46667,  ...,     1,     1,     1],
        ...,
        [    0,  4687,  1556,  ...,     1,     1,     1],
        [    0,  4687,  1556,  ...,     1,     1,     1],
        [    0,  4687,  1556,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 1, 2, 2, 2])}

No extra steps are needed: the loss for this batch of data is already computed by the model, so we can use it directly as the loss in the `LightningModule`'s `training_step`.

In [7]:
# Mirror the dataset's label vocabulary in the model configuration.
config = AutoConfig.from_pretrained(
    MODEL,
    num_labels=len(dataset.labels),
    label2id=dataset.labels,
    id2label={index: name for name, index in dataset.labels.items()},
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, config=config)
# Run the model on the batch; the output shown below includes the loss and the logits.
model(**batch_input)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SequenceClassifierOutput(loss=tensor(1.1785, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0862,  0.0999, -0.0301],
        [ 0.0893,  0.1055, -0.0283],
        [ 0.0911,  0.1032, -0.0371],
        [ 0.0872,  0.1015, -0.0356],
        [ 0.0882,  0.1011, -0.0362],
        [ 0.0905,  0.1023, -0.0377],
        [ 0.0887,  0.1025, -0.0408],
        [ 0.0889,  0.1024, -0.0390],
        [ 0.0859,  0.1052, -0.0344],
        [ 0.0893,  0.1045, -0.0368],
        [ 0.0848,  0.1048, -0.0362],
        [ 0.0841,  0.1062, -0.0274],
        [ 0.0832,  0.1102, -0.0211],
        [ 0.0919,  0.1011, -0.0368],
        [ 0.0818,  0.1074, -0.0355],
        [ 0.0850,  0.1069, -0.0297],
        [ 0.0847,  0.1063, -0.0300],
        [ 0.0904,  0.1054, -0.0337],
        [ 0.0900,  0.0977, -0.0301],
        [ 0.0921,  0.0985, -0.0345],
        [ 0.0914,  0.1070, -0.0391],
        [ 0.0864,  0.1073, -0.0318],
        [ 0.0832,  0.1102, -0.0211],
        [ 0.0883,  0.1073, -0.0415],
        [ 0.0872,  0.1015, -0.0

# PyTorch Dataset for Token Classification

In [1]:
import logging

from itertools import chain
from operator import itemgetter
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoConfig, AutoModelForTokenClassification
from typing import Dict, List, Optional, Tuple

In [2]:
# Module-level logger, named after the current module.
logger = logging.getLogger(__name__)

In [3]:
# Path to the CoNLL-style file loaded by the dataset class below.
DATASET = '../data/casimedicos/dev_revisited.conll'
# Pretrained checkpoint name, used for both the tokenizer and the model.
MODEL = 'xlm-roberta-base'

In [4]:
class SentenceClassificationDataset(Dataset):
    """
    Map-style dataset for token classification built from CoNLL-style data.

    NOTE: despite its name, this class labels individual tokens rather than
    whole sentences; the name is kept so existing callers keep working.

    When reading from a file, each non-blank line describes one token as
    ``delimiter``-separated columns; the token text is taken from column
    ``token_position`` and its label from column ``label_position``. A line
    with fewer than two columns (e.g. a blank line) ends the current sentence.

    Items are tokenized lazily in ``__getitem__``: only the first sub-token of
    every word receives the word's label id, and all other positions (special
    tokens and remaining sub-tokens) receive ``-100``.

    Parameters
    ----------
    tokenizer_model_or_path:
        Name or path handed to ``AutoTokenizer.from_pretrained``.
    path_to_dataset:
        CoNLL-style file to parse. Ignored (with a warning) when ``dataset``
        is also given.
    dataset:
        Already-parsed sentences: a list of dicts, each with a ``"tokens"``
        list and a parallel ``"labels"`` list of label names.
    labels:
        Mapping from label name to integer id. When omitted, it is built from
        the sorted set of labels found in the data.
    delimiter, token_position, label_position:
        Column layout of the file; only used with ``path_to_dataset``.

    Raises
    ------
    ValueError
        If neither ``path_to_dataset`` nor ``dataset`` is provided.
    KeyError
        If a token's label is missing from the provided ``labels`` mapping.
    """
    def __init__(self,
                 tokenizer_model_or_path: str,
                 path_to_dataset: Optional[str] = None,
                 dataset: Optional[List[Dict[str, List[str]]]] = None,
                 labels: Optional[Dict[str, int]] = None,
                 delimiter: str = '\t',
                 token_position: int = 1,
                 label_position: int = 4):
        if path_to_dataset is not None and dataset is not None:
            logger.warning("Both path and dataset were provided. Ignoring the path, using the parsed dataset.")
            path_to_dataset = None
        elif path_to_dataset is None and dataset is None:
            raise ValueError("Provide either path to a file or a dataset as a list of tuples")

        if path_to_dataset is not None:
            sentences = self._load_conll_sentences(path_to_dataset, delimiter, token_position, label_position)
        else:
            # An in-memory dataset is used as-is; without this branch
            # `sentences` would be undefined below.
            sentences = dataset

        # TODO: What happens if not all the labels are present? This might be better to do outside here
        if labels is None:
            self.labels = {lbl: idx for idx, lbl in enumerate(sorted(set(chain(*map(itemgetter('labels'), sentences)))))}
        else:
            self.labels = labels

        self.dataset = [
            {
                "tokens": sentence['tokens'],
                "labels": [self.labels[l] for l in sentence['labels']]
            } for sentence in sentences
        ]
        # A fast tokenizer is required: `_tokenize_and_align_labels` relies on
        # `word_ids()`, which is only available on encodings produced by fast
        # tokenizers.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_or_path, use_fast=True)

    def _load_conll_sentences(self, path_to_dataset, delimiter='\t', token_position=1, label_position=4):
        """Parse a CoNLL-style file into a list of ``{"tokens": [...], "labels": [...]}`` dicts."""
        sentences = []
        sentence_tokens = []
        sentence_labels = []

        with open(path_to_dataset, 'rt') as fh:
            for line in fh:
                fields = line.strip().split(delimiter)

                if len(fields) < 2:
                    # End of a sentence. Consecutive blank lines (e.g. after a
                    # paragraph change) leave nothing to flush and are skipped.
                    if sentence_tokens:
                        sentences.append({
                            "tokens": sentence_tokens,
                            "labels": sentence_labels
                        })
                        sentence_tokens = []
                        sentence_labels = []
                    continue

                # Read both columns before appending so the two lists can
                # never get out of sync if a column is missing.
                token, label = fields[token_position], fields[label_position]
                sentence_tokens.append(token)
                sentence_labels.append(label)

        # Flush the last sentence when the file does not end with a blank line.
        if sentence_tokens:
            sentences.append({
                "tokens": sentence_tokens,
                "labels": sentence_labels
            })

        return sentences

    def _tokenize_and_align_labels(self, sentence):
        """Tokenize a pre-split sentence and build one label per produced sub-token."""
        tokenized_sentence = self.tokenizer(sentence["tokens"], truncation=True, is_split_into_words=True)

        sentence_labels = []
        word_ids = tokenized_sentence.word_ids()
        previous_wid = None
        for wid in word_ids:
            if wid is None:
                # Assign the -100 label to the special tokens (e.g. [CLS], [SEP], etc.)
                # Why -100? Because PyTorch's CrossEntropy loss ignores it.
                sentence_labels.append(-100)
            elif wid != previous_wid:
                # Only label the first token of a given word
                # This is according to https://huggingface.co/docs/transformers/tasks/token_classification#train
                # Maybe a better way would be to replicate the label? It might be good that this is configurable
                sentence_labels.append(sentence["labels"][wid])
            else:
                # The remaining sub-tokens of a word don't get a valid label.
                # Again, this perhaps should be configurable.
                sentence_labels.append(-100)
            previous_wid = wid

        tokenized_sentence['labels'] = sentence_labels

        assert len(tokenized_sentence['input_ids']) == len(tokenized_sentence['labels'])

        return tokenized_sentence

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenized sentence ``idx`` with its aligned labels."""
        return self._tokenize_and_align_labels(self.dataset[idx])

In [5]:
# Parse the CoNLL file at DATASET; since no `labels` mapping is passed, the
# label vocabulary is built from the labels found in the file.
dataset = SentenceClassificationDataset(MODEL, DATASET)

In [6]:
loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=False,
    # The collator pads each batch using the dataset's own tokenizer.
    collate_fn=DataCollatorForTokenClassification(dataset.tokenizer)
)
# Take the first batch and show it as the cell output.
batch_input = next(iter(loader))
batch_input

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[    0, 21389,   294,  ...,     1,     1,     1],
        [    0,  4865,    83,  ...,     1,     1,     1],
        [    0,   116,    20,  ...,     1,     1,     1],
        ...,
        [    0,    20,   984,  ...,     1,     1,     1],
        [    0, 21389,   294,  ...,     1,     1,     1],
        [    0,  4865,  9060,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,   10, -100,  ..., -100, -100, -100],
        [-100,   10,   10,  ..., -100, -100, -100],
        [-100,   10,   10,  ..., -100, -100, -100],
        ...,
        [-100,   10,    0,  ..., -100, -100, -100],
        [-100,   10, -100,  ..., -100, -100, -100],
        [-100,   10,   10,  ..., -100, -100, -100]])}

In [7]:
# Mirror the dataset's label vocabulary in the model configuration.
config = AutoConfig.from_pretrained(
    MODEL,
    num_labels=len(dataset.labels),
    label2id=dataset.labels,
    id2label={index: name for name, index in dataset.labels.items()},
)
model = AutoModelForTokenClassification.from_pretrained(MODEL, config=config)
# Run the model on the batch; the output shown below includes the loss and the logits.
model(**batch_input)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TokenClassifierOutput(loss=tensor(2.0128, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.3063, -0.2447, -0.0432,  ..., -0.2811,  0.1820, -0.1890],
         [ 0.2146, -0.5982,  0.0442,  ..., -0.3867,  0.2949, -0.2873],
         [ 0.2043, -0.6358,  0.0644,  ..., -0.3147,  0.2378, -0.4256],
         ...,
         [ 0.3311, -0.2030, -0.0354,  ..., -0.3211,  0.1602, -0.1613],
         [ 0.3311, -0.2030, -0.0354,  ..., -0.3211,  0.1602, -0.1613],
         [ 0.3311, -0.2030, -0.0354,  ..., -0.3211,  0.1602, -0.1613]],

        [[ 0.3273, -0.0974,  0.0015,  ..., -0.2431,  0.1527, -0.1260],
         [ 0.2783, -0.5119,  0.0414,  ..., -0.4060,  0.2896, -0.2661],
         [ 0.2476, -0.5478, -0.0046,  ..., -0.4702,  0.2320, -0.3108],
         ...,
         [ 0.2645, -0.5441,  0.0147,  ..., -0.4056,  0.2817, -0.4508],
         [ 0.2645, -0.5441,  0.0147,  ..., -0.4056,  0.2817, -0.4508],
         [ 0.2645, -0.5441,  0.0147,  ..., -0.4056,  0.2817, -0.4508]],

        [[ 0.2776, -0.0603,  0.0315,  