In [1]:
!pip install --quiet numpy scipy scikit-learn lightning transformers datasets

In [2]:
import csv
import logging

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoConfig, AutoModelForSequenceClassification
from typing import Dict, List, Optional, Tuple

In [3]:
# Module-level logger used by the dataset class defined further down.
logger = logging.getLogger(__name__)

# Some of the training inputs have fields larger than the csv module's default field size limit.
# Raising the limit is a hackish workaround; the cleaner fix would be to cap the column size in the file.
# More info here: https://stackoverflow.com/questions/54042406/error-field-larger-than-field-limit-131072
# The call returns the previous limit, which is what the cell displays.
csv.field_size_limit(256 * 1024)

131072

In [4]:
# Checkpoint name handed to the tokenizer, config and model loaders below.
MODEL = "xlm-roberta-base"
# Tab-separated relations file loaded by the dataset class below.
DATASET = "../data/casimedicos/dev_relations.tsv"

# PyTorch Dataset for Sequence Classification

The following is a prototype of a dataset class for Sequence Classification. I am not fully convinced that the tokenizer should be part of it, but this way we do not have to store the whole padded dataset in memory at once (unlike the version shown in [Lightning's documentation](https://lightning.ai/docs/pytorch/stable/notebooks/lightning_examples/text-transformers.html)).

As a drawback, this version cannot really benefit from the **Fast Tokenizers**: they emit a warning that encoding the text first and padding afterwards with a collator function is slower than processing the whole batch with a single tokenizer call. However, with that approach we would have to tokenize and pad everything in advance, which can consume a lot of memory.

In [5]:
class SentenceClassificationDataset(Dataset):
    """
    Dataset of sentence pairs for sequence classification, tokenized one item at a time.

    The data is a sequence of rows ``(label, text, text_pair)``, given either directly via
    ``dataset`` or read from a delimited file at ``path_to_dataset``. The label comes first and
    may start with the ``__label__`` marker, which is removed (``__label__Support`` becomes
    ``Support``).

    Parameters
    ----------
    tokenizer_model_or_path:
        Name or path passed to ``AutoTokenizer.from_pretrained``.
    path_to_dataset:
        Path to a delimited text file with one row per example. The file is read as UTF-8.
        Ignored (with a warning) when ``dataset`` is also given.
    dataset:
        Already parsed rows ``(label, text, text_pair)``.
    labels:
        Mapping from label name to integer id. When omitted, it is built from the sorted
        distinct labels found in the data.
    delimiter, quotechar:
        Forwarded to ``csv.reader`` when reading from ``path_to_dataset``.

    Raises
    ------
    ValueError
        If neither ``path_to_dataset`` nor ``dataset`` is provided.
    KeyError
        If ``labels`` is provided but lacks a label that appears in the data.
    IndexError
        If a row has fewer than three columns.
    """

    # Marker that may precede the label in the first column of every row.
    LABEL_PREFIX = '__label__'

    def __init__(self,
                 tokenizer_model_or_path: str,
                 path_to_dataset: Optional[str] = None,
                 dataset: Optional[List[Tuple[str, str, str]]] = None,
                 labels: Optional[Dict[str, int]] = None,
                 delimiter: str = '\t',
                 quotechar: str = '"'):
        if path_to_dataset is not None and dataset is not None:
            logger.warning("Both path and dataset were provided. Ignoring the path, using the parsed dataset.")
            path_to_dataset = None
        elif path_to_dataset is None and dataset is None:
            raise ValueError("Provide either path to a file or a dataset as a list of tuples")

        if path_to_dataset is not None:
            # newline="" lets the csv module handle line endings itself (quoted fields may contain
            # newlines); the explicit encoding avoids depending on the platform's default locale.
            with open(path_to_dataset, "rt", encoding="utf-8", newline="") as fh:
                csv_reader = csv.reader(fh, delimiter=delimiter, quotechar=quotechar)
                dataset = list(csv_reader)

        # str.lstrip would treat '__label__' as a *set of characters* and eat the beginning of
        # labels such as 'entailment'; remove the marker as an exact prefix instead.
        target = [self._strip_label_prefix(d[0]) for d in dataset]

        self.labels = labels if labels is not None else {lbl: idx for idx, lbl in enumerate(sorted(set(target)))}
        # The keys match the keyword arguments of the tokenizer call in __getitem__.
        self.dataset = [
            {
                "text": d[1],
                "text_pair": d[2]
            }
            for d in dataset
        ]
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_or_path, use_fast=False)
        self.target = [self.labels[t] for t in target]

    @staticmethod
    def _strip_label_prefix(raw_label: str) -> str:
        """Return ``raw_label`` without one leading ``__label__`` marker, if it has one."""
        prefix = SentenceClassificationDataset.LABEL_PREFIX
        if raw_label.startswith(prefix):
            return raw_label[len(prefix):]
        return raw_label

    def __len__(self):
        """Return the number of examples."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """
        Tokenize the example at position ``idx`` (with truncation, no padding) and attach its label id.

        Returns the tokenizer output with an extra ``'label'`` entry.
        """
        data = self.dataset[idx]

        # WARNING: This does not work with a range of values (if idx is a range instead of a single value)
        tokenized_data = self.tokenizer(**data, truncation=True)
        tokenized_data['label'] = self.target[idx]

        return tokenized_data

As you can see, we can now pass the `SentenceClassificationDataset` class as is to the `DataLoader`, together with a `DataCollatorWithPadding`. The `DataLoader` will pad each batch and give us a batch that can be fed directly to the Transformer model.

In [6]:
# Each example is tokenized on access; the collator pads every batch when it is assembled.
dataset = SentenceClassificationDataset(tokenizer_model_or_path=MODEL, path_to_dataset=DATASET)
loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer=dataset.tokenizer),
)

# Pull the first batch to inspect what the model will receive.
batch_input = next(iter(loader))
batch_input

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[    0,    62, 46667,  ...,     1,     1,     1],
        [    0,    62, 46667,  ...,     1,     1,     1],
        [    0,    62, 46667,  ...,     1,     1,     1],
        ...,
        [    0,  4687,  1556,  ...,     1,     1,     1],
        [    0,  4687,  1556,  ...,     1,     1,     1],
        [    0,  4687,  1556,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 1, 2, 2, 2])}

There's no need for extra steps: the loss for this batch is already computed inside the model, so we can use it directly as the loss in the `LightningModule`'s `training_step`.

In [7]:
# Size the classification head from the dataset's label mapping and register both
# directions of that mapping (name -> id and id -> name) in the config.
config = AutoConfig.from_pretrained(
    MODEL,
    num_labels=len(dataset.labels),
    label2id=dataset.labels,
    id2label={label_id: label_name for label_name, label_id in dataset.labels.items()},
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, config=config)

# The batch holds a 'labels' entry, so the forward pass returns the loss along with the logits.
model(**batch_input)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SequenceClassifierOutput(loss=tensor(1.0986, grad_fn=<NllLossBackward0>), logits=tensor([[-0.1783,  0.1205, -0.0246],
        [-0.1729,  0.1264, -0.0276],
        [-0.1686,  0.1263, -0.0224],
        [-0.1696,  0.1244, -0.0245],
        [-0.1716,  0.1232, -0.0256],
        [-0.1682,  0.1239, -0.0219],
        [-0.1661,  0.1245, -0.0228],
        [-0.1679,  0.1249, -0.0238],
        [-0.1686,  0.1274, -0.0237],
        [-0.1711,  0.1278, -0.0234],
        [-0.1697,  0.1281, -0.0227],
        [-0.1713,  0.1248, -0.0297],
        [-0.1780,  0.1245, -0.0307],
        [-0.1743,  0.1194, -0.0293],
        [-0.1677,  0.1249, -0.0304],
        [-0.1732,  0.1279, -0.0274],
        [-0.1726,  0.1263, -0.0312],
        [-0.1787,  0.1183, -0.0262],
        [-0.1804,  0.1296, -0.0282],
        [-0.1767,  0.1230, -0.0294],
        [-0.1701,  0.1282, -0.0272],
        [-0.1727,  0.1273, -0.0281],
        [-0.1780,  0.1245, -0.0307],
        [-0.1645,  0.1210, -0.0294],
        [-0.1696,  0.1244, -0.0