In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW

### Training a sequence classifier for one batch

In [2]:
# Load the pretrained BERT tokenizer and a sequence-classification model from
# the same checkpoint. The classification head is newly initialised, which is
# why the "newly initialized" warning printed below is expected.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Two toy sentences that together form a miniature training batch.
sequences = [
    "Peace is what all seek, and war is for the weak",
    "Roses are red and sky is blue",
]

In [4]:
# Tokenize both sentences in one call: padding brings them to a common length
# and return_tensors="pt" produces PyTorch tensors the model can consume.
batch = tokenizer(
    sequences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [7]:
# Thanks to padding, both sentences share one length: (batch size, sequence length).
batch["input_ids"].shape

torch.Size([2, 14])

In [8]:
# Attach placeholder targets: both sentences are arbitrarily assigned class 1,
# so that the forward pass can compute a loss.
batch["labels"] = torch.tensor([1, 1])

In [9]:
# Use PyTorch's own AdamW: the `AdamW` class exported by transformers is
# deprecated in favour of it. PyTorch's default hyper-parameters apply
# (note that its default weight_decay is 0.01).
optimizer = torch.optim.AdamW(model.parameters())



In [10]:
# from_pretrained() hands back the model in evaluation mode; switch to
# training mode so dropout is active for the training step.
model.train()
# Because the batch carries "labels", the model output exposes the loss directly.
loss = model(**batch).loss

In [11]:
# Back-propagate the loss to populate the gradients of the model parameters.
loss.backward()

In [12]:
# Apply one parameter update from the gradients computed above.
optimizer.step()
# PyTorch accumulates gradients across backward() calls; clear them so that
# re-running the training cells does not reuse stale gradients.
optimizer.zero_grad()

## Using the MRPC Dataset

In [13]:
from datasets import load_dataset

In [14]:
# Fetch the MRPC configuration of the GLUE benchmark; the result is a
# DatasetDict with train, validation and test splits.
raw_datasets = load_dataset("glue", "mrpc")

Downloading readme: 100%|██████████| 35.3k/35.3k [00:00<00:00, 43.0MB/s]
Downloading data: 100%|██████████| 649k/649k [00:00<00:00, 2.43MB/s]
Downloading data: 100%|██████████| 75.7k/75.7k [00:00<00:00, 182kB/s]
Downloading data: 100%|██████████| 308k/308k [00:00<00:00, 1.59MB/s]
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 516265.34 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 293127.10 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 855626.11 examples/s]


In [15]:
# Inspect the available splits, their columns and their row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [16]:
# Look at one raw training example: a sentence pair, its label and its index.
raw_datasets["train"][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [17]:
# Column types; 'label' is a ClassLabel where 0 = not_equivalent and 1 = equivalent.
raw_datasets["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [18]:
# Each split is a datasets.arrow_dataset.Dataset object.
type(raw_datasets["train"])

datasets.arrow_dataset.Dataset

In [21]:
# Tokenize each sentence column of the train split on its own. This yields two
# independent encodings, not sentence pairs (pairs are built further below).
tokenized_sentance_1, tokenized_sentance_2 = (
    tokenizer(raw_datasets["train"][column])
    for column in ("sentence1", "sentence2")
)

In [23]:
# Passing two strings encodes them together as one sentence pair:
# [CLS] first [SEP] second [SEP], with token_type_ids marking each segment.
inputs = tokenizer(
    "This is the first sentance",
    "This is the second sentance",
)

{'input_ids': [101, 2023, 2003, 1996, 2034, 2741, 6651, 102, 2023, 2003, 1996, 2117, 2741, 6651, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
# Pretty-print the encoding so each field is easier to read.
from pprint import pprint
pprint(inputs)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
               2023,
               2003,
               1996,
               2034,
               2741,
               6651,
               102,
               2023,
               2003,
               1996,
               2117,
               2741,
               6651,
               102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]}


In [27]:
# Map the ids back to tokens to see where [CLS] and [SEP] were inserted.
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))

['[CLS]', 'this', 'is', 'the', 'first', 'sent', '##ance', '[SEP]', 'this', 'is', 'the', 'second', 'sent', '##ance', '[SEP]']


In [29]:
## MRPC is a paraphrase task: the goal is to fine-tune a model to decide whether
## two sentences are equivalent, so every example is a (sentence1, sentence2) pair.

# First attempt: tokenize sentence1 and sentence2 of the whole train split in one call.
# This works, but it is sub-optimal: the entire tokenized split (padded to a common
# length) is held in RAM, whereas the Datasets library keeps its data in Apache Arrow
# files stored on disk, so only the samples you ask for are loaded into memory.
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    truncation=True,
    padding=True
)

In [31]:
def tokenize_func(ex):
    """Tokenize the sentence pair(s) contained in ``ex``.

    Args:
        ex: A mapping with "sentence1" and "sentence2" entries. With
            ``Dataset.map(..., batched=True)`` each entry holds a list of
            sentences rather than a single string.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the paired
        sentences with truncation enabled. No padding is requested here.
    """
    first, second = ex["sentence1"], ex["sentence2"]
    return tokenizer(first, second, truncation=True)

In [32]:
# Apply tokenize_func to every split. With batched=True the function receives
# many examples per call instead of one at a time.
tokenized_datasets = raw_datasets.map(tokenize_func, batched=True)

Map: 100%|██████████| 3668/3668 [00:00<00:00, 18082.68 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 19896.48 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 30846.65 examples/s]


In [37]:
# Dynamic padding: rather than padding the entire dataset at once, every batch
# is padded on its own. The function responsible for putting the samples of a
# batch together is called the collate function.
import os

from transformers import DataCollatorWithPadding

# NOTE(review): presumably set to silence the tokenizers fork/parallelism
# warning; confirm it is still needed at this point in the notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "False"

# Hugging Face's collator applies the correct amount of padding for each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [36]:
# Take the first eight (still unpadded) training examples, dropping the index
# and raw-text columns, then show the token length of each example.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ["idx", "sentence1", "sentence2"]
}
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [38]:
# Let the collator pad the eight samples to the longest one among them and
# return tensors; display the resulting shape of every field.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}