# Preprocessing Data for Fine-Tuning

In [1]:
import torch
# `transformers.AdamW` was deprecated and later removed from the library;
# the PyTorch implementation is the supported replacement.
# NOTE(review): torch's default hyperparameters (e.g. weight decay, eps) may
# differ from the removed class -- pass them explicitly if exact parity matters.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before: load the tokenizer and a sequence-classification model from
# one checkpoint, then tokenize two example sentences into a padded batch.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new: attach one label per sequence so the model output carries a loss.
batch["labels"] = torch.tensor([1, 1])

# A single manual optimization step: forward pass, backward pass, weight update.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

ModuleNotFoundError: No module named 'torch'

In [7]:
# List the fields currently stored in `batch`.
batch.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [31]:
# Display the loss returned by the model's forward pass.
loss

tensor(0.7715, grad_fn=<NllLossBackward0>)

In [4]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset and display the
# resulting collection of splits.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 274159.01 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 135869.47 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 422171.46 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [35]:
# Keep a handle on the training split and look at one example from it.
raw_train_dataset = raw_datasets["train"]
# 87 is a position within the split; the example's own `idx` field can differ.
raw_train_dataset[87]

{'sentence1': 'Tuition at four-year private colleges averaged $ 19,710 this year , up 6 percent from 2002 .',
 'sentence2': 'For the current academic year , tuition at public colleges averaged $ 4,694 , up almost $ 600 from the year before .',
 'label': 1,
 'idx': 100}

In [33]:
# Show the type of every column; the `label` entry lists the class names
# that the integer labels stand for.
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [36]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize each sentence column of the training split on its own, so the two
# halves of every pair are encoded independently of each other.
tokenized_sentences_1, tokenized_sentences_2 = (
    tokenizer(raw_datasets["train"][column])
    for column in ("sentence1", "sentence2")
)

In [39]:
# Raw text of `sentence1` for the first training example.
raw_datasets["train"]["sentence1"][0]

'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .'

In [42]:
# Raw text of `sentence2` for the first training example.
raw_datasets["train"]["sentence2"][0]

'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'

In [47]:
# Attention mask of the first tokenized `sentence1`; no padding was requested
# when these sentences were tokenized, so every position is a real token.
tokenized_sentences_1['attention_mask'][0]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [48]:
# Encode two sentences together as one paired input and display the encoding.
inputs = tokenizer(text="Paige likes Gracie.", text_pair="and Joe likes Nora.")
inputs

{'input_ids': [101, 17031, 7777, 19005, 1012, 102, 1998, 3533, 7777, 12306, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [50]:
# Segment ids of the paired input: 0 marks the first sentence (with [CLS] and
# its closing [SEP]), 1 marks the second sentence and its closing [SEP].
inputs['token_type_ids']

[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

In [49]:
# Map the ids back to tokens to see how the sentence pair was assembled.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'paige',
 'likes',
 'gracie',
 '.',
 '[SEP]',
 'and',
 'joe',
 'likes',
 'nora',
 '.',
 '[SEP]']

In [51]:
# Tokenize the whole training split in a single call: each `sentence1` entry
# is paired with the matching `sentence2` entry, with truncation and padding
# both switched on.
tokenized_dataset = tokenizer(
    text=raw_datasets["train"]["sentence1"],
    text_pair=raw_datasets["train"]["sentence2"],
    truncation=True,
    padding=True,
)

In [53]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` entries of `example` as a pair.

    `example` is a mapping with "sentence1" and "sentence2" keys; the values
    are handed to the notebook-level `tokenizer` unchanged, so a single
    example or a batch of examples both work. Truncation is enabled and no
    padding is requested here.

    Returns whatever `tokenizer` returns for the pair.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

In [54]:
# Run `tokenize_function` over every split; with batched=True the function
# receives batches of examples instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map: 100%|██████████| 3668/3668 [00:00<00:00, 7801.41 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 17765.46 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 20924.03 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [67]:
# Each sentence pair is encoded into a single `input_ids` sequence, so the
# lengths that padding has to equalize are those of *different examples* --
# not the character counts of the two raw sentences inside one example.
len(tokenized_datasets['train'][0]['input_ids']) == len(tokenized_datasets['train'][1]['input_ids'])

False

In [68]:
# Pad the tokenized sequences so that all tensors in a batch have the same length

In [69]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is applied to a group of
# samples further down to turn them into one padded batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [70]:
# Take the first eight training examples, drop the bookkeeping and raw-text
# columns, and report how many token ids each remaining example has.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

Notice that the `input_ids` all have different lengths. We apply padding at the *end* of preprocessing, when each batch is assembled, so that every batch becomes same-length tensors without spending memory on padding any earlier than necessary.

> We have deliberately postponed the padding, to only apply it as necessary on each batch and avoid having over-long inputs with a lot of padding. This will speed up training by quite a bit, but note that if you’re training on a TPU it can cause problems — TPUs prefer fixed shapes, even when that requires extra padding.
>
> — [Hugging Face NLP Course, chapter 3, section 2](https://huggingface.co/learn/nlp-course/chapter3/2)

In [72]:
# Collate the selected samples into one batch and show the shape of every
# tensor it contains.
batch = data_collator(samples)
{field: tensor.shape for field, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

No surprise: the samples vary in length from 32 to 67, and dynamic padding has padded every sample in this batch to 67, the maximum length inside the batch. Without dynamic padding, all of the samples would have to be padded to the maximum length in the whole dataset, or the maximum length the model can accept. The shapes above double-check that our `data_collator` is dynamically padding the batch properly.

# Fine tuning with Trainer API from Hugging Face