In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [4]:
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [10]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [14]:
def tokenizer_func(data):
    """Tokenize the sentence pair(s) held in ``data``.

    Args:
        data: Mapping with ``"sentence1"`` and ``"sentence2"`` entries; both are
            forwarded unchanged to the module-level ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns for the pair, called with ``truncation=True``.
    """
    first, second = data["sentence1"], data["sentence2"]
    return tokenizer(first, second, truncation=True)

In [22]:
# Benefits of the Dataset.map() method:
#   - The results of the function are cached, so re-executing this cell takes no time.
#   - It can apply multiprocessing to go faster than applying the function element by element.
#   - It does not load the whole dataset into memory; results are saved as they are processed.
#
# batched=True hands tokenizer_func many examples per call instead of one, which lets the
# tokenizer process whole lists of sentence pairs at once. The resulting columns are the same.
tokenized_dataset = raw_datasets.map(tokenizer_func, batched=True)

# Collate function for the dataloaders: pads the examples of each batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [17]:
# Note: after mapping the tokenizer function, the only features tokenized_dataset has that raw_datasets lacks are
# 'input_ids', 'token_type_ids' and 'attention_mask', which (together with the label) are what training needs.
# We can hence remove the other columns from the training data to lighten the load on RAM.

In [None]:
### Loading the data
"""
Load the dataset
Apply transformations
Use a dataloader to iterate over batches
"""

In [23]:
# Drop the raw text and index columns, and expose the target column under the name "labels".
columns_to_drop = ["sentence1", "sentence2", "idx"]
tokenized_dataset = tokenized_dataset.remove_columns(columns_to_drop).rename_columns({"label": "labels"})

# Ask the datasets to hand back their columns in "torch" format.
tokenized_dataset.set_format("torch")

In [24]:
tokenized_dataset["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [44]:
# Number of examples per batch; used by both the training and the validation dataloader.
BATCH_SIZE = 16

In [45]:
# Defining the dataloaders. Both batch BATCH_SIZE examples at a time and use data_collator
# to assemble each batch.
from torch.utils.data import DataLoader

# Training data is reshuffled every epoch.
train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)
# Evaluation metrics do not depend on example order, so the validation split is left
# unshuffled; this keeps evaluation runs deterministic.
eval_dataloader = DataLoader(
    tokenized_dataset["validation"], shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
)


In [46]:
# Sanity check on the first two training batches: the leading dimension of every tensor
# should equal BATCH_SIZE (16), and the sequence dimension is whatever length that batch
# was padded to. `batch` is left bound to the second batch after the loop.
count = 0  # stays 0 if the dataloader yields nothing
for count, batch in enumerate(train_dataloader, start=1):
    shapes = {name: tensor.shape for name, tensor in batch.items()}
    print(shapes)
    if count >= 2:
        break

{'labels': torch.Size([16]), 'input_ids': torch.Size([16, 72]), 'token_type_ids': torch.Size([16, 72]), 'attention_mask': torch.Size([16, 72])}
{'labels': torch.Size([16]), 'input_ids': torch.Size([16, 72]), 'token_type_ids': torch.Size([16, 72]), 'attention_mask': torch.Size([16, 72])}


In [None]:
## Prepare to train

In [31]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained weights named by `checkpoint` into a sequence-classification model
# configured for 2 labels. The classifier weights are not part of the checkpoint and are
# newly initialized (see the warning printed below), so the model must be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# To check that everything runs smoothly, pass one batch through the model. `batch` is left over
# from the shape-check loop above and holds BATCH_SIZE = 16 examples; the model has 2 labels,
# so we expect the logits to be a 16x2 tensor.
# Note: the model also returns a loss here because the batch includes a "labels" entry.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)


tensor(0.5708, grad_fn=<NllLossBackward0>) torch.Size([16, 2])


In [33]:
# The shape of the output is as intended; next we define an optimizer and a learning-rate scheduler.
# AdamW is taken from torch: the implementation that shipped with transformers was deprecated
# in favour of torch.optim.AdamW and is no longer available in recent transformers releases.
from torch.optim import AdamW

# weight_decay=0.0 keeps the default of the transformers implementation this replaces
# (torch's own default applies weight decay).
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)



In [48]:
# Learning-rate schedule: a linear decay from the optimizer's initial lr down to 0, with no warmup.
# Trainer uses 3 epochs by default, so the same number is used here.
from transformers import get_scheduler

num_epochs = 3
# The scheduler is stepped once per batch, so the run length is batches-per-epoch times epochs.
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
print(num_training_steps)

690


In [49]:
import math

# Since the data is batched, the dataloader's length is the number of batches, not examples:
# one batch per BATCH_SIZE examples plus a final partial batch, i.e. ceil(examples / BATCH_SIZE).
# len() on the split gives the row count directly, without reading the whole "label" column.
num_train_examples = len(raw_datasets["train"])
assert len(train_dataloader) == math.ceil(num_train_examples / BATCH_SIZE)

In [51]:
# The training loop
import torch

# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [52]:
# Move the model onto the selected device, then display which device that is.
model.to(device)
device

device(type='cpu')

In [53]:
# Plain PyTorch training loop: num_epochs passes over train_dataloader, with one optimizer
# step and one scheduler step per batch.
from tqdm.auto import tqdm # progress bar

progress_bar = tqdm(range(num_training_steps)) # one tick per batch: num_epochs * len(train_dataloader) in total

model.train() # put the model in training mode (model.eval() is used for evaluation later)
for epoch in range(num_epochs): # one epoch = one full pass over the training dataloader
    for batch in train_dataloader: # len(train_dataloader) batches per epoch
        batch = {k: v.to(device) for k, v in batch.items()} # move every tensor of the batch to the model's device
        outputs = model(**batch) # forward pass; the batch includes "labels", so a loss is returned
        loss = outputs.loss      # loss for this batch
        loss.backward()          # backpropagate to compute the gradients

        optimizer.step()       # update the weights from the gradients
        lr_scheduler.step()    # advance the learning-rate schedule by one step
        optimizer.zero_grad()  # reset the gradients so they do not accumulate into the next batch
        progress_bar.update(1) # advance the progress bar

100%|██████████| 690/690 [15:59<00:00,  1.01it/s]

In [55]:
### The evaluation loop
import evaluate

# Metric object for GLUE/MRPC; predictions are accumulated batch by batch and scored at the end.
metrics = evaluate.load("glue", "mrpc")

model.eval()
for batch in eval_dataloader:
    # Move the batch to the model's device before the forward pass.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():  # no gradients are needed for evaluation
        outputs = model(**batch)

    # The predicted class is the index of the largest logit along the last dimension.
    predictions = outputs.logits.argmax(dim=-1)
    metrics.add_batch(predictions=predictions, references=batch["labels"])
metrics.compute()

{'accuracy': 0.8676470588235294, 'f1': 0.9059233449477352}

In [None]:
# Using the accelerate library to enable distributed training on multiple GPUs or TPUs.

In [58]:
from accelerate import Accelerator

# Created with default settings; used afterwards to prepare the dataloaders, model and
# optimizer and to run the backward pass in the training loop.
accelerator = Accelerator()

In [59]:
# Start the accelerate run from a fresh copy of the pretrained checkpoint and a fresh optimizer.
# The existing `model` and `optimizer` were already trained for 3 epochs by the plain PyTorch
# loop above; preparing those would continue that training instead of repeating the experiment.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Let accelerate wrap the dataloaders, model and optimizer for the available hardware.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

In [60]:
# Same schedule as the plain PyTorch run, rebuilt around the optimizer returned by
# accelerator.prepare: linear decay to 0 over all training steps, no warmup.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Note: no manual device placement of the batch here; the accelerate library handles it for us.
        outputs = model(**batch)
        loss = outputs.loss
        # accelerator.backward(loss) takes the place of loss.backward() from the plain loop.
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 690/690 [1:59:04<00:00, 10.35s/it]
