<a href="https://colab.research.google.com/github/sfarrukhm/huggingface-learning/blob/main/finetuning_without_Trainer_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name used for the tokenizer here and for the model later on.
checkpoint = "bert-base-uncased"

# MRPC task of the GLUE benchmark (train / validation / test splits).
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize `sentence1` and `sentence2` together as a pair, with truncation enabled."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# batched=True hands a batch of examples to each `tokenize_function` call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# No padding is requested from the tokenizer above; the collator is used
# as the `collate_fn` of the dataloaders and takes care of it per batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

#### Prepare data for training
Define dataloaders to iterate over batches

In [None]:
# Inspect the splits and the columns available after tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [None]:
# Post-process the tokenized datasets before training.
# The model only needs these columns:
# 'input_ids', 'token_type_ids', 'attention_mask', 'labels'
# so we drop the columns that are not required and rename 'label' to 'labels'.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['sentence1', 'sentence2', 'idx'])
    .rename_column("label", "labels")
)

# Finally, make the datasets return PyTorch tensors.
tokenized_datasets.set_format("torch")


In [None]:
# Now define the dataloaders
from torch.utils.data import DataLoader

# Training batches are drawn in a shuffled order; `data_collator` assembles each batch.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
# Shuffling brings nothing for evaluation, so the validation set keeps its order.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

# Grab a single training batch to sanity-check the tensor shapes.
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 78]),
 'token_type_ids': torch.Size([8, 78]),
 'attention_mask': torch.Size([8, 78])}

#### Prepare the objects for training



1. Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Sanity check: run the sample batch through the model. The batch contains
# `labels`, so the output carries a loss in addition to the logits.
outputs=model(**batch)
# Note: `outputs.hidden_states` is None for this call (see the output displayed below).

In [None]:
# Display the model output (loss and logits) for the sample batch.
outputs

SequenceClassifierOutput(loss=tensor(0.6148, grad_fn=<NllLossBackward0>), logits=tensor([[-0.0566,  0.3437],
        [-0.0636,  0.3337],
        [-0.0555,  0.3257],
        [-0.0491,  0.3323],
        [-0.0398,  0.3552],
        [-0.0600,  0.3411],
        [-0.0419,  0.3536],
        [-0.0659,  0.3297]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

All 🤗 Transformers models return the `loss` (along with the `logits` for each input sample) when `labels` are provided.

2. Define the optimizer:
We will use the AdamW optimizer, which is also the default optimizer for `Trainer`. It is the same as Adam, but with a twist for weight decay regularization.
3. Define the learning rate scheduler:
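By default, `Trainer` uses a learning rate that decays linearly over the course of training. To define the same schedule ourselves, we need the total number of training steps, i.e. the number of epochs multiplied by the number of training batches.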

In [None]:
# `transformers.AdamW` is deprecated and can no longer be imported from recent
# transformers releases, so use PyTorch's own implementation instead.
from torch.optim import AdamW
from transformers import get_scheduler

# weight_decay=0.0 keeps the default of the former `transformers.AdamW`
# (torch.optim.AdamW would otherwise apply its own non-zero default).
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Define the learning rate scheduler: linear schedule, no warmup steps.
num_epochs = 3
# One optimizer step per batch, so the total is epochs * batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)




4. Define the training loop

In [None]:
import torch
from tqdm.auto import tqdm

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# One progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch contains `labels`, so the output has a loss.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, parameter update, LR update, then reset the gradients.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

### The evaluation loop

In [None]:
import evaluate

# Metric for the MRPC task of GLUE.
metric = evaluate.load("glue", "mrpc")

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    # The predicted class is the index of the largest logit.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Accumulate predictions and references batch by batch.
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute the final metric over everything accumulated above.
metric.compute()
