<a href="https://colab.research.google.com/github/saverin0/llms_workshops_files/blob/main/oxford_llm_workshop_3_Fine_tune_pretrained_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers==4.57.0
!pip install Pillow==11.1.0
!pip install -U sentence-transformers==5.1.1
!pip install datasets==3.2.0
!pip install sentencepiece==0.2.1

In [2]:
from datasets import load_dataset
import transformers
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import numpy as np
import typing as tp

In [3]:
imdb = load_dataset("imdb", split="test")

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
imdb_df = imdb.to_pandas()

In [5]:
imdb_df.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [6]:
imdb_sample = imdb_df.sample(n=1000, random_state=2023)

In [7]:
imdb_sample["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,529
0,471


## 3. Using pretrained model as is.

In [8]:
model_name = "siebert/sentiment-roberta-large-english"

model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
config = transformers.AutoConfig.from_pretrained(model_name)

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

### 3.1 Dataset and Dataloader creation

In [9]:
class ExampleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [10]:
example_dataset = ExampleDataset(
    texts=imdb_sample["text"].tolist(),
    labels=imdb_sample["label"].tolist(),
    tokenizer=tokenizer,
    max_length=512,
)

In [11]:
dataloader = DataLoader(example_dataset, batch_size=16, shuffle=False)

### 3.2 Evaluation function.

In [12]:
def evaluate(model, dataloader, device="cuda"):
    model.eval()
    model.to(device)

    valid_preds, valid_labels = [], []  # init arrays to fill labels and preds

    for batch in dataloader:  # iterate all data points
        b_input_ids = batch["input_ids"].to(device)  # extract inputd and move to GPU
        b_input_mask = batch["attention_mask"].to(device)
        b_labels = batch["label"].to(device)  # extract labels and move to GPU

        with torch.no_grad():  # optimization low level detail
            logits = model(
                input_ids=b_input_ids, attention_mask=b_input_mask
            )  # apply model and get model predictions as logits

        logits = logits[0].detach().cpu().numpy()  # convert preds to simple array
        label_ids = b_labels.to("cpu").numpy()  # same as above

        batch_preds = np.argmax(
            logits, axis=1
        )  # take label with max proba, this is our prediction
        batch_labels = np.concatenate(label_ids.reshape(-1, 1))  # techincal detauls
        valid_preds.extend(batch_preds)  # fill our general list of preds
        valid_labels.extend(batch_labels)  # same as above but with labels

    return valid_labels, valid_preds  # return list of preds and ground truth labels

In [13]:
valid_labels, valid_preds = evaluate(model, dataloader)

In [16]:
valid_labels = np.array(valid_labels, dtype=int)

print(valid_labels)

[0 0 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 1 0 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0
 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 0 1 0 1 0
 1 0 1 1 1 1 0 1 1 0 1 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 1 1 1 0 1 0 1 1 1 0
 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 1 1 1 0 1 0 1 0 0
 1 1 0 1 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 1 0 0 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0
 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 1 0 0
 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 1 0 1 0 0
 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 0 1 0 1 0 0 1 1 0 1
 1 0 1 1 0 1 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 0 1
 1 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1
 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1
 0 1 1 0 0 0 0 0 0 0 1 1 

In [17]:
valid_preds = np.array(valid_preds, dtype=int)

print(valid_preds)

[0 0 1 0 1 1 0 1 1 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 1 1 1 0 1 0 1 1 1 0 1 0 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0
 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 0 1 1 0 1 0 1 0 0 1
 0 1 1 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 0 1 0 1 0
 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 1 1 1 0 1 0 1 1 1 0
 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 0 0
 1 1 0 1 0 0 1 0 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0
 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 1 1 1 0 0 1 0 0 1 1 1 1 0 0
 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 1 1 1 0 1 0 1 0 0
 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1 1 0 1
 1 0 1 1 0 1 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 0 1
 1 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1
 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1
 0 1 1 0 1 0 0 0 0 0 1 1 

In [18]:
from sklearn.metrics import f1_score

f1_score(valid_labels, valid_preds)

0.9584905660377359

## 4. Model fine tuning

In [19]:
model_name = "distilbert-base-uncased"

model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
config = transformers.AutoConfig.from_pretrained(model_name)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### 4.1 Data for fine tuning

In [20]:
imdb_train = load_dataset("imdb", split="train")

In [21]:
imdb_train_sample = imdb_train.to_pandas().sample(n=5000, random_state=2023)

In [22]:
imdb_train_sample.label.value_counts(dropna=False)

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,2552
0,2448


In [23]:
train_dataset = ExampleDataset(
    texts=imdb_train_sample["text"].tolist(),
    labels=imdb_train_sample["label"].tolist(),
    tokenizer=tokenizer,
    max_length=512,
)

In [24]:
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=False)

In [25]:
def train(model, train_loader, num_epochs=2, learning_rate=2e-5, device="cuda"):
    """
    function is a simple train loop
    """
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)  # define optimizer

    for epoch in range(num_epochs):
        model.train()  # put in the train mode
        total_loss = 0.0

        for batch in train_loader:  # iterate our data
            # get below inout and ground truth labels
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()  # technical op
            outputs = model(
                input_ids, attention_mask=attention_mask, labels=labels
            )  # apply model
            loss = outputs.loss  # compute the loss
            loss.backward()  # we do here backprop step
            optimizer.step()  # step of optimizer ti update the weights

            total_loss += loss.item()
        average_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")

    print("Training complete!")

In [26]:
train(model, train_dataloader)

Epoch 1/2 - Average Loss: 0.3206
Epoch 2/2 - Average Loss: 0.1552
Training complete!


In [27]:
val_dataset = ExampleDataset(
    texts=imdb_sample["text"].tolist(),
    labels=imdb_sample["label"].tolist(),
    tokenizer=tokenizer,
    max_length=512,
)

val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [28]:
valid_labels, valid_preds = evaluate(model, val_dataloader)


f1_score(valid_labels, valid_preds)

0.9173478655767484

In [29]:
# in case you need to free GPU memory

model.cpu()
torch.cuda.empty_cache()