# Starter Notebook

In [None]:
# Importing necessary libraries
import os
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset, ClassLabel
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Load Tokenizer and Preprocess Data

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the classifier
base_model = "roberta-base"

# Tokenizer that matches the RoBERTa base checkpoint
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# Training split of AG News, fetched through the Hugging Face datasets library
dataset = load_dataset("ag_news", split="train")

# Define a preprocessing function to tokenize the input text
def preprocess(examples):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``; its
            ``'text'`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated but not padded.
    """
    # No padding here: padding inside a batched map pads every example to the
    # longest text in its map batch and stores those pad tokens in the dataset.
    # The DataCollatorWithPadding used for training and evaluation pads each
    # batch when it is assembled, so padding at this stage is redundant.
    return tokenizer(examples['text'], truncation=True)

# Tokenize the whole dataset in batches (dropping the raw "text" column), then
# rename "label" to "labels", the target name most Hugging Face models expect.
tokenized_dataset = (
    dataset
    .map(preprocess, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)

In [None]:
# Read the number of classes and their names from the dataset's label feature
label_feature = dataset.features["label"]
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map each integer label ID to its class name
id2label = dict(enumerate(class_names))

# Collator that pads the inputs of a batch to the same length and returns PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up the configuration for the pretrained model and download it from Hugging Face.

In [None]:
# Load the base checkpoint with a sequence-classification head labelled by id2label
model = RobertaForSequenceClassification.from_pretrained(base_model, id2label=id2label)
model  # last expression of the cell: display the module tree

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Preparing Training and Evaluation Datasets

In [None]:
# Hold out 640 examples for evaluation; the fixed seed keeps the split repeatable
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset, eval_dataset = split_datasets["train"], split_datasets["test"]

## Check Class Distribution in Training Set

In [None]:
from collections import Counter

# Tally how many training examples carry each label
label_counts = Counter(train_dataset["labels"])
total = sum(label_counts.values())

print("Class distribution in training set:")

# Report every label in ascending order with its count and share of the total
for label in sorted(label_counts):
    count = label_counts[label]
    percent = (count / total) * 100
    print(f"Label {label}: {count} samples ({percent:.2f}%)")


Class distribution in training set:
Label 0: 29855 samples (25.01%)
Label 1: 29842 samples (25.00%)
Label 2: 29847 samples (25.01%)
Label 3: 29816 samples (24.98%)


## Setup LoRA Configuration
Set up the PEFT (LoRA) configuration and wrap the base model with it for fine-tuning.

In [None]:
# LoRA adapter settings used for parameter-efficient fine-tuning
peft_config = LoraConfig(
    task_type="SEQ_CLS",  # task type: sequence classification
    target_modules=["query", "value", "key"],  # attention submodules that receive LoRA
    r=6,  # rank of the LoRA update matrices
    lora_alpha=32,  # scaling factor for the LoRA updates
    lora_dropout=0.05,  # dropout applied to the LoRA layers
    bias="none",  # do not train biases
)

In [None]:
# Wrap the base classifier with the LoRA configuration, then display the result
peft_model = get_peft_model(model=model, peft_config=peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Mo

In [None]:
# List the name of every parameter that will receive gradient updates
print("Trainable parameters:")
trainable_names = (
    param_name
    for param_name, tensor in peft_model.named_parameters()
    if tensor.requires_grad
)
for trainable_name in trainable_names:
    print(trainable_name)

Trainable parameters:
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.key.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.key.lora_B.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.key.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.key.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.value.

In [None]:
# Print PEFT's own trainable-parameter report for the wrapped model
print("PEFT Model")
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 925,444 || all params: 125,574,152 || trainable%: 0.7370


In [None]:
from torchinfo import summary

# Summarise the wrapped model for an input of shape (1, 128) with int64 dtype
summary(
    model=peft_model,
    input_size=(1, 128),
    dtypes=[torch.int64],
)

Layer (type:depth-idx)                                                      Output Shape              Param #
PeftModelForSequenceClassification                                          [1, 4]                    --
├─LoraModel: 1-1                                                            [1, 4]                    --
│    └─RobertaForSequenceClassification: 2-1                                --                        --
│    │    └─RobertaModel: 3-1                                               [1, 128, 768]             124,386,816
│    │    └─ModulesToSaveWrapper: 3-2                                       [1, 4]                    1,187,336
Total params: 125,574,152
Trainable params: 925,444
Non-trainable params: 124,648,708
Total mult-adds (Units.MEGABYTES): 124.98
Input size (MB): 0.00
Forward/backward pass size (MB): 135.49
Params size (MB): 499.92
Estimated Total Size (MB): 635.42

## Training Setup

In [None]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute the accuracy reported during evaluation.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to obtain the predicted class) and ``label_ids``.

    Returns:
        dict: A single ``'accuracy'`` entry.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Setup Training args
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Output locations and reporting ---
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # --- Evaluate, log and checkpoint once per epoch ---
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # --- Step intervals ---
    # NOTE(review): with the "epoch" strategies above these look unused — confirm
    logging_steps=100,
    eval_steps=200,
    save_steps=400,
    # --- Model selection ---
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    metric_for_best_model="accuracy",  # "best" is judged by evaluation accuracy
    # --- Optimisation ---
    optim="adamw_torch",  # optimizer
    learning_rate=2e-4,  # initial learning rate
    lr_scheduler_type="cosine",  # cosine learning-rate schedule
    warmup_ratio=0.1,  # warmup as a fraction of total steps
    weight_decay=0.01,  # weight decay for regularization
    # --- Batch sizes and duration ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
)

# Build a Hugging Face Trainer around the given model using the shared settings
def get_trainer(model):
    """Return a ``Trainer`` for ``model``.

    The trainer is wired to the notebook-level ``training_args``,
    ``train_dataset``, ``eval_dataset``, ``data_collator`` and
    ``compute_metrics``.

    Args:
        model: The model to train and evaluate.

    Returns:
        Trainer: A trainer configured with the shared settings.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    return Trainer(**trainer_kwargs)

In [None]:

!pip install evaluate -q

from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

# Function to evaluate a model on a given dataset
def evaluate_model(model, dataset, labelled=True, batch_size=8, data_collator=None):
    """Run batched inference over ``dataset`` and optionally score it.

    The model is moved to the GPU when one is available (otherwise the CPU)
    and switched to eval mode before inference.

    Args:
        model: Model called as ``model(**batch)``; its output must expose ``logits``.
        dataset: Dataset wrapped in a ``DataLoader``.
        labelled: When True, each batch must contain ``"labels"`` and accuracy
            is computed and printed.
        batch_size: Number of examples per inference batch.
        data_collator: Passed to the ``DataLoader`` as ``collate_fn``.

    Returns:
        ``({"accuracy": acc}, preds)`` when ``labelled`` is True, where
        ``preds`` is a NumPy array of arg-maxed class indices; otherwise the
        concatenated logits tensor (used for ensembling).
    """
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    # Use the GPU if available, otherwise the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()

    logit_chunks, label_chunks = [], []

    # Gradients are never needed for inference
    with torch.no_grad():
        for batch in tqdm(loader):
            batch = {key: tensor.to(device) for key, tensor in batch.items()}
            logit_chunks.append(model(**batch).logits.cpu())

            # Keep the true labels only when the dataset provides them
            if labelled:
                label_chunks.append(batch["labels"].cpu())

    all_logits = torch.cat(logit_chunks, dim=0)

    # Unlabelled data: hand back the raw logits
    if not labelled:
        return all_logits

    all_labels = torch.cat(label_chunks, dim=0)
    preds = all_logits.argmax(dim=-1).numpy()
    acc = accuracy_score(all_labels.numpy(), preds)
    print("Accuracy:", acc)
    return {"accuracy": acc}, preds


### Train and Run Inference

In [None]:
# Ensemble training: fine-tune one LoRA model per seed and collect each
# model's class probabilities on the test set.
from transformers import set_seed

seeds = [42, 123, 999]
probs_list = []

# Touch the test set before the expensive loop so that a missing `test_dataset`
# fails immediately instead of after a full training run.
# NOTE(review): `test_dataset` is not created in any visible cell — confirm it is
# loaded and tokenized before this cell runs.
num_test_examples = len(test_dataset)

for seed in seeds:
    # NOTE(review): the Trainer may re-seed from training_args.seed when it is
    # constructed — confirm the runs differ in more than model initialisation.
    set_seed(seed)

    # Start every run from a fresh base model wrapped with new LoRA adapters
    peft_model = get_peft_model(
        RobertaForSequenceClassification.from_pretrained(base_model, id2label=id2label),
        peft_config,
    )
    trainer = get_trainer(peft_model)
    trainer.train()

    # Raw logits for the unlabelled test set
    logits = evaluate_model(peft_model, test_dataset, labelled=False, batch_size=8, data_collator=data_collator)
    assert logits.shape[0] == num_test_examples, "expected one row of logits per test example"

    # Convert logits to class probabilities and keep them for the ensemble
    probs = F.softmax(logits, dim=-1).numpy()
    probs_list.append(probs)




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2967,0.213844,0.91875
2,0.1752,0.190328,0.942187
3,0.151,0.193946,0.940625


100%|██████████| 1000/1000 [02:05<00:00,  7.99it/s]


NameError: name 'F' is not defined

In [None]:
# Ensemble the per-seed models by averaging their class probabilities.

# `probs_list` is filled by the training loop, one entry per seed. Only add the
# most recent run's probabilities when they are missing (e.g. the loop stopped
# after inference but before its own append), so that re-running this cell
# never counts the same model twice in the average.
probs = F.softmax(logits, dim=-1).numpy()
if not probs_list or not np.array_equal(probs_list[-1], probs):
    probs_list.append(probs)

# Average probabilities from multiple models (ensembling)
ensemble_probs = np.mean(probs_list, axis=0)

# Final prediction per example: the class with the highest mean probability
ensemble_preds = ensemble_probs.argmax(axis=-1)

# Print the ensemble predictions
print(ensemble_preds)

[3 0 0 ... 3 0 2]


In [None]:
# Run inference with the most recently trained model and save its predictions.
# With labelled=False, evaluate_model returns the raw logits (one row per
# example), so the predicted class is the arg-max over the last axis.
single_model_logits = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
preds = single_model_logits.argmax(dim=-1)

df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()
})

# NOTE: the ensemble cell further down writes to the same "submission.csv"
# path and therefore overwrites this file.
df_output.to_csv("submission.csv", index=False)
print("✅ Batched predictions complete. Saved to submission.csv.")

100%|██████████| 1000/1000 [02:05<00:00,  7.94it/s]

✅ Batched predictions complete. Saved to submission.csv.





In [None]:
# Create a DataFrame with IDs and the ensemble's predicted labels
df_output = pd.DataFrame({
    'ID': range(len(ensemble_probs)),
    'Label': ensemble_preds
})

# Save the predictions and report the path that was actually written
submission_path = "submission.csv"
df_output.to_csv(submission_path, index=False)
print(f"Ensemble predictions saved to {submission_path}")


Ensemble predictions saved to ensemble_predictions.csv


In [None]:
# Print accuracy on the held-out evaluation split; both return values are discarded
_, _ = evaluate_model(
    model=peft_model,
    dataset=eval_dataset,
    labelled=True,
    batch_size=8,
    data_collator=data_collator,
)

100%|██████████| 80/80 [00:13<00:00,  5.82it/s]

Accuracy: 0.9421875





In [None]:
from IPython.display import HTML
import numpy as np
import base64

# Function to create a clickable download link for a DataFrame
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):
    """Return an HTML anchor that downloads ``df`` as a CSV file.

    The CSV text produced by ``df.to_csv()`` is base64-encoded and embedded
    in a ``data:text/csv`` URI, so no file is written to disk.

    Args:
        df: DataFrame to serialise.
        title: Text shown for the link.
        filename: Value of the anchor's ``download`` attribute.

    Returns:
        IPython ``HTML`` object wrapping the anchor tag.
    """
    csv_text = df.to_csv()
    payload = base64.b64encode(csv_text.encode()).decode()
    link = f'<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(link)

# Display a download link for the prediction table held in df_output
create_download_link(df=df_output)