# Starter Notebook

Install and import required libraries

V1 accuracy: 94.19% on the evaluation split, 84.425% on the unlabelled test set.

In [None]:
# !pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3

In [None]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

## Load Tokenizer and Preprocess Data

In [None]:
# Checkpoint name used for both the tokenizer and the classification model.
base_model = 'roberta-base'

# Only the training split of AG News is loaded here.
dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry, as passed by ``Dataset.map``
            when ``batched=True``.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts, with
        truncation and padding both switched on.
    """
    return tokenizer(examples['text'], truncation=True, padding=True)

# Tokenize in batches and drop the raw "text" column once it has been encoded.
tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
# Rename "label" to "labels": the training arguments use label_names=["labels"]
# and evaluate_model reads batch["labels"].
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [None]:
# Read the number of classes and their names off the dataset's label feature.
_label_column = dataset.features["label"]
num_labels = _label_column.num_classes
class_names = _label_column.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map each class index to its name; the classifier and classify() both use it.
id2label = dict(enumerate(class_names))

# Collator that pads each batch and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


## Load Pre-trained Model
Set up the configuration for the pretrained model and download it from Hugging Face.

In [None]:
# Load the pretrained checkpoint named by base_model as a sequence classifier,
# passing the id2label mapping built above.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
# Bare expression: shows the model architecture as the cell output.
model

## Anything from here on can be modified

In [None]:
# Hold out 20,000 examples of the original training set for evaluation.
# The seed is fixed so the split is the same on every run.
split_datasets = tokenized_dataset.train_test_split(test_size=20000, seed=42)
train_dataset, eval_dataset = split_datasets['train'], split_datasets['test']

# Report the (rows, columns) shape of each split.
for _split in (train_dataset, eval_dataset):
    print(_split.shape)

## Setup LoRA Config
Set up the PEFT (LoRA) config and wrap the model for fine-tuning.

In [None]:
!pip install peft accelerate transformers datasets


In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # rank
    lora_alpha=16,
    target_modules=["query", "value"],  # works for transformers like RoBERTa
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS  # sequence classification
)

# Wrap the model with PEFT.
# `model` is rebound to the wrapped model, so the guard keeps this cell safe to
# re-run: a model that is already a PeftModel is not wrapped a second time.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)
model

In [None]:
# `model` has already been wrapped with LoRA in the previous cell; calling
# get_peft_model on it again would wrap a PeftModel a second time. Reuse the
# wrapped model, and only wrap here if the previous cell was skipped.
peft_model = model if isinstance(model, PeftModel) else get_peft_model(model, lora_config)
peft_model

In [None]:
print("Trainable parameters:")
# Collect the name of every parameter that still has requires_grad set.
trainable_names = [
    param_name
    for param_name, tensor in peft_model.named_parameters()
    if tensor.requires_grad
]
for param_name in trainable_names:
    print(param_name)
# Number of trainable parameter tensors (not the number of scalar weights).
count = len(trainable_names)
print(count)

In [None]:
# Print PEFT's own summary of the wrapped model's trainable parameters.
print('PEFT Model')
peft_model.print_trainable_parameters()

## Training Setup

In [None]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(pred):
    """Turn a prediction object into a dict of classification metrics.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with ``accuracy`` plus macro-averaged ``precision``, ``recall``
        and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}

    # The fourth value (support) is not reported.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="macro"
    )
    metrics.update(precision=macro_precision, recall=macro_recall, f1=macro_f1)
    return metrics

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    output_dir="./roberta-lora-agnews",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_dir="./logs",
    logging_steps=50,
    # No external experiment-tracking integration.
    report_to="none",
    # Evaluate and save once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Matches the "labels" column created when the dataset was tokenized.
    label_names=["labels"]
)



### Start Training

In [None]:
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose ``compute_loss`` accepts ``num_items_in_batch``.

    The extra keyword is accepted but not forwarded: the parent implementation
    is called with ``model``, ``inputs`` and ``return_outputs`` only.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Drop a stray "num_items_in_batch" entry from the inputs, if there is
        # one, so it is not forwarded with them.
        inputs.pop("num_items_in_batch", None)
        return super().compute_loss(model, inputs, return_outputs)


In [None]:
# `final_train_dataset` (original + pseudo-labelled examples) is only created by
# the pseudo-labelling cell further down, so it does not exist on a fresh kernel.
# Use it when it is available and otherwise fall back to the plain training
# split, so that Restart & Run All does not stop here with a NameError.
try:
    trainer_train_dataset = final_train_dataset
except NameError:
    trainer_train_dataset = train_dataset

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=trainer_train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [None]:
# Run fine-tuning with the settings in training_args (no interactive debugger,
# so the notebook can run top to bottom unattended).
trainer.train()

## Evaluate Finetuned Model


### Performing Inference on Custom Input
Use the following function to run inference on custom inputs.

In [None]:
def classify(model, tokenizer, text):
    """Classify one piece of text, print the result and return the label name.

    Args:
        model: Classification model (a ``torch.nn.Module``) whose output
            exposes ``.logits``.
        tokenizer: Callable that encodes ``text``; its result must support
            ``.to(device)`` and be unpackable as keyword arguments.
        text: The raw text to classify.

    Returns:
        The entry of the notebook-level ``id2label`` mapping for the predicted
        class index.
    """
    # Follow the model's own device instead of guessing from CUDA availability,
    # so inputs and weights always end up on the same device.
    device = next(model.parameters()).device
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)

    # Inference must run with dropout disabled and without building a graph.
    # The previous train/eval mode is restored afterwards so that calling this
    # helper does not change the model's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**inputs)
    finally:
        model.train(was_training)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [None]:
# Smoke-test the classifier on two news snippets.
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
# The backslash is escaped: an unescaped "\b" inside a normal string literal is a
# backspace control character, not the backslash that appears in the text.
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [None]:
!pip install evaluate


In [None]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Run batched inference with a model and optionally score it for accuracy.

    Args:
        inference_model: Model to run; it is moved to the GPU when one is
            available (CPU otherwise) and switched to eval mode.
        dataset: Dataset to iterate over with a ``DataLoader``.
        labelled (bool): When True, each batch must carry a "labels" entry and
            the accuracy metric is computed and printed. When False, only the
            predictions are produced.
        batch_size (int): Number of examples per inference batch.
        data_collator: ``collate_fn`` for the ``DataLoader``; ``None`` selects
            the DataLoader default.

    Returns:
        ``(metrics, predictions)`` when ``labelled`` is True, otherwise just
        ``predictions`` — a 1-D CPU tensor of arg-max class indices.
    """
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(target_device)
    inference_model.eval()

    # The metric object only exists in labelled mode; None doubles as the flag.
    accuracy = evaluate.load('accuracy') if labelled else None
    prediction_chunks = []

    for raw_batch in tqdm(loader):
        # Every tensor of the batch has to live on the model's device.
        on_device = {key: value.to(target_device) for key, value in raw_batch.items()}
        with torch.no_grad():
            logits = inference_model(**on_device).logits
        batch_predictions = logits.argmax(dim=-1).cpu()
        prediction_chunks.append(batch_predictions)

        if accuracy is not None:
            # Reference labels are expected under the "labels" key.
            accuracy.add_batch(
                predictions=batch_predictions.numpy(),
                references=on_device["labels"].cpu().numpy(),
            )

    # Stitch the per-batch predictions into a single tensor.
    stacked_predictions = torch.cat(prediction_chunks, dim=0)

    if accuracy is None:
        return stacked_predictions

    scores = accuracy.compute()
    print("Evaluation Metric:", scores)
    return scores, stacked_predictions

In [None]:
# Check evaluation accuracy
# Both return values (the metric dict and the prediction tensor) are discarded;
# evaluate_model prints the metric itself.
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

In [None]:
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader
from torch.nn.functional import softmax
from transformers import DataCollatorWithPadding
from datasets import Dataset, concatenate_datasets

# Pseudo-labelling: run the current model over the unlabelled data, keep only
# the predictions it is very confident about, and append them to the training
# split as extra labelled examples.
#
# NOTE: the Trainer cell earlier in the notebook trains on `final_train_dataset`,
# which is produced at the end of this cell, so this cell has to be run before
# that one for the pseudo-labelled data to be used.

CONFIDENCE_THRESHOLD = 0.95   # minimum softmax probability for a prediction to be kept
PSEUDO_LABEL_BATCH_SIZE = 32  # inference batch size

# Load your unlabeled dataset (with 'text' column)
# NOTE(security): unpickling can execute arbitrary code; only load this file
# from a trusted source.
unlabelled_dataset = pd.read_pickle("/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl")

# Tokenize using your existing preprocess function
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])

# Set up data collator and dataloader
data_collator = DataCollatorWithPadding(tokenizer)
dataloader = DataLoader(test_dataset, batch_size=PSEUDO_LABEL_BATCH_SIZE, collate_fn=data_collator)

# Inference and confidence filtering
model.eval()
pseudo_input_ids = []
pseudo_attention_masks = []
pseudo_labels = []
pseudo_confidences = []

for batch in dataloader:
    batch = {k: v.to(model.device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        probs = softmax(outputs.logits, dim=-1)
        max_probs, preds = torch.max(probs, dim=1)

    # Copy each tensor off the device once per batch instead of once per
    # example; the values compared against the threshold are the same Python
    # floats that `.item()` would have produced.
    batch_confidences = max_probs.cpu().tolist()
    batch_labels = preds.cpu().tolist()
    batch_input_ids = batch["input_ids"].cpu().tolist()
    batch_attention_masks = batch["attention_mask"].cpu().tolist()

    for i, confidence in enumerate(batch_confidences):
        if confidence >= CONFIDENCE_THRESHOLD:
            pseudo_input_ids.append(batch_input_ids[i])
            pseudo_attention_masks.append(batch_attention_masks[i])
            pseudo_labels.append(batch_labels[i])
            pseudo_confidences.append(confidence)

# Build Hugging Face Dataset from pseudo-labeled examples
pseudo_dataset = Dataset.from_dict({
    "input_ids": pseudo_input_ids,
    "attention_mask": pseudo_attention_masks,
    "labels": pseudo_labels
})

print(f"✅ Pseudo-labeled dataset size: {len(pseudo_dataset)}")

if len(pseudo_dataset) == 0:
    # Nothing cleared the threshold, so there is nothing to cast or append:
    # keep training on the original split only.
    final_train_dataset = train_dataset
else:
    # Match label types with original training dataset
    label_feature = train_dataset.features["labels"]
    pseudo_dataset = pseudo_dataset.cast_column("labels", label_feature)

    # Combine original and pseudo-labeled data
    final_train_dataset = concatenate_datasets([train_dataset, pseudo_dataset])
print("✅ Combined dataset ready for fine-tuning.")

# Save to disk for reuse
final_train_dataset.save_to_disk("final_train_dataset")
print("✅ Final train dataset saved to disk as 'final_train_dataset/'")


In [None]:
# Convert to pandas and save
# This writes a second copy of the combined training data, as a pickled
# DataFrame, next to the save_to_disk directory.
df = final_train_dataset.to_pandas()
df.to_pickle("final_train_dataset.pkl")

### Run Inference on unlabelled dataset

In [None]:
#Load your unlabelled data
# This repeats the load and tokenization done in the pseudo-labelling cell, so
# that this section also works when that cell has not been run.
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
unlabelled_dataset = pd.read_pickle("/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
# Bare expression: displays the dataset summary as the cell output.
unlabelled_dataset

In [None]:
# Run inference and save predictions
# One name for the output file, so the file that is written and the path that
# is reported cannot drift apart.
OUTPUT_CSV = "inference_output2.csv"

preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(OUTPUT_CSV, index=False)
print(f"Inference complete. Predictions saved to {OUTPUT_CSV}")