# Deep Learning Project 2

Install and import required libraries

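Results recorded for this run: 93.91% / 84.450% (the notes do not say which metric or split each figure refers to).

Augmentation used: injected noise (word shuffle, random punctuation) and EDA (synonym replacement).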

In [1]:
# !pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3

Collecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nvidia-ml-py3
  Building wheel for nvidia-ml-py3 (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-ml-py3: filename=nvidia_ml_py3-7.352.0-py3-none-any.whl size=19173 sha256=6fabd6de6830eba7aff743667e5a4a143022717064ca6252cf65f4f7a877c896
  Stored in directory: /root/.cache/pip/wheels/47/50/9e/29dc79037d74c3c1bb4a8661fb608e8674b7e4260d6a3f8f51
Successfully built nvidia-ml-py3
Installing collected packages: nvidia-ml-py3
Successfully installed nvidia-ml-py3-7.352.0


In [2]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

2025-04-17 18:12:17.069723: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744913537.328529      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744913537.403418      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Load Tokenizer and Preprocess Data

In [12]:
import random

# Dirty augmentation utilities
def add_random_punctuation(text, num_punct=3):
    """Append random noise characters to randomly chosen words of ``text``.

    Args:
        text: Input string; it is split on whitespace.
        num_punct: Number of noise characters to append. The same word may be
            picked more than once.

    Returns:
        The words re-joined with single spaces. Text that contains no words
        yields an empty string instead of raising.
    """
    # The backslash is escaped explicitly: the character set is unchanged, but
    # it no longer relies on Python tolerating the invalid "\™" escape.
    punctuations = list("\\™œ#â€“;&")
    words = text.split()
    if not words:
        # random.randint(0, -1) would raise ValueError on empty input.
        return ""
    for _ in range(num_punct):
        insert_idx = random.randint(0, len(words) - 1)
        words[insert_idx] += random.choice(punctuations)
    return " ".join(words)

def word_shuffle(text, shuffle_ratio=0.3):
    """Swap a fraction of the words of ``text`` with randomly chosen partners.

    ``int(len(words) * shuffle_ratio)`` distinct positions are sampled; each is
    swapped with a uniformly drawn position (possibly itself). The result is
    the words joined with single spaces.
    """
    tokens = text.split()
    total = len(tokens)
    picked = random.sample(range(total), int(total * shuffle_ratio))
    result = list(tokens)
    for src in picked:
        dst = random.randint(0, total - 1)
        moved = result[src]
        result[src] = result[dst]
        result[dst] = moved
    return " ".join(result)

def dirty_augment(text):
    """Corrupt ``text``: add random punctuation first, then shuffle words."""
    return word_shuffle(add_random_punctuation(text))


In [13]:
# Seed Python's RNG so the same rows are corrupted (in the same way) on every run.
random.seed(42)

# Load the clean AG News training split
raw_dataset = load_dataset("ag_news", split="train")
# Keep the original schema (including the ClassLabel definition of "label").
# Read it from the dataset that is already loaded instead of loading it again.
original_features = raw_dataset.features

# Convert to a mutable list of row dicts
data = raw_dataset.to_list()

# Fraction of rows to corrupt
dirty_ratio = 0.2
num_dirty = int(dirty_ratio * len(data))

# Pick the rows to corrupt (sampled without replacement)
indices = random.sample(range(len(data)), num_dirty)

# Apply dirty augmentation in-place
for idx in indices:
    original_text = data[idx]["text"]
    data[idx]["text"] = dirty_augment(original_text)

# Convert back to Dataset
dirty_injected_dataset = Dataset.from_list(data)


In [14]:
from nltk.corpus import wordnet
import nltk
# Download the WordNet corpora ("wordnet" and "omw-1.4") at runtime.
nltk.download("wordnet")
nltk.download("omw-1.4")
import random

# EDA synonym replacement
def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of lowercase strings.

    Underscores in multi-word lemma names are replaced by spaces. The word
    itself is excluded case-insensitively, so a capitalised token is not
    offered its own lowercase form as a "synonym".

    Returns:
        A sorted list (possibly empty). Sorting keeps the result independent of
        set iteration order, so seeded random choices are reproducible.
    """
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace("_", " ").lower()
            if name != target:
                synonyms.add(name)
    return sorted(synonyms)

def synonym_replacement(text, n=1):
    """Replace up to ``n`` distinct words of ``text`` with a random synonym.

    Every occurrence of a chosen word is replaced by the same synonym.

    Args:
        text: Input string; it is split on whitespace.
        n: Maximum number of distinct words to replace. Values <= 0 leave the
            words untouched.

    Returns:
        The (possibly modified) words joined with single spaces.
    """
    words = text.split()
    if n <= 0 or not words:
        return " ".join(words)

    # Look each distinct word up once. dict.fromkeys() drops duplicates while
    # keeping first-seen order, so a repeated word cannot be counted as two
    # replacements.
    synonyms_by_word = {}
    for word in dict.fromkeys(words):
        found = get_synonyms(word)
        if found:
            synonyms_by_word[word] = found

    candidates = list(synonyms_by_word)
    random.shuffle(candidates)

    new_words = words.copy()
    for replaced, word in enumerate(candidates, start=1):
        synonym = random.choice(synonyms_by_word[word])
        new_words = [synonym if w == word else w for w in new_words]
        if replaced >= n:
            break
    return " ".join(new_words)


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [15]:
# Convert to a list of row dicts so the text can be edited in place
data = dirty_injected_dataset.to_list()

# Fraction of rows that receive synonym replacement (25% of the dataset)
eda_ratio = 0.25
num_eda = int(eda_ratio * len(data))

# NOTE: the candidate pool is ALL rows, not only rows left untouched by the
# dirty augmentation, so a row can receive both kinds of augmentation.
remaining_indices = [i for i in range(len(data))]
eda_indices = random.sample(remaining_indices, num_eda)

for idx in eda_indices:
    text = data[idx]["text"]
    words = text.split()
    n_replace = max(1, int(0.15 * len(words)))  # Target 15% of the words, at least one
    data[idx]["text"] = synonym_replacement(text, n_replace)

In [16]:
from datasets import Dataset

final_augmented_dataset = Dataset.from_list(data)

# Recast label type: the rebuilt dataset stores "label" as plain integers, so
# restore the ClassLabel definition. The schema is taken from the AG News
# split that is already in memory rather than loading the dataset again.
original_features = raw_dataset.features
final_augmented_dataset = final_augmented_dataset.cast_column("label", original_features["label"])

Casting the dataset:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [17]:
base_model = 'roberta-base'

tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    """Tokenize a batch of examples (reads ``examples['text']``).

    Sequences are truncated to the tokenizer's maximum length but NOT padded
    here. Padding inside a batched ``map`` pads every row to the longest row of
    its map chunk; padding is left to the data collator, which pads per batch.
    """
    return tokenizer(examples['text'], truncation=True)

tokenized_dataset = final_augmented_dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [18]:
# Extract the number of classes and their names from the ClassLabel feature
num_labels = final_augmented_dataset.features['label'].num_classes
class_names = final_augmented_dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping (class index -> class name).
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

# Collator that pads each batch and returns PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [19]:
def show_augmented_samples(final_dataset, original_dataset, indices=None, n=5):
    """Print the original and augmented text of selected rows, one after the other.

    When ``indices`` is None, ``n`` row positions are drawn at random from
    ``final_dataset``. Both datasets are indexed with the same position.
    """
    import random
    from termcolor import colored

    chosen = indices if indices is not None else random.sample(range(len(final_dataset)), n)

    for row_id in chosen:
        before = original_dataset[row_id]["text"]
        augmented_row = final_dataset[row_id]
        after = augmented_row["text"]
        label = augmented_row["label"]

        print(f"\n🔹 Sample {row_id} — Label: {label}")
        print(colored("Original: ", "cyan"), before)
        print(colored("Augmented:", "green"), after)


In [25]:
show_augmented_samples(final_augmented_dataset, raw_dataset, n=10)


🔹 Sample 41603 — Label: 3
Original:  Hotlines: Blue Man Group Returns in  #36;120 Mil. Centrino Campaign (AdWeek.com) AdWeek.com - NEW YORK -- Intel is reintroducing the Blue Man Group, not used since 2001, in a campaign for its Centrino mobile technology for wireless computing.
Augmented: reintroducing its #36;120 group Returns inwards technology not (AdWeek.com) political campaign Centrino Mil. - NEW house of york blue-blooded Intel is inœ the wireless Man Group, AdWeek.com used since 2001, Hotlines: a campaign for™ -- Centrino computing. Man for blue-blooded mobile;

🔹 Sample 103978 — Label: 3
Original:  Family Tree Maker 2005 Deluxe; Law   Order: Justice Is Served; WWE Smackdown! Vs. Raw The 12th edition of this genealogy program comes in three editions -- Standard, Deluxe and Collector's -- but the core program is identical in all.
Augmented: Family Tree Maker 2005 Deluxe; Law Order: Justice Is Served; WWE Smackdown! Vs. Raw The twelfth version of this genealogy political platfor

## Load Pre-trained Model
Set up the configuration for the pretrained model and download it from Hugging Face.

In [None]:
# Load the pretrained RoBERTa checkpoint with a sequence-classification head;
# id2label supplies the class-index -> class-name mapping.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
# Bare expression: displays the model in the cell output.
model

## Anything from here on can be modified

In [None]:
# Split the tokenized training set, holding out 640 rows for evaluation (fixed seed).
# NOTE(review): tokenized_dataset is built from the augmented data, so the
# held-out rows presumably include noisy / synonym-replaced samples — confirm
# this is intended for the evaluation set.
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [None]:
!pip install peft accelerate transformers datasets

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # LoRA rank (back to 8)
    lora_alpha=8, # LoRA scaling factor (back to 8)
    target_modules=["query", "value"],  # adapters are attached to modules matching these names
    lora_dropout=0.1, # bumping up dropout
    bias="none",
    task_type=TaskType.SEQ_CLS
)

In [None]:
peft_model = get_peft_model(model, lora_config)
peft_model

In [None]:
# List every parameter that requires gradients, then print how many there are.
print("Trainable parameters:")
trainable_names = [
    name for name, param in peft_model.named_parameters() if param.requires_grad
]
for name in trainable_names:
    print(name)
count = len(trainable_names)
print(count)

In [None]:
print('PEFT Model')
peft_model.print_trainable_parameters()

## Training Setup

In [None]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    ``pred.predictions`` is reduced with ``argmax`` over the last axis and
    compared against ``pred.label_ids``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="macro"
    )
    scores["precision"] = macro_precision
    scores["recall"] = macro_recall
    scores["f1"] = macro_f1
    return scores

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./roberta-lora-agnews-v2",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16, # reduced to 16. NOTE(review): this is the *evaluation* batch size; the earlier "improve generalization" rationale presumably referred to the training batch size — confirm
    num_train_epochs=3,
    learning_rate=2e-4, # significant increase over the previous setting. NOTE(review): the earlier note compared this with "2e-2", which is 100x higher rather than slightly higher — confirm the intended reference value
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="steps",  # evaluate periodically during training rather than once per epoch
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",  # the "accuracy" entry returned by compute_metrics
    report_to="none"
)

### Start Training

#### Introducing weighted crossentropy loss

Rationale: we added a weighted-loss trainer because performance was uneven across classes (see the per-class metrics for Business, Sci/Tech, etc. in the plots below).

In [None]:
import torch
import numpy as np
from collections import Counter

def get_class_weights(labels, num_classes):
    """Compute inverse-frequency class weights.

    ``weight[i] = total / (count[i] * num_classes)``, so a perfectly balanced
    label set yields a weight of 1.0 for every class.

    Args:
        labels: Iterable of integer class ids. Python ints, NumPy integers and
            0-d tensors are all accepted because each item is passed to ``int``.
        num_classes: Number of classes; weights are produced for ids
            ``0 .. num_classes - 1``.

    Returns:
        A float tensor of shape ``(num_classes,)``. A class with no samples
        gets weight 0.0 instead of raising ``ZeroDivisionError``.
    """
    # Normalise items to plain ints so tensor / NumPy scalars hash by value.
    counts = Counter(int(label) for label in labels)
    total = sum(counts.values())

    # Inverse frequency: total / (count * num_classes)
    weights = [
        total / (counts[i] * num_classes) if counts[i] else 0.0
        for i in range(num_classes)
    ]
    return torch.tensor(weights, dtype=torch.float)

In [None]:
from transformers import Trainer

# Adding WeightedLossTrainer after observing uneven performance for different classes
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: Optional 1-D float tensor with one weight per class.
            When None (the default), plain unweighted cross-entropy is used.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (weighted) cross-entropy loss for one batch.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch mapping containing a ``"labels"`` entry.
            return_outputs: When True, return ``(loss, outputs)``.
            **kwargs: Accepted and ignored, so extra arguments passed by the
                base Trainer do not break the override.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``.
        """
        # Work on a shallow copy so the caller's batch keeps its "labels".
        model_inputs = dict(inputs)
        labels = model_inputs.pop("labels")
        outputs = model(**model_inputs)
        logits = outputs.logits

        # Fall back to unweighted loss when no class weights were supplied.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        loss_fn = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Labels of the training split
train_labels = train_dataset["labels"]
# Use num_labels (read from the dataset's label feature) instead of a hard-coded 4
class_weights = get_class_weights(train_labels, num_classes=num_labels)

In [None]:
# Build the trainer that applies the class-weighted cross-entropy loss.
trainer = WeightedLossTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,  # NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of `processing_class=` — confirm against the installed version
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights
)

In [None]:
trainer.train()

### Training Plots

In [None]:
logs = trainer.state.log_history

In [None]:
# Tabulate the trainer log history, keeping only entries that carry a step number.
log_df = pd.DataFrame(logs)
log_df = log_df.loc[log_df["step"].notnull()].reset_index(drop=True)
log_df.head()

In [None]:
import matplotlib.pyplot as plt

# Clean the DataFrame: remove rows without step
log_df = log_df[log_df["step"].notnull()]
log_df = log_df.reset_index(drop=True)

# Carry the last logged training loss forward onto rows that lack one.
# Series.ffill() replaces fillna(method="ffill"), which is deprecated in pandas 2.x.
log_df["train_loss"] = log_df["loss"].ffill()

# Filter to rows with evaluation metrics
eval_df = log_df[log_df["eval_accuracy"].notnull()]

# Plotting
def plot_metrics_from_csv(log_df, eval_df):
    """Plot evaluation metrics and train/eval loss against the training step.

    Args:
        log_df: Frame with ``step`` and ``train_loss`` columns.
        eval_df: Frame with ``step``, ``eval_accuracy``, ``eval_precision``,
            ``eval_recall``, ``eval_f1`` and ``eval_loss`` columns.
    """
    fig, (metrics_ax, loss_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: evaluation metrics
    metric_curves = (
        ("eval_accuracy", "Accuracy"),
        ("eval_precision", "Precision"),
        ("eval_recall", "Recall"),
        ("eval_f1", "F1 Score"),
    )
    for column, curve_label in metric_curves:
        metrics_ax.plot(eval_df["step"], eval_df[column], label=curve_label)
    metrics_ax.set_xlabel("Step")
    metrics_ax.set_ylabel("Score")
    metrics_ax.set_title("Evaluation Metrics Over Steps")
    metrics_ax.legend()
    metrics_ax.grid(True)

    # Right panel: training vs evaluation loss
    loss_ax.plot(log_df["step"], log_df["train_loss"], label="Train Loss", linestyle="--")
    loss_ax.plot(eval_df["step"], eval_df["eval_loss"], label="Eval Loss")
    loss_ax.set_xlabel("Step")
    loss_ax.set_ylabel("Loss")
    loss_ax.set_title("Train vs Eval Loss Over Steps")
    loss_ax.legend()
    loss_ax.grid(True)

    fig.tight_layout()
    plt.show()

plot_metrics_from_csv(log_df, eval_df)

In [None]:
# log_df.to_csv("training_metrics V5.1.csv", index=False)
# print("Training metrics saved to training_metrics.csv")

In [None]:
## Plan to later compare models

# log_df = pd.read_csv("training_metrics V5.1.csv")

## Evaluate Finetuned Model

### Performing Inference on Custom Input
Use the following function to run inference on custom inputs.

In [None]:
def classify(model, tokenizer, text):
    """Classify a single text; print and return the predicted label name.

    The forward pass runs in eval mode (dropout disabled) and without gradient
    tracking; the model's previous train/eval mode is restored afterwards.

    Args:
        model: Classifier whose output exposes ``.logits``.
        tokenizer: Callable returning model inputs that support ``.to(device)``.
        text: The text to classify.

    Returns:
        ``id2label[prediction]`` for the highest-scoring class.
    """
    # Put the inputs on the device the model already lives on; fall back to the
    # CUDA-if-available choice only when the model has no parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)

    was_training = model.training
    model.eval()  # disable dropout so the prediction is deterministic
    try:
        with torch.no_grad():
            output = model(**inputs)
    finally:
        model.train(was_training)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [None]:
# The second text contains a literal backslash ("dwindling\band"). It is written
# as a raw string because "\b" in a normal string literal is a backspace character.
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, r"Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [None]:
!pip install evaluate

In [None]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm
import torch

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Run batched inference with a model and, for labelled data, score it.

    The model is moved to CUDA when available (otherwise CPU) and switched to
    eval mode. Predictions are the argmax over the last axis of the logits.

    Returns:
        If labelled is True:
            - metrics (dict with "accuracy", "precision", "recall", "f1";
              the last three are macro-averaged)
            - predictions (tensor)
            - true labels (tensor)
        Else:
            - predictions only
    """
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    # Metric name -> metric object; stays empty for unlabelled data.
    metric_names = ("accuracy", "precision", "recall", "f1")
    metrics = {name: evaluate.load(name) for name in metric_names} if labelled else {}

    prediction_chunks = []
    label_chunks = []

    for batch in tqdm(loader):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        batch_preds = outputs.logits.argmax(dim=-1).cpu()
        prediction_chunks.append(batch_preds)

        if not labelled:
            continue

        batch_labels = batch["labels"].cpu()
        label_chunks.append(batch_labels)
        for metric in metrics.values():
            metric.add_batch(predictions=batch_preds.numpy(), references=batch_labels.numpy())

    all_predictions = torch.cat(prediction_chunks, dim=0)
    if not labelled:
        return all_predictions

    all_labels = torch.cat(label_chunks, dim=0)

    # Accuracy takes no averaging argument; the other metrics are macro-averaged.
    eval_metric = {
        name: metric.compute() if name == "accuracy" else metric.compute(average="macro")
        for name, metric in metrics.items()
    }

    print("Evaluation Metrics:", eval_metric)
    return eval_metric, all_predictions, all_labels

In [None]:
# Score the fine-tuned model on the held-out evaluation split; keeps the metric
# dict plus the prediction and label tensors for the reports below.
metrics, predictions, labels = evaluate_model(
    peft_model,
    eval_dataset,
    labelled=True,
    data_collator=data_collator
)

In [None]:
from sklearn.metrics import classification_report

# `dataset` is not defined anywhere in this notebook (NameError on a fresh
# kernel); read the class names from the augmented dataset's label feature.
class_names = final_augmented_dataset.features["label"].names

print("\nPer-Class Metrics:\n")
print(classification_report(labels.numpy(), predictions.numpy(), target_names=class_names))

In [None]:
# import matplotlib.pyplot as plt

# def plot_per_class_f1(preds, labels, class_names):
#     report = classification_report(
#         labels.numpy(),
#         preds.numpy(),
#         target_names=class_names,
#         output_dict=True
#     )
#     f1_scores = {cls: report[cls]["f1-score"] for cls in class_names}

#     plt.figure(figsize=(10, 5))
#     plt.bar(f1_scores.keys(), f1_scores.values())
#     plt.ylim(0, 1)
#     plt.ylabel("F1 Score")
#     plt.title("Per-Class F1 Scores")
#     plt.xticks(rotation=45)
#     plt.grid(axis='y')
#     plt.tight_layout()
#     plt.show()

In [None]:
# plot_per_class_f1(predictions, labels, class_names)

In [None]:
!pip install scikit-learn matplotlib seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(predictions, labels, class_names):
    """Plot the confusion matrix as an annotated heatmap.

    Args:
        predictions: Predicted class ids.
        labels: True class ids.
        class_names: Tick labels, one per class, in class-id order.
    """
    cm = confusion_matrix(labels, predictions)

    plt.figure(figsize=(8, 6))
    # Let seaborn place the tick labels. Heatmap cells are centred at i + 0.5,
    # so manually setting ticks at integer positions puts the class names on
    # the cell edges instead of under/next to the cells.
    ax = sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.tick_params(axis="x", labelrotation=45)
    ax.tick_params(axis="y", labelrotation=0)

    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

In [None]:
plot_confusion_matrix(predictions.numpy(), labels.numpy(), class_names)

In [None]:
# import pandas as pd
# df = pd.DataFrame({"predictions": predictions, "labels": labels})
# df.to_csv("confusion_data.csv", index=False)

### Run Inference on unlabelled dataset

In [None]:
#Load your unlabelled data
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
unlabelled_dataset = pd.read_pickle("/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl")
# Tokenize with the same preprocess function used for training and drop the raw text column.
# NOTE(review): the .map(...) call implies the pickle holds a datasets.Dataset-like object — confirm.
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

In [None]:
# Predict labels for the unlabelled test set and write the submission file
preds = evaluate_model(
    peft_model,
    test_dataset,
    labelled=False,
    batch_size=8,
    data_collator=data_collator,
)
df_output = pd.DataFrame({'ID': range(len(preds)), 'Label': preds.numpy()})

df_output.to_csv('/kaggle/working/submission.csv', index=False)
print("Inference complete. Predictions saved to submission.csv")