In [2]:
import os
import pandas as pd
import torch
import numpy as np
import pickle
import evaluate  # Use the evaluate library for metrics
import time
from tqdm.notebook import tqdm # Use notebook tqdm for better colab rendering

from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed
)
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
from datasets import load_dataset, Dataset, ClassLabel
from sklearn.metrics import accuracy_score

# --- Configuration ---
BASE_MODEL = 'roberta-base'
DATASET_NAME = 'ag_news'
OUTPUT_DIR = "results_lora_agnews"
SEED = 42
MAX_TRAINABLE_PARAMS = 1_000_000 # Project constraint

# Set seed for reproducibility
set_seed(SEED)

# Check for GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

# --- 1. Load Tokenizer and Dataset ---
print("\n--- Loading Tokenizer and Dataset ---")
tokenizer = RobertaTokenizer.from_pretrained(BASE_MODEL)
dataset = load_dataset(DATASET_NAME, split='train')

# --- 2. Preprocess Data ---
print("\n--- Preprocessing Dataset ---")
def preprocess_function(examples):
    # Tokenize, truncate long sequences, pad shorter sequences
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512) # Using max_length for consistency

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Extract class info
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()} # Useful for the model config

print(f"Number of labels: {num_labels}")
print(f"Labels: {class_names}")
print(f"Example tokenized entry: {tokenized_dataset[0]}")

# --- 3. Split Data ---
print("\n--- Splitting Data (Train/Eval) ---")
# Using a slightly larger eval set might give more stable results
split_datasets = tokenized_dataset.train_test_split(test_size=0.05, seed=SEED, stratify_by_column="labels") # 5% for eval
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

# --- 4. Data Collator ---
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# --- 5. Load Base Model ---
print("\n--- Loading Base RoBERTa Model ---")
model = RobertaForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Freeze base model parameters (standard practice for PEFT methods like LoRA)
for param in model.parameters():
    param.requires_grad = False

print(f"Base model loaded. Device: {model.device}") # Should be CPU initially
# print(model) # Uncomment to see model structure


Using GPU: NVIDIA A100-SXM4-40GB

--- Loading Tokenizer and Dataset ---

--- Preprocessing Dataset ---
Number of labels: 4
Labels: ['World', 'Sports', 'Business', 'Sci/Tech']
Example tokenized entry: {'labels': 2, 'input_ids': [0, 28216, 312, 4, 6033, 44121, 3727, 20693, 5, 1378, 36, 1251, 43, 1201, 111, 7787, 12, 5727, 268, 6, 2298, 852, 18, 25564, 37457, 9484, 9, 9620, 12, 4469, 282, 2857, 6, 32, 1782, 2272, 456, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded. Device: cpu


In [16]:

# --- 6. LoRA Configuration ---
# Utility to calculate trainable params for a given config
def calculate_trainable_parameters(model, peft_config):
    # Create a temporary PEFT model instance to calculate params
    # Avoids modifying the main model object prematurely
    temp_peft_model = get_peft_model(model, peft_config)
    # get_nb_trainable_parameters returns a tuple: (trainable, total)
    trainable_params, total_params = temp_peft_model.get_nb_trainable_parameters()
    del temp_peft_model # Clean up the temporary model object
    return trainable_params # <<< FIX: Return only the trainable count (integer)

# --- Experiment with LoRA settings here ---
# Goal: Get close to MAX_TRAINABLE_PARAMS without exceeding it.
# Common modules to target in RoBERTa: ['query', 'value'] in self-attention
# Other possibilities: ['key', 'dense'] in attention output, ['dense'] in intermediate MLP
# Start with r=8 or r=16 and target_modules=['query', 'value']

lora_r = 8  # Rank
lora_alpha = 32 # Alpha (scaling factor, often 2*r)
lora_dropout = 0.1
# Target both query and value matrices in self-attention layers
# target_modules = ["query", "value"] # Good starting point
# target_modules = ["query", "key", "value", "dense"] # More params
target_modules = ["query", "value", "roberta.encoder.layer.*.output.dense"] # Example targeting specific layers

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    bias="lora_only",  # Or 'all' or 'lora_only', 'none' saves params
    task_type=TaskType.SEQ_CLS # Important for sequence classification
)

# Calculate parameters *before* creating the final PEFT model
# Need to load a temporary base model instance as get_peft_model modifies in place if model is already on GPU?
# Let's try directly on the CPU model first.
print("Calculating parameters on a temporary model instance...")
temp_base_model = RobertaForSequenceClassification.from_pretrained(
    BASE_MODEL, num_labels=num_labels, id2label=id2label, label2id=label2id
)
# Ensure the temporary model is configured before passing to the calculation function
num_trainable = calculate_trainable_parameters(temp_base_model, peft_config)
del temp_base_model # Free memory

print(f"Chosen LoRA Config:")
print(f"  r = {lora_r}")
print(f"  alpha = {lora_alpha}")
print(f"  dropout = {lora_dropout}")
print(f"  target_modules = {target_modules}")
print(f"  bias = {peft_config.bias}")
# This print statement should now work correctly
print(f"Calculated Trainable Parameters: {num_trainable:,}") # Now num_trainable is an int

if num_trainable > MAX_TRAINABLE_PARAMS:
    raise ValueError(f"Trainable parameters ({num_trainable:,}) exceed the limit ({MAX_TRAINABLE_PARAMS:,}). Adjust LoRA config (e.g., lower 'r', fewer 'target_modules').")
elif num_trainable == 0:
     raise ValueError(f"Trainable parameters is zero. Check LoRA config (e.g., 'target_modules'). Valid modules often include 'query', 'value', 'key', 'dense'.")
else:
    print("Parameter count is within the limit.")



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating parameters on a temporary model instance...
Chosen LoRA Config:
  r = 8
  alpha = 32
  dropout = 0.1
  target_modules = ['query', 'value', 'roberta.encoder.layer.*.output.dense']
  bias = lora_only
Calculated Trainable Parameters: 907,012
Parameter count is within the limit.


In [17]:



# --- 7. Create PEFT Model ---
print("\n--- Creating PEFT Model ---")
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

# --- 8. Training Setup ---
print("\n--- Setting up Training ---")

# Metrics Calculation Function
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    return {
        "accuracy": acc["accuracy"],
    }

# Training Arguments
# Adjust hyperparameters based on experiments
# Common LoRA learning rates: 5e-5, 1e-4, 2e-4
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=10, # Train for a few epochs
    per_device_train_batch_size=16, # Adjust based on GPU memory
    per_device_eval_batch_size=64,
    warmup_ratio=0.1, # Warmup for 10% of steps
    weight_decay=0.01,
    learning_rate=1e-4, # Starting point for LoRA
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_strategy="steps",
    logging_steps=50,
    evaluation_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch", # Save checkpoint at the end of each epoch
    save_total_limit=2, # Keep only the last 2 checkpoints
    load_best_model_at_end=True, # Load the best model based on eval metric
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="none", # Disable external reporting (like wandb) for simplicity
    fp16=torch.cuda.is_available(), # Use mixed precision if GPU available
    gradient_checkpointing=False, # Set to True if memory is tight, but slows down training
    seed=SEED,
    # optim="adamw_torch", # Default AdamW from PyTorch
)

# Create Trainer
trainer = Trainer(
    model=peft_model, # Use the PEFT model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# --- 9. Start Training ---
print("\n--- Starting Training ---")
start_time = time.time()
train_result = trainer.train()
end_time = time.time()

print(f"\n--- Training Finished ---")
print(f"Training Time: {(end_time - start_time)/60:.2f} minutes")

# Log metrics
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
print("Training metrics saved.")



--- Creating PEFT Model ---
trainable params: 907,012 || all params: 125,537,288 || trainable%: 0.7225

--- Setting up Training ---


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



--- Starting Training ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2354,0.204269,0.932333
2,0.1858,0.20065,0.938
3,0.1781,0.190489,0.9405
4,0.1532,0.178691,0.941667
5,0.1448,0.197201,0.944
6,0.1947,0.18317,0.947
7,0.1008,0.185998,0.946333
8,0.1628,0.193182,0.948
9,0.1268,0.198615,0.946167
10,0.1766,0.200388,0.947167



--- Training Finished ---
Training Time: 112.08 minutes
***** train metrics *****
  epoch                    =        10.0
  total_flos               = 282250221GF
  train_loss               =      0.1623
  train_runtime            =  1:52:04.36
  train_samples_per_second =     169.533
  train_steps_per_second   =      10.596
Training metrics saved.


In [18]:

# --- 10. Evaluate Final Model on Eval Set ---
print("\n--- Evaluating Best Model on Evaluation Set ---")

# The trainer automatically loads the best model if load_best_model_at_end=True
# Let's explicitly print the trainable parameters of the model loaded by the trainer
print("Confirming trainable parameters of the final loaded model:")
# Ensure the model loaded by the trainer is the PEFT model
if hasattr(trainer.model, 'print_trainable_parameters'):
    trainer.model.print_trainable_parameters()
else:
    # This case shouldn't happen if setup is correct, but good to check
    total_params = sum(p.numel() for p in trainer.model.parameters())
    trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
    print(f"Model is not a PeftModel? Total params: {total_params:,}, Trainable params: {trainable_params:,}")


print("\nRunning final evaluation...")
eval_metrics = trainer.evaluate(eval_dataset=eval_dataset)

# Log and save evaluation metrics
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print(f"Evaluation Metrics: {eval_metrics}")

final_accuracy = eval_metrics.get("eval_accuracy", 0)
final_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad) # Re-calculate just to be sure

print("-" * 50)
print(f"Final Evaluation Accuracy: {final_accuracy:.4f}")
print(f"Final Model Trainable Parameters: {final_trainable_params:,}") # Print the final count clearly
print("-" * 50)

# Double-check against the limit
if final_trainable_params > MAX_TRAINABLE_PARAMS:
     print(f"WARNING: Final model trainable parameters ({final_trainable_params:,}) exceed the limit ({MAX_TRAINABLE_PARAMS:,})!")
elif final_trainable_params == 0:
     print(f"WARNING: Final model has 0 trainable parameters!")
else:
     print("Final model parameter count is within the limit.")






--- Evaluating Best Model on Evaluation Set ---
Confirming trainable parameters of the final loaded model:
trainable params: 907,012 || all params: 125,537,288 || trainable%: 0.7225

Running final evaluation...


***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =     0.9483
  eval_loss               =     0.1933
  eval_runtime            = 0:00:09.92
  eval_samples_per_second =    604.463
  eval_steps_per_second   =       9.47
Evaluation Metrics: {'eval_loss': 0.19326800107955933, 'eval_accuracy': 0.9483333333333334, 'eval_runtime': 9.9262, 'eval_samples_per_second': 604.463, 'eval_steps_per_second': 9.47, 'epoch': 10.0}
--------------------------------------------------
Final Evaluation Accuracy: 0.9483
Final Model Trainable Parameters: 907,012
--------------------------------------------------
Final model parameter count is within the limit.


In [19]:
# --- 12. Run Inference on Unlabelled Test Data ---
print("\n--- Running Inference on Unlabelled Test Data ---")

from tqdm import tqdm # Standard tqdm
from torch.utils.data import DataLoader # Ensure DataLoader is imported

def predict_on_test_dataset(model, tokenizer, test_dataset_path="test_unlabelled.pkl", output_directory=OUTPUT_DIR, batch_size=32):
    """Run inference on the test dataset and save predictions to CSV"""
    
    # Use the global OUTPUT_DIR defined earlier for consistency
    output_dir = output_directory
    os.makedirs(output_dir, exist_ok=True)

    print(f"Loading test dataset from: {test_dataset_path}")
    if not os.path.exists(test_dataset_path):
        print(f"ERROR: Test dataset file not found at {test_dataset_path}")
        return None

    # Load the test dataset - Assuming it's a datasets.Dataset pickled
    try:
        with open(test_dataset_path, 'rb') as f:
            test_dataset = pickle.load(f)
        # Verify it's a Dataset object
        if not isinstance(test_dataset, Dataset):
             print(f"Warning: Loaded object is type {type(test_dataset)}, not datasets.Dataset. Trying to convert from Pandas DataFrame.")
             # If it was saved as a DataFrame:
             if isinstance(test_dataset, pd.DataFrame):
                 if 'text' not in test_dataset.columns:
                     raise ValueError("Loaded DataFrame does not contain a 'text' column.")
                 test_dataset = Dataset.from_pandas(test_dataset)
             else:
                 raise TypeError("Loaded pickle file is not a datasets.Dataset or pandas.DataFrame.")

    except Exception as e:
        print(f"Error loading or processing pickled dataset: {e}")
        return None

    print(f"Test dataset loaded successfully. Type: {type(test_dataset)}")
    print(f"Dataset features: {test_dataset.column_names}")
    print(f"Number of examples: {len(test_dataset)}")

    if 'text' not in test_dataset.column_names:
        print("ERROR: 'text' column not found in the loaded test dataset.")
        return None

    print("Preprocessing test data...")
    # Tokenize the text data
    def preprocess(examples):
        # Ensure padding is consistent with training, e.g., max_length
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    # Important: Remove the 'text' column after tokenization
    # Keep other columns needed for ID mapping if they exist (e.g., 'ID')
    columns_to_remove = ['text']
    test_tokenized = test_dataset.map(preprocess, batched=True, remove_columns=columns_to_remove)

    # Set up device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval() # Set model to evaluation mode

    # Create data loader for batched inference
    # Use the same data_collator as training for consistency
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
    test_dataloader = DataLoader(test_tokenized, batch_size=batch_size, collate_fn=data_collator)

    all_predictions = []
    print(f"Running inference on {len(test_dataset)} examples using device: {device}")

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Inference"):
            # Move batch to device, ensure only expected inputs are passed
            batch_inputs = {k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'token_type_ids']}
            if not batch_inputs:
                print("Warning: No model inputs found in batch. Skipping.")
                continue

            outputs = model(**batch_inputs)
            predictions = outputs.logits.argmax(dim=-1)
            all_predictions.append(predictions.cpu().numpy()) # Move predictions to CPU before converting to numpy

    # Concatenate predictions from all batches
    if not all_predictions:
        print("ERROR: No predictions were generated.")
        return None

    all_predictions = np.concatenate(all_predictions)
    print(f"Inference completed. Generated {len(all_predictions)} predictions.")

    # Create output DataFrame
    # Check if the original dataset had an 'ID' column, otherwise generate sequential IDs
    if 'ID' in test_dataset.column_names:
         ids = test_dataset['ID']
         if len(ids) != len(all_predictions):
              print(f"Warning: Length mismatch between original IDs ({len(ids)}) and predictions ({len(all_predictions)}). Generating sequential IDs.")
              ids = range(len(all_predictions))
         else:
              print("Using 'ID' column from the original dataset.")
    else:
         print("Generating sequential IDs as 'ID' column was not found in the original dataset.")
         ids = range(len(all_predictions))

    output_df = pd.DataFrame({'ID': ids, 'Label': all_predictions})

    # Save to CSV
    submission_filename = os.path.join(output_dir, "inference_output.csv")
    output_df.to_csv(submission_filename, index=False)
    print(f"Predictions saved to {submission_filename}")

    return all_predictions


# === Call the Prediction Function ===
# Use trainer.model because it holds the best model loaded after training
# Pass the OUTPUT_DIR defined earlier in the notebook
predictions = predict_on_test_dataset(
    model=trainer.model,
    tokenizer=tokenizer,
    test_dataset_path="test_unlabelled.pkl", # Make sure this path is correct
    output_directory=OUTPUT_DIR,
    batch_size=training_args.per_device_eval_batch_size # Use eval batch size from training args
)

print("\n--- Project Notebook Finished ---")


--- Running Inference on Unlabelled Test Data ---
Loading test dataset from: test_unlabelled.pkl
Test dataset loaded successfully. Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset features: ['text']
Number of examples: 8000
Preprocessing test data...


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Running inference on 8000 examples using device: cuda


Inference: 100%|██████████| 125/125 [00:12<00:00,  9.78it/s]

Inference completed. Generated 8000 predictions.
Generating sequential IDs as 'ID' column was not found in the original dataset.
Predictions saved to results_lora_agnews/inference_output.csv

--- Project Notebook Finished ---



