In [None]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from kaggle_secrets import UserSecretsClient
import wandb
# Authenticate with Weights & Biases. The API key is read from the Kaggle
# secret named "wandb", so it never appears in the notebook source.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")
wandb.login(key=secret_value_0)

In [None]:


# Set random seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (the CPU generator plus all CUDA devices).

    Args:
        seed (int): Seed value applied to each generator.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(42)

# 1. Load the AG News dataset and report the size of its train/test splits
dataset = load_dataset('ag_news')
print(f"Training set size: {len(dataset['train'])}")
print(f"Test set size: {len(dataset['test'])}")

# Read the number of classes and their names from the 'label' feature
num_labels = dataset['train'].features['label'].num_classes
class_names = dataset['train'].features['label'].names
print(f"Classes: {class_names}")

# 2. Load the pretrained 'roberta-base' tokenizer (no extra options are set)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 3. Enhanced preprocessing with dynamic max length based on data distribution
def analyze_text_lengths(dataset, sample_size=10000, percentile=95, encode=None):
    """Estimate a sequence-length cutoff from the leading rows of ``dataset``.

    Encodes the ``'text'`` entry of the first ``min(sample_size, len(dataset))``
    rows and returns the requested percentile of the resulting token counts.

    Args:
        dataset: Indexable, sized collection whose items expose a ``'text'`` entry.
        sample_size (int): Maximum number of leading rows to inspect.
        percentile (float): Percentile (0-100) of the token-count distribution
            to report. Defaults to 95.
        encode (callable, optional): Maps one text to a sequence of token ids.
            Defaults to the notebook-level ``tokenizer.encode``.

    Returns:
        int: The requested percentile of the token counts, truncated by ``int()``.

    Raises:
        ValueError: If there are no rows to sample (empty dataset or a
            non-positive ``sample_size``).
    """
    n_rows = min(sample_size, len(dataset))
    if n_rows <= 0:
        # Fail with a clear message instead of an opaque NaN/percentile error.
        raise ValueError(
            "analyze_text_lengths: no rows to sample "
            "(dataset is empty or sample_size <= 0)"
        )

    # Fall back to the notebook's global tokenizer only when no encoder is given.
    encode_fn = tokenizer.encode if encode is None else encode
    lengths = [len(encode_fn(dataset[i]['text'])) for i in range(n_rows)]
    return int(np.percentile(lengths, percentile))

# Measure token lengths on a sample of the training split. The result is only
# printed for information; it does not feed into max_length below.
optimal_length = analyze_text_lengths(dataset['train'])
print(f"Optimal sequence length (95th percentile): {optimal_length}")
# Disabled alternative that would derive the limit from the measurement:
# max_length = min(512, optimal_length)  # Cap at 512 tokens
# Truncation length actually used by preprocess_function (fixed value).
max_length = 128

def preprocess_function(examples):
    """Tokenize the ``'text'`` column of a batch of examples.

    Inputs are truncated to the notebook-level ``max_length``. No padding
    argument is passed here, and ``return_tensors=None`` is given explicitly.

    Args:
        examples: Mapping with a ``'text'`` entry, as supplied by ``Dataset.map``.

    Returns:
        The tokenizer's output for the given texts.
    """
    tokenizer_options = {
        'truncation': True,
        'max_length': max_length,
        'return_tensors': None,
    }
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize both splits, drop the raw text, and rename 'label' to 'labels'
# for model compatibility.
tokenized_train = (
    dataset['train']
    .map(preprocess_function, batched=True, remove_columns=['text'])
    .rename_column('label', 'labels')
)
tokenized_test = (
    dataset['test']
    .map(preprocess_function, batched=True, remove_columns=['text'])
    .rename_column('label', 'labels')
)

# Hold out 10% of the training split as a validation set (fixed seed)
tokenized_train, tokenized_val = tokenized_train.train_test_split(test_size=0.1, seed=42).values()

# 4. Data collator that pads each batch and returns PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# 5. Label <-> id lookup tables passed to the base model when it is created
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}


In [None]:
# Load pretrained 'roberta-base' with a sequence-classification head sized for
# num_labels classes; the label maps are stored alongside the model config.
base_model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

In [None]:
# 6. Optimized LoRA Configuration
# Calculate maximum rank possible within parameter budget
def calculate_max_rank(lora_targets, param_budget=1_000_000, *, hidden_size=768,
                       intermediate_size=3072, num_layers=12, initial_rank=16,
                       num_classes=None):
    """Estimate a LoRA rank per target module that fits a parameter budget.

    Every target starts at ``initial_rank``. A LoRA adapter on a ``d_in x d_out``
    matrix adds ``r * (d_in + d_out)`` parameters. If the estimated total
    exceeds ``param_budget``, all ranks are scaled down by the same factor
    (never below 1).

    Encoder targets (query/key/value, feed-forward projections, unknown names)
    are counted once per layer. Classifier-head targets are counted once,
    because the head is not repeated per layer.

    The estimate covers LoRA matrices only; other trainable parameters (bias
    terms, a fully trained classification head) are not included.

    NOTE(review): targets are classified by substring, in the order of the
    checks below. PEFT matches module names by suffix, so a target such as
    "output.dense" may also hit attention output projections; confirm before
    relying on the estimate for those targets.

    Args:
        lora_targets: Iterable of target-module name fragments.
        param_budget (int): Maximum number of LoRA parameters allowed.
        hidden_size (int): Model hidden width (768 for roberta-base).
        intermediate_size (int): Feed-forward inner width (3072 for roberta-base).
        num_layers (int): Number of encoder layers (12 for roberta-base).
        initial_rank (int): Rank assigned to each target before scaling.
        num_classes (int, optional): Output width of ``classifier.out_proj``.
            Defaults to the notebook-level ``num_labels``.

    Returns:
        dict: Mapping of each target name to its (possibly scaled) rank.
    """
    def _dims_and_copies(target):
        """Return (d_in, d_out, number of copies in the model) for a target."""
        if 'query' in target or 'key' in target or 'value' in target:
            return hidden_size, hidden_size, num_layers
        if 'intermediate.dense' in target:
            return hidden_size, intermediate_size, num_layers
        if 'output.dense' in target:
            return intermediate_size, hidden_size, num_layers
        if 'classifier.dense' in target:
            return hidden_size, hidden_size, 1
        if 'classifier.out_proj' in target:
            # Only touch the global when the caller did not supply the width.
            n_out = num_labels if num_classes is None else num_classes
            return hidden_size, n_out, 1
        # Unknown target: assume a square, per-layer projection.
        return hidden_size, hidden_size, num_layers

    ranks = {}
    total_params = 0
    for target in lora_targets:
        d_in, d_out, copies = _dims_and_copies(target)
        ranks[target] = initial_rank
        total_params += copies * initial_rank * (d_in + d_out)

    # Shrink all ranks uniformly when the estimate overshoots the budget.
    if total_params > param_budget:
        scale_factor = param_budget / total_params
        ranks = {name: max(1, int(rank * scale_factor)) for name, rank in ranks.items()}

    return ranks

# Modules that receive LoRA adapters. Only the attention projections are
# active; the other candidates are kept below as disabled options.
lora_targets = [
    "query", "key","value"  # Attention query/key/value projections
    # "intermediate.dense",      # Feedforward up-projection
    # "output.dense",           # Feedforward down-projection
    # "classifier.dense",       # Classification head
    # "classifier.out_proj"     # Final projection
]

# Estimate ranks that fit the parameter budget. The result is printed for
# reference only: LoraConfig below uses a hard-coded rank, not base_rank.
target_ranks = calculate_max_rank(lora_targets)
base_rank = min(target_ranks.values())  # Smallest rank across all targets
print(f"base rank: {base_rank}")

# LoRA configuration (rank and alpha are fixed values chosen by hand)
peft_config = LoraConfig(
    r=6,
    lora_alpha=14,  # LoRA scaling numerator (update is scaled by lora_alpha / r)
    lora_dropout=0.1,  # Dropout applied on the LoRA branch
    bias="lora_only",  # Train bias terms alongside LoRA matrices
    target_modules=lora_targets,
    task_type="SEQ_CLS",
    fan_in_fan_out=False
)

# Wrap the base model with the adapters and report the trainable parameter count
peft_model = get_peft_model(base_model, peft_config)
print("\nTrainable parameters information:")
peft_model.print_trainable_parameters()

In [None]:
# 7. Define evaluation metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted
            class).

    Returns:
        dict: Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Tuple layout: (precision, recall, f1, support); support is not reported.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted_scores[2],
        'precision': weighted_scores[0],
        'recall': weighted_scores[1],
    }

# 8. Training configuration: evaluate and checkpoint every 200 steps, and
# reload the checkpoint with the best validation accuracy when training ends.
training_args = TrainingArguments(
    output_dir="results/ag_news_roberta_lora",
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    logging_steps=50,
    learning_rate=5e-04,
    lr_scheduler_type="linear",  # Linear schedule (not cosine), used with the warmup below
    warmup_ratio=0.06,
    num_train_epochs=1,  # Single pass over the training split
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    # fp16=True,  # Disabled option: mixed precision
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # gradient_accumulation_steps=2,  # Disabled option: gradient accumulation
    remove_unused_columns=True,
    group_by_length=True,  # Improve efficiency with similar-length samples
    optim="adamw_torch_fused",  # Use fused optimizer
    # early_stopping_patience=1  (early stopping is configured via the Trainer callback instead)
)

from transformers import EarlyStoppingCallback

# 9. Build the trainer on the LoRA model with the train/validation splits.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` - confirm against the installed version.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]  # Early stopping on the tracked metric, patience of 1 evaluation
)

# Train the model
trainer.train()

In [None]:
# # 10. Evaluate on test set
# test_results = trainer.evaluate(tokenized_test)
# print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")

# # 11. Save the model
# trainer.save_model("final_model")

# # 12. Function to run inference on new data
# def predict_class(text, model=peft_model, tokenizer=tokenizer):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
#     inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
#     with torch.no_grad():
#         outputs = model(**inputs)
    
#     logits = outputs.logits
#     predicted_class_id = logits.argmax(-1).item()
#     return id2label[predicted_class_id]

# # Example inference
# example_texts = [
#     "New trade agreement between US and China set to boost global markets",
#     "Manchester United wins 2-0 against Liverpool in Premier League clash",
#     "Tech giant announces new smartphone with revolutionary camera system",
#     "Scientists discover potential vaccine for previously incurable disease"
# ]

# for text in example_texts:
#     print(f"Text: {text[:50]}...\nPredicted class: {predict_class(text)}\n")



In [None]:
# # 13. Inference on test set for submission
# def generate_submission(model, test_dataset, output_file="submission.csv"):
#     # Create test dataloader
#     test_dataloader = torch.utils.data.DataLoader(
#         test_dataset, 
#         batch_size=64, 
#         collate_fn=data_collator
#     )
    
#     model.eval()
#     all_preds = []
    
#     for batch in test_dataloader:
#         batch = {k: v.to(model.device) for k, v in batch.items()}
#         with torch.no_grad():
#             outputs = model(**batch)
        
#         preds = outputs.logits.argmax(dim=-1).cpu().numpy()
#         all_preds.extend(preds)
    
#     # Create submission file
#     submission_df = pd.DataFrame({
#         'id': range(len(all_preds)),
#         'label': all_preds
#     })
    
#     submission_df.to_csv(output_file, index=False)
#     print(f"Submission file created: {output_file}")

# # Generate submission file
# generate_submission(peft_model, tokenized_test)

In [None]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Run batched inference with a model and optionally score its accuracy.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    eval mode. Each batch is moved to the same device and passed to the model
    as keyword arguments; the predicted class is the arg-max of the returned
    ``logits`` over the last dimension.

    Args:
        inference_model: Model whose output exposes ``.logits``.
        dataset: Dataset to iterate over with a ``DataLoader``.
        labelled (bool): If True, each batch must contain a ``"labels"`` entry
                         and accuracy is computed. If False, only predictions
                         are returned.
        batch_size (int): Batch size for inference.
        data_collator: Collate function for the ``DataLoader``. If None, the
                       default collate_fn is used.

    Returns:
        If labelled is True, a tuple ``(metrics, predictions)``.
        If labelled is False, the predictions tensor only.
    """
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    batch_predictions = []
    # The accuracy metric is only needed when ground-truth labels are present.
    metric = evaluate.load('accuracy') if labelled else None

    for raw_batch in tqdm(loader):
        # Move every tensor of the batch onto the model's device
        on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        with torch.no_grad():
            outputs = inference_model(**on_device)
        predicted_ids = outputs.logits.argmax(dim=-1)
        predicted_cpu = predicted_ids.cpu()
        batch_predictions.append(predicted_cpu)

        if metric is not None:
            # Labels are expected under the "labels" key.
            references = on_device["labels"]
            metric.add_batch(
                predictions=predicted_cpu.numpy(),
                references=references.cpu().numpy(),
            )

    # Stitch the per-batch predictions into a single tensor
    all_predictions = torch.cat(batch_predictions, dim=0)

    if metric is None:
        return all_predictions

    eval_metric = metric.compute()
    print("Evaluation Metric:", eval_metric)
    return eval_metric, all_predictions

In [None]:
# Load the unlabelled test data and tokenize it with the same preprocessing
# used for training.
# NOTE(review): unpickling executes arbitrary code - only load this file from a
# trusted source.
# Assumes the pickled object supports `.map(..., batched=True)` like a Hugging
# Face Dataset and has a "text" column - TODO confirm.
unlabelled_dataset = pd.read_pickle("/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess_function, batched=True, remove_columns=["text"])
unlabelled_dataset

In [None]:
# Directory the prediction CSV files are written to
output_dir = "/kaggle/working/"

In [None]:
# Predict labels for the unlabelled test set (no metrics) and write them to CSV
preds = evaluate_model(
    peft_model,
    test_dataset,
    labelled=False,
    batch_size=8,
    data_collator=data_collator,
)

# One row per example: running index as ID, predicted class id as Label
submission_columns = {
    'ID': range(len(preds)),
    'Label': preds.numpy(),
}
df_output = pd.DataFrame(submission_columns)
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

In [None]:
# Write the current df_output to a second file name. No inference is run in
# this cell, so the file holds whatever df_output contained when the cell ran.
# NOTE(review): the "no_padding" name suggests a separate experiment - confirm
# that df_output was regenerated before relying on this file.
df_output.to_csv(os.path.join(output_dir,"inference_output_no_padding.csv"), index=False)
print("Inference complete. Predictions saved to inference_output_no_padding.csv")