# Starter Notebook

Install and import required libraries

In [1]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Import necessary libraries first
import os
import pandas as pd
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification, RobertaConfig
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
from datasets import load_dataset, Dataset, ClassLabel
import pickle
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load and prepare dataset (keeping your original code)
base_model = 'roberta-base'
dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Extract class info
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")
id2label = {i: label for i, label in enumerate(class_names)}
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Create a custom configuration with increased dropout
config = RobertaConfig.from_pretrained(
    base_model,
    num_labels=num_labels,
    id2label=id2label,
    hidden_dropout_prob=0.2,     # Increase from default 0.1
    attention_probs_dropout_prob=0.2,  # Increase from default 0.1
)

# Load model with custom configuration
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    config=config
)

# Create a custom model class with additional dropout layers
class EnhancedRobertaClassifier(nn.Module):
    def __init__(self, roberta_model):
        super().__init__()
        self.roberta = roberta_model
        # Additional dropout before classification layer
        self.extra_dropout = nn.Dropout(0.3)
        
    def forward(self, **inputs):
        outputs = self.roberta(**inputs)
        # Apply extra dropout to pooled output before classification
        if hasattr(outputs, 'logits'):
            # For sequence classification models
            return outputs
        # For base models that don't have classification head
        pooled_output = outputs.pooler_output
        pooled_output = self.extra_dropout(pooled_output)
        # Return modified outputs
        outputs.pooler_output = pooled_output
        return outputs

# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Apply our enhanced dropout wrapper
# No need to wrap when using RobertaForSequenceClassification with custom config
# enhanced_model = EnhancedRobertaClassifier(model)

# PEFT Config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    target_modules=["roberta.encoder.layer.0.attention.self.query",
    "roberta.encoder.layer.0.attention.self.key",
    "roberta.encoder.layer.0.attention.self.value",
    "roberta.encoder.layer.1.attention.self.query",
    "roberta.encoder.layer.5.attention.self.query",
    "roberta.encoder.layer.10.attention.self.query",
    "roberta.encoder.layer.10.output.dense",
    "roberta.encoder.layer.11.output.dense"],
    task_type="SEQ_CLS",
)

# Apply PEFT to the model
print("Applying PEFT adapters to the model...")
peft_model = get_peft_model(model, peft_config)
print("PEFT Model Configuration:")
peft_model.print_trainable_parameters()

# Enhanced Training Arguments
print("Defining Training Arguments...")
training_args = TrainingArguments(
    output_dir="./results_lora_enhanced_dropout",
    learning_rate=3e-5,             # Slightly increased learning rate
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,             # More epochs for better convergence
    weight_decay=0.1,
    eval_strategy="epoch",          # Corrected parameter name
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_dir='./logs_lora_enhanced_dropout',
    logging_steps=100,
    report_to="none",
    warmup_ratio=0.15,              # Increased warmup
    # bf16=True,                    # Uncomment if hardware supports it
    gradient_accumulation_steps=2,  # Added gradient accumulation for stability
    lr_scheduler_type="cosine",     # Changed to cosine scheduler
    metric_for_best_model="accuracy", # Optimize for accuracy
)

# Initialize Trainer
print("Initializing Trainer...")
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
result = trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Final Evaluation Results: {eval_results}")

# Save the model
trainer.save_model("./final_enhanced_roberta_lora_model")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']
Applying PEFT adapters to the model...
PEFT Model Configuration:
trainable params: 864,004 || all params: 125,512,712 || trainable%: 0.6884
Defining Training Arguments...


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Initializing Trainer...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3041,0.317594,0.901563,0.901378,0.902127,0.901563
2,0.2881,0.273533,0.917188,0.916939,0.917498,0.917188
3,0.2318,0.261088,0.928125,0.927931,0.927887,0.928125
4,0.2554,0.252758,0.928125,0.927972,0.927885,0.928125
5,0.2555,0.253756,0.926562,0.926527,0.92651,0.926562




Final Evaluation Results: {'eval_loss': 0.26108819246292114, 'eval_accuracy': 0.928125, 'eval_f1': 0.9279305319802875, 'eval_precision': 0.9278869218261899, 'eval_recall': 0.928125, 'eval_runtime': 2.7095, 'eval_samples_per_second': 236.206, 'eval_steps_per_second': 7.381, 'epoch': 5.0}


## Load Tokenizer and Preprocess Data

## Anything from here on can be modified

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

## Training Setup

### Start Training

## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [12]:
def classify(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [13]:
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")


 Class: 0, Label: World, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again.


'Business'

### Run Inference on eval_dataset

In [14]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [15]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

100%|██████████| 80/80 [00:02<00:00, 26.86it/s]

Evaluation Metric: {'accuracy': 0.928125}





### Run Inference on unlabelled dataset

In [16]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map: 100%|██████████| 8000/8000 [00:05<00:00, 1376.08 examples/s]


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [17]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)

df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})

df_output.to_csv(os.path.join(training_args.output_dir, "inference_outputkrittinfinal.csv"), index=False)
print("Inference complete. Predictions saved to inference_outputKrittin3.csv")


100%|██████████| 1000/1000 [00:28<00:00, 35.25it/s]

Inference complete. Predictions saved to inference_outputKrittin3.csv



