## DistilRoBERTa

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate>={0.26.0}
!pip install evaluate
!pip install transformers
!pip install 'accelerate>=0.26.0'


In [None]:
!pip show transformers accelerate torch

In [4]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import random

In [5]:
# Load data 
train_df = pd.read_csv("../2_preprocessing/train_case_sensitive.csv")
val_df = pd.read_csv("../2_preprocessing/validation_case_sensitive.csv")

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
data = DatasetDict({"train": train_dataset, "validation": val_dataset})

In [6]:
# Load DistilRoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("distilroberta-base")

# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples["script"], padding="max_length", truncation=True)

tokenized_datasets = data.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["script"])  # Remove raw text after tokenization
tokenized_datasets = tokenized_datasets.rename_column("passed_bechdel", "labels")  # Rename for Trainer compatibility
tokenized_datasets.set_format("torch")  # Use PyTorch tensors


Map:   0%|          | 0/1418 [00:00<?, ? examples/s]

Map:   0%|          | 0/177 [00:00<?, ? examples/s]

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",       # Evaluate every N steps
    eval_steps=500,                   # Frequency of evaluation
    save_steps=500,                   # Frequency of saving checkpoints
    logging_steps=100,                # Log progress every 100 steps
    learning_rate=5e-5,               # Adjust learning rate
    num_train_epochs=5,               # Train for 5 epochs
    per_device_train_batch_size=16,   # Train batch size
    per_device_eval_batch_size=16,    # Eval batch size
    gradient_accumulation_steps=2,    # Simulate larger batch size
    weight_decay=0.01,                # Apply weight decay
    fp16=True,                        # Enable mixed precision training
    warmup_steps=500,                 # Learning rate warmup
    lr_scheduler_type="cosine",       # Use cosine decay for learning rate
    save_total_limit=2,               # Save only the last 2 checkpoints
    load_best_model_at_end=True,      # Load best model at the end of training
    metric_for_best_model="accuracy", # Use accuracy to determine the best model
    report_to="none",                 # Disable reporting (e.g., WandB)
)


In [7]:
#random.seed(42)
# Load RobertaForSequenceClassification with DistilRoBERTa's pre-trained weights
model = RobertaForSequenceClassification.from_pretrained("distilroberta-base", num_labels=2)  

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",       # Evaluate at the end of each epoch
    save_strategy="epoch",             # Save checkpoint at each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,       # Load the best model based on validation accuracy
    metric_for_best_model="accuracy",  # Select best model based on validation accuracy
    save_total_limit=2,                 # Save at most 2 checkpoints
    seed = 42
)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Validation dataset for evaluation
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


If code gives a weird error - restart kernel

In [None]:
# Train the model
trainer.train()

# Explicitly evaluate the model on the validation dataset after training
validation_results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("\nValidation Results After Training:")
validation_results


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.577126,0.728814,0.722114,0.754702,0.728814
2,No log,0.576338,0.706215,0.700474,0.722151,0.706215
3,No log,0.53209,0.762712,0.762712,0.762712,0.762712



Validation Results After Training:


{'eval_loss': 0.5320897698402405,
 'eval_accuracy': 0.7627118644067796,
 'eval_f1': 0.7627118644067796,
 'eval_precision': 0.7627118644067796,
 'eval_recall': 0.7627118644067796,
 'eval_runtime': 1.2661,
 'eval_samples_per_second': 139.801,
 'eval_steps_per_second': 9.478,
 'epoch': 3.0}

In [1]:
validation_results

NameError: name 'validation_results' is not defined