## DistilRoBERTa

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install evaluate
!pip install transformers
!pip install 'accelerate>=0.26.0'


In [2]:
# import packages
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import random

In [3]:
# Load data 
train_df = pd.read_csv("../2_preprocessing/train_case_sensitive.csv")
val_df = pd.read_csv("../2_preprocessing/validation_case_sensitive.csv")
test_df = pd.read_csv("../2_preprocessing/test_case_sensitive.csv")

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
data = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

In [4]:
# Load DistilRoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("distilroberta-base")

# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples["script"], padding="max_length", truncation=True)

tokenized_datasets = data.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["script"])  # Remove raw text after tokenization
tokenized_datasets = tokenized_datasets.rename_column("passed_bechdel", "labels")  # Rename for Trainer compatibility
tokenized_datasets.set_format("torch")  # Use PyTorch tensors


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Map:   0%|          | 0/1418 [00:00<?, ? examples/s]

Map:   0%|          | 0/177 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

In [None]:
#### Overview of possible parameters
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",       # Evaluate every N steps
#     eval_steps=500,                   # Frequency of evaluation
#     save_steps=500,                   # Frequency of saving checkpoints
#     logging_steps=100,                # Log progress every 100 steps
#     learning_rate=5e-5,               # Adjust learning rate
#     num_train_epochs=5,               # Train for 5 epochs
#     per_device_train_batch_size=16,   # Train batch size
#     per_device_eval_batch_size=16,    # Eval batch size
#     gradient_accumulation_steps=2,    # Simulate larger batch size
#     weight_decay=0.01,                # Apply weight decay
#     fp16=True,                        # Enable mixed precision training
#     warmup_steps=500,                 # Learning rate warmup
#     lr_scheduler_type="cosine",       # Use cosine decay for learning rate
#     save_total_limit=2,               # Save only the last 2 checkpoints
#     load_best_model_at_end=True,      # Load best model at the end of training
#     metric_for_best_model="accuracy", # Use accuracy to determine the best model
#     report_to="none",                 # Disable reporting (e.g., WandB)
# )


In [5]:
#random.seed(42)
# Load RobertaForSequenceClassification with DistilRoBERTa's pre-trained weights
model = RobertaForSequenceClassification.from_pretrained("distilroberta-base", num_labels=2)  

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",       # Evaluate at the end of each epoch
    save_strategy="epoch",             # Save checkpoint at each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,       # Load the best model based on validation accuracy
    metric_for_best_model="accuracy",  # Select best model based on validation accuracy
    save_total_limit=2,                 # Save at most 2 checkpoints
    seed = 42
)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Validation dataset for evaluation
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


If code gives a weird error - restart kernel

In [6]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.550846,0.768362,0.765216,0.784657,0.768362
2,No log,0.528529,0.762712,0.762287,0.764996,0.762712
3,No log,0.525395,0.757062,0.757016,0.757404,0.757062




TrainOutput(global_step=135, training_loss=0.6129492865668402, metrics={'train_runtime': 60.3423, 'train_samples_per_second': 70.498, 'train_steps_per_second': 2.237, 'total_flos': 563516313882624.0, 'train_loss': 0.6129492865668402, 'epoch': 3.0})

In [7]:
# Save the model and tokenizer
model.save_pretrained("./fine_tuned_distilroberta")
tokenizer.save_pretrained("./fine_tuned_distilroberta")

# Load the model and tokenizer later
#from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
#model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_distilroberta")
#tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_distilroberta")


('./fine_tuned_distilroberta/tokenizer_config.json',
 './fine_tuned_distilroberta/special_tokens_map.json',
 './fine_tuned_distilroberta/vocab.json',
 './fine_tuned_distilroberta/merges.txt',
 './fine_tuned_distilroberta/added_tokens.json')

In [8]:
# Explicitly evaluate the model on the validation dataset after training
validation_results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("\nValidation Results After Training:")
validation_results





Validation Results After Training:


{'eval_loss': 0.5508459210395813,
 'eval_accuracy': 0.768361581920904,
 'eval_f1': 0.7652160506850663,
 'eval_precision': 0.7846567331644211,
 'eval_recall': 0.768361581920904,
 'eval_runtime': 0.7795,
 'eval_samples_per_second': 227.059,
 'eval_steps_per_second': 7.697,
 'epoch': 3.0}

In [9]:
# Explicitly evaluate the model on the test dataset after training
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("\nTest Results After Training:")
test_results




Test Results After Training:


{'eval_loss': 0.6314682960510254,
 'eval_accuracy': 0.6685393258426966,
 'eval_f1': 0.6601551001000001,
 'eval_precision': 0.6897462660955017,
 'eval_recall': 0.6685393258426966,
 'eval_runtime': 0.7759,
 'eval_samples_per_second': 229.413,
 'eval_steps_per_second': 7.733,
 'epoch': 3.0}

In [None]:
# adding to the performance df for plotting 
