In [1]:
import pandas as pd
import numpy as np
import torch
import mlflow
import os 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# 1. Experiment Settings
# Set the MLflow tracking URI to save logs in the parent directory
mlflow.set_tracking_uri("file:../mlruns")

# Define the experiment name
experiment_name = "Spoiler_Detection_DistilBERT"

# Create the experiment if it doesn't exist, otherwise reuse it.
# get_experiment_by_name returns None when no experiment has this name, so the
# returned object is checked directly instead of relying on an AttributeError.
# (The previous version referenced an undefined `experiment` variable, which
# raised NameError on every run after the experiment had been created.)
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    print(f"Creating new experiment: {experiment_name}")
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id
    print(f"Using existing experiment ID: {experiment_id}")

# Set the active experiment
mlflow.set_experiment(experiment_name)

# Pretrained checkpoint to fine-tune and location of the cleaned review data
MODEL_NAME = "distilbert-base-uncased"
DATA_PATH = "../data/cleaned_data.csv"

Creating new experiment: Spoiler_Detection_DistilBERT


  return FileStore(store_uri, store_uri)


In [2]:
class SpoilerDataset(Dataset):
    """
    Custom Dataset class for handling encoded text and labels for PyTorch.

    Args:
        encodings: Mapping from feature name (must include "input_ids") to a
            per-example indexable sequence, e.g. the output of a tokenizer call.
        labels: Optional per-example labels (list, NumPy array, tensor, ...).
            When None or empty, items are returned without a "labels" entry.
    """
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example `idx`, plus "labels" if available."""
        # Retrieve every encoded feature (token IDs, attention mask, ...) for the given index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Include labels if they exist (for training).
        # An explicit None/length check is used instead of `if self.labels:` because
        # truthiness is ambiguous (raises) for multi-element arrays, tensors and Series.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of the "input_ids" feature."""
        return len(self.encodings["input_ids"])

def compute_metrics(pred):
    """
    Build the evaluation metrics dictionary from a prediction object.

    Reads `pred.label_ids` as the reference labels and takes the argmax of
    `pred.predictions` over the last axis as the predicted class, then returns
    the accuracy and F1 score under the keys "accuracy" and "f1".
    """
    true_labels = pred.label_ids
    # Highest-scoring class index per example
    predicted_classes = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(true_labels, predicted_classes),
        "f1": f1_score(true_labels, predicted_classes),
    }

In [3]:
# Select Device (MPS for Mac, CUDA for NVIDIA, or CPU)
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("🚀 Device: Mac GPU (MPS) is being used! Fast training mode.")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("🚀 Device: NVIDIA GPU (CUDA) is being used!")
else:
    device = torch.device("cpu")
    print("⚠️ Device: CPU is being used. Training might be slow.")

# Load Data
print("Loading data...")
try:
    df = pd.read_csv(DATA_PATH)
    # Drop rows that are missing either the review text or its label
    df = df.dropna(subset=["cleaned_text", "label"])
except FileNotFoundError:
    print(f"Error: Data file not found at {DATA_PATH}.")
    # Stop execution if data is missing
    raise

# Prepare X and y
X = df["cleaned_text"].tolist()
# If the raw label column contained any NaN, pandas parsed it as float64 and it
# stays float after dropna. Float labels would become float tensors, which the
# sequence-classification head does not treat as class indices, so cast to int.
y = df["label"].astype(int).tolist()

# Split into Training and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load Tokenizer
print("Loading tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Tokenize Data
# Note: inputs are truncated to at most 256 tokens, so text beyond that limit is dropped.
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=256)

# Create Dataset Objects
train_dataset = SpoilerDataset(train_encodings, y_train)
test_dataset = SpoilerDataset(test_encodings, y_test)

🚀 Device: Mac GPU (MPS) is being used! Fast training mode.
Loading data...
Loading tokenizer...


In [4]:
def train_and_evaluate(seed=42):
    """
    Fine-tune DistilBERT for binary spoiler classification and track the run in MLflow.

    Inside a single MLflow run this loads the pretrained checkpoint named by
    MODEL_NAME, trains it on `train_dataset`, evaluates on `test_dataset`,
    prints and logs accuracy and F1, and logs the model plus tokenizer as an
    MLflow transformers artifact.

    Args:
        seed: Random seed applied before the model is instantiated and passed
            to TrainingArguments, so repeated runs start from the same
            classifier-head initialisation.
    """
    with mlflow.start_run():
        # Seed BEFORE building the model: the classification head is not part of the
        # pretrained checkpoint and is randomly initialised at load time, so seeding
        # only through the Trainer would come too late for reproducible weights.
        torch.manual_seed(seed)

        # Load Model
        print("Downloading DistilBERT model...")
        model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
        model.to(device)

        # Define Training Arguments
        # NOTE(review): warmup_steps=500 looks large relative to the total number of
        # optimisation steps for a small dataset - confirm against len(train_dataset).
        training_args = TrainingArguments(
            output_dir="../results",
            num_train_epochs=3,              # Increased to 3 for better convergence
            per_device_train_batch_size=8,   # Reduced to 8 to fit larger max_length in memory
            per_device_eval_batch_size=8,
            warmup_steps=500,
            learning_rate=2e-5,              # Lower learning rate for fine-tuning
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=10,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,     # Restore the best checkpoint once training finishes
            seed=seed,
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=compute_metrics
        )

        # Start Training
        print("Training starting! This might take 15-20 mins...")
        trainer.train()

        # Evaluate
        print("Evaluating on test set...")
        results = trainer.evaluate()

        print("\n--- RESULTS ---")
        print(f"Accuracy: {results['eval_accuracy']:.4f}")
        print(f"F1 Score: {results['eval_f1']:.4f}")

        # Log Metrics to MLflow
        mlflow.log_metric("eval_accuracy", results["eval_accuracy"])
        mlflow.log_metric("eval_f1", results["eval_f1"])

        # Log Model to MLflow
        # Use trainer.model so the logged weights are the ones the Trainer holds
        # after restoring the best checkpoint.
        mlflow.transformers.log_model(
            transformers_model={"model": trainer.model, "tokenizer": tokenizer},
            artifact_path="distilbert_spoiler_model",
            task="text-classification"
        )
        print("\nModel saved to MLflow successfully!")

# Run the training function
if __name__ == "__main__":
    train_and_evaluate()

Downloading DistilBERT model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training starting! This might take 15-20 mins...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6369,0.638866,0.6825,0.709382
2,0.6299,0.577577,0.71,0.725118
3,0.4926,0.610478,0.675,0.670051




Evaluating on test set...




Device set to use mps:0



--- RESULTS ---
Accuracy: 0.7100
F1 Score: 0.7251

Model saved to MLflow successfully!
