# DistilBERT

## Setup

### Packages Setup

#### Install Packages

In [1]:
%pip install datasets
%pip install evaluate
%pip install fastapi
%pip install gdown
%pip install hf_xet
%pip install pandas
%pip install matplotlib
%pip install numpy
%pip install "optimum[onnxruntime]" onnxruntime-gpu
%pip install optuna
%pip install scikit-learn
%pip install tensorflow
%pip install tf-keras
%pip install transformers
%pip install transformers[torch]
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
%pip install uvicorn

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting optimum[onnxruntime]
  Downloading optimum-2.0.0-py3-none-any.whl.metadata (14 kB)
Collecting optimum-onnx[onnxruntime] (from optimum[onnxruntime])
  Downloading optimum_onnx-0.0.3-py3-none-any.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime-gpu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-gpu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting transformers>=4.29 (from op

### Import Packages

In [2]:
import evaluate
import gdown
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import tensorflow as tf
import torch
from datasets import Dataset, Value
from fastapi import FastAPI
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.onnxruntime import ORTQuantizer, QuantizationConfig
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef, balanced_accuracy_score, brier_score_loss
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import DataCollatorWithPadding
from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader

Multiple distributions found for package optimum. Picked distribution: optimum-onnx


In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used.")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and being used.


### Data Setup

#### Read Data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
train_cleaned = pd.read_json("/content/drive/MyDrive/CS3244/CS3244_Project/IMDB_reviews_train_cleaned.json")
test = pd.read_json("/content/drive/MyDrive/CS3244/CS3244_Project/IMDB_reviews_test.json")

train_cleaned.head()
test.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
391376,24 October 2006,tt0424136,ur0023796,False,Most films do best if you know next to nothing...,9,Hard Candy breaks minds as hard candy breaks t...
573647,2 September 2001,tt0139239,ur1235973,False,Go has not gotten even half of the praise it d...,9,One of the most under appreciated films in his...
426616,3 March 2011,tt0480249,ur24994931,False,Personally I really enjoyed this movie from th...,7,Why Do People Hate This Movie?
493566,11 March 2004,tt0103874,ur0395246,False,"As far as videos go, this is one of the few th...",6,Aye shoood tayhke thee trahyne tew Byoodapest
174694,11 May 2013,tt1931533,ur17825945,True,While trying a little too hard to be Adaptatio...,4,Unlucky Number Seven


#### Feature Standardization

In [7]:
train_cleaned['is_spoiler'] = train_cleaned['is_spoiler'].astype('int64')
train_cleaned = train_cleaned.rename(columns={'is_spoiler': 'labels', 'review_text': 'text'})

test['is_spoiler'] = test['is_spoiler'].astype('int64')
test = train_cleaned.rename(columns={'is_spoiler': 'labels', 'review_text': 'text'})

In [8]:
train_cleaned.loc[0]

Unnamed: 0,0
review_date,10 February 2006
movie_id,tt0111161
user_id,ur1898687
labels,1
text,oscar year shawshank redemption written direct...
rating,10
review_summary,A classic piece of unforgettable film-making.


In [9]:
train_cleaned.dtypes

Unnamed: 0,0
review_date,object
movie_id,object
user_id,object
labels,int64
text,object
rating,int64
review_summary,object


#### Balancing Data

In [10]:
# Check the original distribution of the combined groups
group_counts = train_cleaned.groupby(['labels']).size()
print("Original joint counts:\n", group_counts)

# Determine the minimum and maximum size for balancing all groups
min_group_size = group_counts.min()
max_group_size = group_counts.max()
print(f"\nTarget minimum sample size per joint group: {min_group_size}")
print(f"\nTarget maximum sample size per joint group: {max_group_size}")


Original joint counts:
 labels
0    338245
1    120885
dtype: int64

Target minimum sample size per joint group: 120885

Target maximum sample size per joint group: 338245


In [12]:
# Undersample each train group to the minimum size found
undersampled_train = train_cleaned.groupby(['labels']).apply(
    lambda x: x.sample(n=min_group_size, replace=False, random_state=3244)
).reset_index(drop=True)

print("Undersampled Train shape:", undersampled_train.shape)
print("New joint counts:\n", undersampled_train.groupby(['labels']).size())


  undersampled_train = train_cleaned.groupby(['labels']).apply(


Undersampled Train shape: (241770, 7)
New joint counts:
 labels
0    120885
1    120885
dtype: int64


In [13]:
# Undersample each test group to the minimum size found
undersampled_test = test.groupby(['labels']).apply(
    lambda x: x.sample(n=test_min_group_size, replace=False, random_state=3244)
).reset_index(drop=True)

print("Undersampled Test shape:", undersampled_test.shape)
print("New joint counts:\n", undersampled_test.groupby(['labels']).size())

  undersampled_test = test.groupby(['labels']).apply(


Undersampled Test shape: (241770, 7)
New joint counts:
 labels
0    120885
1    120885
dtype: int64


In [None]:
# Oversample each train group to the minimum size found
oversampled_train = train_cleaned.groupby(['labels']).apply(
    lambda x: x.sample(n=max_group_size, replace=True, random_state=3244)
).reset_index(drop=True)

print("Oversampled Train shape:", oversampled_train.shape)
print("New joint counts:\n", oversampled_train.groupby(['labels']).size())


  oversampled_train = train_cleaned.groupby(['labels']).apply(


Oversampled Train shape: (676490, 7)
New joint counts:
 labels
0    338245
1    338245
dtype: int64


In [None]:
# Oversample each test group to the minimum size found
oversampled_test = test.groupby(['labels']).apply(
    lambda x: x.sample(n=test_max_group_size, replace=True, random_state=3244)
).reset_index(drop=True)

print("Oversampled Test shape:", oversampled_test.shape)
print("New joint counts:\n", oversampled_test.groupby(['labels']).size())

  oversampled_test = test.groupby(['labels']).apply(


Oversampled Test shape: (676490, 7)
New joint counts:
 labels
0    338245
1    338245
dtype: int64


#### Convert to Dataframe

In [14]:
undersampled_train_dataset = Dataset.from_pandas(undersampled_train[['text', 'labels']])
undersampled_train_dataset = undersampled_train_dataset.cast_column('labels', Value('int64'))
undersampled_test_dataset = Dataset.from_pandas(undersampled_test[['text', 'labels']])
undersampled_test_dataset = undersampled_test_dataset.cast_column('labels', Value('int64'))
print(undersampled_train_dataset)
print(undersampled_test_dataset)

Casting the dataset:   0%|          | 0/241770 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/241770 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels'],
    num_rows: 241770
})
Dataset({
    features: ['text', 'labels'],
    num_rows: 241770
})


In [None]:
oversampled_train_dataset = Dataset.from_pandas(oversampled_train[['text', 'labels']])
oversampled_train_dataset = oversampled_train_dataset.cast_column('labels', Value('int64'))
oversampled_test_dataset = Dataset.from_pandas(oversampled_test[['text', 'labels']])
oversampled_test_dataset = oversampled_test_dataset.cast_column('labels', Value('int64'))
print(oversampled_train_dataset)
print(oversampled_test_dataset)

Casting the dataset:   0%|          | 0/676490 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/676490 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels'],
    num_rows: 676490
})
Dataset({
    features: ['text', 'labels'],
    num_rows: 676490
})


### Model Setup

In [15]:
model_name = "distilbert-base-uncased"
num_labels = 2  # For spoiler/non-spoiler classification
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, problem_type="single_label_classification")
distilbert_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenizer Setup

In [16]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Metrics Setup

In [None]:
acc_m = evaluate.load("accuracy")
prec_m = evaluate.load("precision")
rec_m = evaluate.load("recall")
f1_m = evaluate.load("f1")
roc_m = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    e_x = np.exp(logits - logits.max(axis=1, keepdims=True))
    prob_pos = (e_x / e_x.sum(axis=1, keepdims=True))[:, 1]
    return {
        "accuracy": acc_m.compute(predictions=preds, references=labels)["accuracy"],
        "precision": prec_m.compute(predictions=preds, references=labels, average="binary")["precision"],
        "recall": rec_m.compute(predictions=preds, references=labels, average="binary")["recall"],
        "f1": f1_m.compute(predictions=preds, references=labels, average="binary")["f1"],
        "f1_macro": f1_m.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": f1_m.compute(predictions=preds, references=labels, average="weighted")["f1"],
        "f2": f1_m.compute(predictions=preds, references=labels, average="binary", beta=2)["fmeasure"],
        "roc_auc": roc_m.compute(references=labels, prediction_scores=prob_pos)["roc_auc"],
        "mcc": matthews_corrcoef(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        "brier": brier_score_loss(labels, prob_pos),
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

## Processing

### Undersampling


#### Data Processing

##### Tokenize *Data*

Tokenize the data and rename is_spoiler to labels so the transformer model can recognize as y value.

In [18]:
undersampled_tokenized_train_eval = undersampled_train_dataset.map(tokenize, batched=True, )
undersampled_tokenized_train_eval = undersampled_tokenized_train_eval.remove_columns(["text"])
undersampled_tokenized_train_eval.set_format(type='torch')

undersampled_tokenized_test = undersampled_test_dataset.map(tokenize, batched=True)
undersampled_tokenized_test = undersampled_tokenized_test.remove_columns(["text"])
undersampled_tokenized_test.set_format(type='torch')

Map:   0%|          | 0/241770 [00:00<?, ? examples/s]

Map:   0%|          | 0/241770 [00:00<?, ? examples/s]

In [19]:
undersampled_first = undersampled_tokenized_train_eval[0]
print(type(undersampled_first['labels']), undersampled_first['labels']) # with set_format('torch'), this is a torch.Tensor

<class 'torch.Tensor'> tensor(0)


In [20]:
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)
undersampled_loader = DataLoader(undersampled_tokenized_train_eval, batch_size=16, collate_fn=collator)
undersampled_batch = next(iter(undersampled_loader))
print(undersampled_batch['labels'].dtype, undersampled_batch['labels'].shape) # should be torch.int64 (Long) and shape [batch]

torch.int64 torch.Size([16])


##### Split Train and Eval Data

In [21]:
undersampled_split_datasets = undersampled_tokenized_train_eval.train_test_split(test_size=0.2, seed=42)

undersampled_tokenized_train = undersampled_split_datasets['train']
undersampled_tokenized_eval = undersampled_split_datasets['test']

### Oversampling

#### Data Processing

##### Tokenize Data

Tokenize the data and rename is_spoiler to labels so the transformer model can recognize as y value.

In [None]:
oversampled_tokenized_train_eval = oversampled_train_dataset.map(tokenize, batched=True, )
oversampled_tokenized_train_eval = oversampled_tokenized_train_eval.remove_columns(["text"])
oversampled_tokenized_train_eval.set_format(type='torch')

oversampled_tokenized_test = oversampled_test_dataset.map(tokenize, batched=True)
oversampled_tokenized_test = oversampled_tokenized_test.remove_columns(["text"])
oversampled_tokenized_test.set_format(type='torch')

Map:   0%|          | 0/676490 [00:00<?, ? examples/s]

Map:   0%|          | 0/676490 [00:00<?, ? examples/s]

In [None]:
oversampled_first = oversampled_tokenized_train_eval[0]
print(type(oversampled_first['labels']), oversampled_first['labels']) # with set_format('torch'), this is a torch.Tensor

<class 'torch.Tensor'> tensor(0)


In [None]:
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)
oversampled_loader = DataLoader(oversampled_tokenized_train_eval, batch_size=16, collate_fn=collator)
oversampled_batch = next(iter(oversampled_loader))
print(oversampled_batch['labels'].dtype, oversampled_batch['labels'].shape) # should be torch.int64 (Long) and shape [batch]

torch.int64 torch.Size([16])


##### Split Train and Eval Data

In [None]:
oversampled_split_datasets = oversampled_tokenized_train_eval.train_test_split(test_size=0.2, seed=42)

oversampled_tokenized_train = oversampled_split_datasets['train']
oversampled_tokenized_eval = oversampled_split_datasets['test']

## Modeling

### Undersampling

#### Model Initialization

In [22]:
training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=3,
  per_device_train_batch_size=32, # adjust based on GPU memory
  per_device_eval_batch_size=32,
  eval_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
  fp16=True, # enables mixed precision on GPU
  dataloader_num_workers=2, # speed up input pipeline
  logging_steps=200,
  report_to="none",
)

In [23]:
undersampled_trainer = Trainer(
  model=distilbert_model,
  args=training_args,
  train_dataset=undersampled_tokenized_train,
  eval_dataset=undersampled_tokenized_eval,
  tokenizer=tokenizer,
  data_collator=collator,
  compute_metrics=compute_metrics,
)

  undersampled_trainer = Trainer(


#### Train Model

In [24]:
undersampled_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,F1 Weighted,Roc Auc,Mcc,Balanced Accuracy,Brier
1,0.5592,0.550305,0.712102,0.73266,0.663438,0.696333,0.711324,0.711398,0.791806,0.425812,0.711864,0.186404
2,0.4967,0.556085,0.713736,0.705322,0.729362,0.717141,0.713695,0.713678,0.793637,0.4278,0.713813,0.18801
3,0.4133,0.614859,0.704761,0.701907,0.70675,0.70432,0.70476,0.704762,0.781111,0.409537,0.70477,0.204053


TrainOutput(global_step=18135, training_loss=0.49589281736867696, metrics={'train_runtime': 6880.5928, 'train_samples_per_second': 84.331, 'train_steps_per_second': 2.636, 'total_flos': 7.686394313534669e+16, 'train_loss': 0.49589281736867696, 'epoch': 3.0})

#### Save Model

In [30]:
undersampled_model_save_path = '/content/drive/MyDrive/CS3244/CS3244_Project/undersampled_distilbert_base_trained.h5'
undersampled_trainer.save_model(undersampled_model_save_path)

### Oversampling


#### Model Initialization

In [None]:
training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=3,
  per_device_train_batch_size=32, # adjust based on GPU memory
  per_device_eval_batch_size=32,
  eval_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
  fp16=True, # enables mixed precision on GPU
  dataloader_num_workers=2, # speed up input pipeline
  logging_steps=200,
  report_to="none",
)

In [None]:
oversampled_trainer = Trainer(
  model=distilbert_model,
  args=training_args,
  train_dataset=oversampled_tokenized_train,
  eval_dataset=oversampled_tokenized_eval,
  tokenizer=tokenizer,
  data_collator=collator,
  compute_metrics=compute_metrics,
)

  oversampled_trainer = Trainer(


#### Train Model

In [None]:
oversampled_trainer.train()

Epoch,Training Loss,Validation Loss


#### Save Model

In [None]:
oversampled_model_save_path = '/content/drive/MyDrive/CS3244/CS3244_Project/oversampled_distilbert_base_trained.h5'
oversampled_trainer.save_model(oversampled_model_save_path)

## Evaluate Model

Evaluate model with unseen test data.

### Undersampling

#### Test Predicting by Loading Saved Model

In [None]:
undersampled_model_loaded = AutoModelForSequenceClassification.from_pretrained(undersampled_model_save_path)
undersampled_trainer_loaded = Trainer(model = undersampled_model_loaded)
undersampled_test_results = undersampled_trainer_loaded.predict(undersampled_tokenized_test)

#### Predict Test Data

In [26]:
undersampled_predictions = undersampled_trainer.predict(undersampled_tokenized_test)
# Process predictions to determine spoiler/non-spoiler

#### Evaluate Predictions

print("Test Metrics:", predictions.metrics)

logits = predictions.predictions
labels = predictions.label_ids

predicted_class_ids = np.argmax(logits, axis=-1)

metric = evaluate.load("f1")
f1_score = metric.compute(predictions=predicted_class_ids, references=labels, average="weighted")
print(f"F1 Score on test set: {f1_score}")

In [27]:
print("Undersampled Data Test Metrics:", undersampled_predictions.metrics)

undersampled_logits = undersampled_predictions.predictions
undersampled_labels = undersampled_predictions.label_ids

undersampled_predicted_class_ids = np.argmax(undersampled_logits, axis=-1)

metric = evaluate.load("f1")
undersampled_f1_score = metric.compute(predictions=undersampled_predicted_class_ids, references=undersampled_labels, average="weighted")
print(f"Undersampled Data F1 Score on test set: {undersampled_f1_score}")

Undersampled Data Test Metrics: {'test_loss': 0.45013192296028137, 'test_accuracy': 0.7919220746990941, 'test_precision': 0.78340936104597, 'test_recall': 0.8069404806220788, 'test_f1': 0.7950008353674191, 'test_f1_macro': 0.7918751316112896, 'test_f1_weighted': 0.7918751316112896, 'test_roc_auc': 0.8725578817889624, 'test_mcc': 0.5841077027812319, 'test_balanced_accuracy': 0.7919220746990941, 'test_brier': 0.1455642074611506, 'test_runtime': 826.1209, 'test_samples_per_second': 292.657, 'test_steps_per_second': 9.146}
Undersampled Data F1 Score on test set: {'f1': 0.7918751316112896}


In [28]:
undersampled_logits = undersampled_predictions.predictions
undersampled_labels = undersampled_predictions.label_ids
undersampled_preds = np.argmax(undersampled_logits, axis=-1)

# Probabilities for the positive class (index 1)
undersampled_e_x = np.exp(undersampled_logits - undersampled_logits.max(axis=1, keepdims=True))
undersampled_probs = undersampled_e_x / undersampled_e_x.sum(axis=1, keepdims=True)
undersampled_prob_pos = undersampled_probs[:, 1]

# Evaluate metrics
undersampled_accuracy = evaluate.load("accuracy").compute(predictions=undersampled_preds, references=undersampled_labels)["accuracy"]
undersampled_precision = evaluate.load("precision").compute(predictions=undersampled_preds, references=undersampled_labels, average="binary")["precision"]
undersampled_recall = evaluate.load("recall").compute(predictions=undersampled_preds, references=undersampled_labels, average="binary")["recall"]
undersampled_f1_binary = evaluate.load("f1").compute(predictions=undersampled_preds, references=undersampled_labels, average="binary")["f1"]

# F2 Score (beta = 2, prioritizes recall over precision)
beta_sq = 2**2
undersampled_f2_binary = (1 + beta_sq) * (undersampled_precision * undersampled_recall) / ((beta_sq * undersampled_precision) + undersampled_recall)

undersampled_f1_macro = evaluate.load("f1").compute(predictions=undersampled_preds, references=undersampled_labels, average="macro")["f1"]
undersampled_f1_weighted = evaluate.load("f1").compute(predictions=undersampled_preds, references=undersampled_labels, average="weighted")["f1"]
undersampled_roc_auc = evaluate.load("roc_auc").compute(references=undersampled_labels, prediction_scores=undersampled_prob_pos)["roc_auc"]

# Extra (sklearn)
undersampled_mcc = matthews_corrcoef(undersampled_labels, undersampled_preds)
undersampled_balanced_acc = balanced_accuracy_score(undersampled_labels, undersampled_preds)
undersampled_brier = brier_score_loss(undersampled_labels, undersampled_prob_pos)
undersampled_cm = confusion_matrix(undersampled_labels, undersampled_preds, labels=[0, 1])
undersampled_report = classification_report(undersampled_labels, undersampled_preds, target_names=["non_spoiler", "spoiler"], digits=4)

print("Undersampled Data Test Metrics:")
print(f"- accuracy: {undersampled_accuracy:.4f}")
print(f"- precision (binary): {undersampled_precision:.4f}")
print(f"- recall (binary): {undersampled_recall:.4f}")
print(f"- f1 (binary): {undersampled_f1_binary:.4f}")
print(f"- f2 (binary): {undersampled_f2_binary:.4f}") # Added F2 print
print(f"- f1 (macro): {undersampled_f1_macro:.4f}")
print(f"- f1 (weighted): {undersampled_f1_weighted:.4f}")
print(f"- ROC-AUC: {undersampled_roc_auc:.4f}")
print(f"- MCC: {undersampled_mcc:.4f}")
print(f"- balanced_accuracy: {undersampled_balanced_acc:.4f}")
print(f"- Brier score: {undersampled_brier:.4f}")
print("Undersample Data Confusion matrix [[TN, FP], [FN, TP]]:")
print(undersampled_cm)
print("Undersampled Data Classification report:")
print(undersampled_report)

Undersampled Data Test Metrics:
- accuracy: 0.7919
- precision (binary): 0.7834
- recall (binary): 0.8069
- f1 (binary): 0.7950
- f2 (binary): 0.8021
- f1 (macro): 0.7919
- f1 (weighted): 0.7919
- ROC-AUC: 0.8726
- MCC: 0.5841
- balanced_accuracy: 0.7919
- Brier score: 0.1456
Undersample Data Confusion matrix [[TN, FP], [FN, TP]]:
[[93916 26969]
 [23338 97547]]
Undersampled Data Classification report:
              precision    recall  f1-score   support

 non_spoiler     0.8010    0.7769    0.7887    120885
     spoiler     0.7834    0.8069    0.7950    120885

    accuracy                         0.7919    241770
   macro avg     0.7922    0.7919    0.7919    241770
weighted avg     0.7922    0.7919    0.7919    241770



In [29]:
plt.figure(figsize=(15, 5))

# 1. ROC AUC Curve
plt.subplot(1, 3, 1)
RocCurveDisplay.from_predictions(undersampled_labels, undersampled_prob_pos, name="Undersampled Model", ax=plt.gca())
plt.title("ROC AUC Curve")
plt.grid(linestyle="--")

# 2. Precision-Recall Curve
plt.subplot(1, 3, 2)
PrecisionRecallDisplay.from_predictions(undersampled_labels, undersampled_prob_pos, name="Undersampled Model", ax=plt.gca())
plt.title("Precision-Recall Curve")
plt.grid(linestyle="--")

# 3. Confusion Matrix
plt.subplot(1, 3, 3)
ConfusionMatrixDisplay.from_predictions(undersampled_labels, undersampled_preds, display_labels=["non_spoiler", "spoiler"], cmap=plt.cm.Blues, ax=plt.gca())
plt.title("Confusion Matrix")

plt.tight_layout()
plt.savefig("undersampled_classification_curves.png")
plt.close()

Notes:

For ROC-AUC must use the positive-class probability (prob_pos).
If prefer a different positive class, adjust which column we  take from probs.

### Oversampling

#### Test Predicting by Loading Saved Model

In [None]:
oversampled_model_loaded = AutoModelForSequenceClassification.from_pretrained(oversampled_model_save_path)
oversampled_trainer_loaded = Trainer(model = oversampled_model_loaded)
oversampled_test_results = undersampled_trainer_loaded.predict(oversampled_tokenized_test)

#### Predict Test Data

In [None]:
oversampled_predictions = oversampled_trainer.predict(oversampled_tokenized_test)
# Process predictions to determine spoiler/non-spoiler

#### Evaluate Predictions

In [None]:
print("Oversampled Data Test Metrics:", oversampled_predictions.metrics)

oversampled_logits = oversampled_predictions.predictions
oversampled_labels = oversampled_predictions.label_ids

oversampled_predicted_class_ids = np.argmax(oversampled_logits, axis=-1)

metric = evaluate.load("f1")
oversampled_f1_score = metric.compute(predictions=oversampled_predicted_class_ids, references=labels, average="weighted")
print(f"Undersampled Data F1 Score on test set: {oversampled_f1_score}")

In [None]:
oversampled_logits = oversampled_predictions.predictions
oversampled_labels = oversampled_predictions.label_ids
oversampled_preds = np.argmax(oversampled_logits, axis=-1)

# Probabilities for the positive class (index 1)
oversampled_e_x = np.exp(oversampled_logits - oversampled_logits.max(axis=1, keepdims=True))
oversampled_probs = oversampled_e_x / oversampled_e_x.sum(axis=1, keepdims=True)
oversampled_prob_pos = oversampled_probs[:, 1]

# Evaluate metrics
oversampled_accuracy = evaluate.load("accuracy").compute(predictions=oversampled_preds, references=oversampled_labels)["accuracy"]
oversampled_precision = evaluate.load("precision").compute(predictions=oversampled_preds, references=oversampled_labels, average="binary")["precision"]
oversampled_recall = evaluate.load("recall").compute(predictions=oversampled_preds, references=oversampled_labels, average="binary")["recall"]
oversampled_f1_binary = evaluate.load("f1").compute(predictions=oversampled_preds, references=oversampled_labels, average="binary")["f1"]

# F2 Score (beta = 2, prioritizes recall over precision)
beta_sq = 2**2
oversampled_f2_binary = (1 + beta_sq) * (oversampled_precision * oversampled_recall) / ((beta_sq * oversampled_precision) + oversampled_recall)

oversampled_f1_macro = evaluate.load("f1").compute(predictions=oversampled_preds, references=oversampled_labels, average="macro")["f1"]
oversampled_f1_weighted = evaluate.load("f1").compute(predictions=oversampled_preds, references=oversampled_labels, average="weighted")["f1"]
oversampled_roc_auc = evaluate.load("roc_auc").compute(references=oversampled_labels, prediction_scores=oversampled_prob_pos)["roc_auc"]

# Extra (sklearn)
oversampled_mcc = matthews_corrcoef(oversampled_labels, oversampled_preds)
oversampled_balanced_acc = balanced_accuracy_score(oversampled_labels, oversampled_preds)
oversampled_brier = brier_score_loss(oversampled_labels, oversampled_prob_pos)
oversampled_cm = confusion_matrix(oversampled_labels, oversampled_preds, labels=[0, 1])
oversampled_report = classification_report(oversampled_labels, oversampled_preds, target_names=["non_spoiler", "spoiler"], digits=4)

print("Oversampled Data Test Metrics:")
print(f"- accuracy: {oversampled_accuracy:.4f}")
print(f"- precision (binary): {oversampled_precision:.4f}")
print(f"- recall (binary): {oversampled_recall:.4f}")
print(f"- f1 (binary): {oversampled_f1_binary:.4f}")
print(f"- f2 (binary): {oversampled_f2_binary:.4f}")
print(f"- f1 (macro): {oversampled_f1_macro:.4f}")
print(f"- f1 (weighted): {oversampled_f1_weighted:.4f}")
print(f"- ROC-AUC: {oversampled_roc_auc:.4f}")
print(f"- MCC: {oversampled_mcc:.4f}")
print(f"- balanced_accuracy: {oversampled_balanced_acc:.4f}")
print(f"- Brier score: {oversampled_brier:.4f}")
print("Oversample Data Confusion matrix [[TN, FP], [FN, TP]]:")
print(oversampled_cm)
print("Oversampled Data Classification report:")
print(oversampled_report)

In [None]:
plt.figure(figsize=(15, 5))

# 1. ROC AUC Curve
plt.subplot(1, 3, 1)
RocCurveDisplay.from_predictions(oversampled_labels, oversampled_prob_pos, name="Oversampled Model", ax=plt.gca())
plt.title("ROC AUC Curve")
plt.grid(linestyle="--")

# 2. Precision-Recall Curve
plt.subplot(1, 3, 2)
PrecisionRecallDisplay.from_predictions(oversampled_labels, oversampled_prob_pos, name="Oversampled Model", ax=plt.gca())
plt.title("Precision-Recall Curve")
plt.grid(linestyle="--")

# 3. Confusion Matrix
plt.subplot(1, 3, 3)
ConfusionMatrixDisplay.from_predictions(oversampled_labels, oversampled_preds, display_labels=["non_spoiler", "spoiler"], cmap=plt.cm.Blues, ax=plt.gca())
plt.title("Confusion Matrix")

plt.tight_layout()
plt.savefig("oversampled_classification_curves.png")
plt.close()

### Final Chosen Sampling Method: Undersampling

We have a big dataset, so undersampling would not be a problem.
Having slightly less data to train for is better than risking unfounded oversampled data that will not exist in real life. Use undersampling as a base comparator for all models for fairness.

In [32]:
tokenized_train = undersampled_tokenized_train
tokenized_eval = undersampled_tokenized_eval

#### Inspect Errors

In [None]:
for i, (true, pred) in enumerate(zip(true_labels, predicted_labels)):
    if true != pred:
        print(f"Example {i}:")
        print(f"Text: {encoded_dataset['test']['sentence'][i]}")
        print(f"True Label: {true}, Predicted Label: {pred}")

## Finetune Model

### Hyperparameter Tuning

In [None]:
def objective(trial):
  # Hyperparameters
  learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
  batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])

  # Fresh model per trial
  model = AutoModelForSequenceClassification.from_pretrained(
  model_name,
  num_labels=num_labels,
  )

  # Unique output directory per trial
  out_dir = f"./results/optuna/trial-{trial.number}"
  run_name = f"distilbert-lr{learning_rate:.2e}-bs{batch_size}-trial{trial.number}"

  training_args = TrainingArguments(
  output_dir=out_dir,
  run_name=run_name, # avoids W&B naming clashes if W&B is enabled
  learning_rate=learning_rate,
  per_device_train_batch_size=batch_size,
  num_train_epochs=3,
  weight_decay=0.01,
  eval_strategy="epoch", # preferred argument name
  save_strategy="epoch",
  load_best_model_at_end=True,
  metric_for_best_model="eval_loss",
  greater_is_better=False,
  overwrite_output_dir=True,
  save_total_limit=1,
  report_to="none", # disable W&B; change to ["wandb"] if you want to log
  seed=42,
  logging_steps=50,
  )

  trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_train,
  eval_dataset=tokenized_eval,
  )

  trainer.train()
  eval_results = trainer.evaluate()
  return eval_results["eval_loss"]
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=2)
print(study.best_params)

[I 2025-11-27 18:46:20,989] A new study created in memory with name: no-name-5f4ffa18-94b0-4b87-b88d-0b0e8e2a9e28
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5637,0.549748
2,0.5313,0.544225
3,0.4706,0.556743


[I 2025-11-27 20:45:55,088] Trial 0 finished with value: 0.5442250967025757 and parameters: {'learning_rate': 1.1438797245069308e-05, 'batch_size': 32}. Best is trial 0 with value: 0.5442250967025757.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


### Distillation and Pruning (Quantization)



In [None]:
quantizer = ORTQuantizer.from_pretrained("distilbert-base-uncased")
quantizer.quantize(
    save_dir="./quantized_model",
    quantization_config=QuantizationConfig(is_static=False),
)