# DistilBERT (GPU)

## Setup

### Packages Setup

#### Install Packages

In [None]:
%pip install datasets
%pip install evaluate
%pip install hf_xet
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install tensorflow
%pip install tf-keras
%pip install transformers
%pip install transformers[torch]
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Looking in indexes: https://download.pytorch.org/whl/cu126


### Import Packages

In [None]:
import evaluate
import numpy as np
import optuna
import pandas as pd
import tensorflow as tf
import torch
from datasets import Dataset, Value
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef, balanced_accuracy_score, brier_score_loss
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [None]:
# Pick the CUDA device when PyTorch can see one, otherwise fall back to the CPU,
# and report which of the two was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "GPU is available and being used."
    if device.type == "cuda"
    else "GPU is not available, using CPU."
)

GPU is available and being used.


### Data Setup

#### Read Data

In [None]:
# Mount Google Drive at /content/drive; the data files read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Both review files sit in the same Drive project folder; name it once.
DATA_DIR = "/content/drive/MyDrive/CS3244/CS3244_Project"

train_cleaned = pd.read_json(f"{DATA_DIR}/IMDB_reviews_train_cleaned.json")
test = pd.read_json(f"{DATA_DIR}/IMDB_reviews_test.json")

# Only the last bare expression of a cell is rendered, so a lone
# `train_cleaned.head()` in the middle of the cell would be discarded.
# display() previews both frames.
display(train_cleaned.head(), test.head())

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
391376,24 October 2006,tt0424136,ur0023796,False,Most films do best if you know next to nothing...,9,Hard Candy breaks minds as hard candy breaks t...
573647,2 September 2001,tt0139239,ur1235973,False,Go has not gotten even half of the praise it d...,9,One of the most under appreciated films in his...
426616,3 March 2011,tt0480249,ur24994931,False,Personally I really enjoyed this movie from th...,7,Why Do People Hate This Movie?
493566,11 March 2004,tt0103874,ur0395246,False,"As far as videos go, this is one of the few th...",6,Aye shoood tayhke thee trahyne tew Byoodapest
174694,11 May 2013,tt1931533,ur17825945,True,While trying a little too hard to be Adaptatio...,4,Unlucky Number Seven


In [None]:
# Map the raw column names onto the names used by the rest of the notebook:
# `labels` is the integer target and `text` is the tokenizer input.
column_renames = {'is_spoiler': 'labels', 'review_text': 'text'}

# Rename first, then cast the target to int64. Each statement builds a new frame
# from its own input, and re-running the cell is harmless because rename()
# ignores columns that are no longer present.
train_cleaned = train_cleaned.rename(columns=column_renames).astype({'labels': 'int64'})

# The test frame must be derived from `test` itself. Deriving it from
# `train_cleaned` would silently replace the held-out data with the training
# data and make every "test" metric a training-set metric.
test = test.rename(columns=column_renames).astype({'labels': 'int64'})

In [None]:
# Show the row with index label 0 to eyeball the renamed `labels` / `text` columns.
train_cleaned.loc[0]

Unnamed: 0,0
review_date,10 February 2006
movie_id,tt0111161
user_id,ur1898687
labels,1
text,oscar year shawshank redemption written direct...
rating,10
review_summary,A classic piece of unforgettable film-making.


In [None]:
# Column dtypes of the prepared training frame.
train_cleaned.dtypes

Unnamed: 0,0
review_date,object
movie_id,object
user_id,object
labels,int64
text,object
rating,int64
review_summary,object


In [None]:
def _to_hf_dataset(frame):
    """Build a Hugging Face ``Dataset`` holding only the ``text`` and ``labels`` columns.

    Args:
        frame: DataFrame that contains at least a ``text`` and a ``labels`` column.

    Returns:
        A ``Dataset`` with exactly those two features, ``labels`` cast to ``int64``.
    """
    # preserve_index=False keeps the pandas index from being exported as an
    # extra `__index_level_0__` feature that would travel through every batch.
    dataset = Dataset.from_pandas(frame[['text', 'labels']], preserve_index=False)
    return dataset.cast_column('labels', Value('int64'))


train_dataset = _to_hf_dataset(train_cleaned)
test_dataset = _to_hf_dataset(test)
print(train_dataset)
print(test_dataset)

Casting the dataset:   0%|          | 0/459130 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/459130 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 459130
})
Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 459130
})


### Model Setup

#### DistilBERT

In [None]:
# Checkpoint name, declared once so the config and the weights cannot drift apart.
model_name = "distilbert-base-uncased"
num_labels = 2  # For spoiler/non-spoiler classification

# Configuration for a two-class, single-label classification head.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="single_label_classification",
)
# Load the weights from the same checkpoint the config was read from.
distilbert_model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenizer Setup

In [None]:
# Use the checkpoint named by `model_name` so the tokenizer vocabulary always
# matches the model that is being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)


def tokenize(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: Mapping with a ``"text"`` entry holding the review strings.

    Returns:
        The tokenizer output for ``examples["text"]``, truncated and padded to a
        fixed length.
    """
    # NOTE(review): padding="max_length" pads every example to one fixed length,
    # so the DataCollatorWithPadding created later has no dynamic padding left to
    # do. It is left unchanged because the tuning Trainer is created without a
    # data collator and therefore relies on equal-length inputs.
    return tokenizer(examples["text"], truncation=True, padding="max_length")

### Metrics Setup

In [None]:
# Load each `evaluate` metric once; the objects are reused by every evaluation pass.
acc_m, prec_m, rec_m, f1_m, roc_m = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "precision", "recall", "f1", "roc_auc")
)


def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into the metric dictionary used by the Trainer.

    Hard predictions are the arg-max over the last axis of the logits. The
    probability-based scores (ROC-AUC and Brier) use the softmax probability of
    class index 1.

    Args:
        eval_pred: Two-item sequence ``(logits, labels)``.

    Returns:
        Dict with accuracy, precision, recall, three F1 variants, ROC-AUC, MCC,
        balanced accuracy and Brier score.
    """
    scores, gold = eval_pred
    hard = np.argmax(scores, axis=-1)

    # Softmax with the row maximum subtracted first, so np.exp stays in range.
    exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
    prob_pos = (exp_scores / exp_scores.sum(axis=1, keepdims=True))[:, 1]

    def _f1(average):
        # Same F1 metric object, different averaging mode.
        return f1_m.compute(predictions=hard, references=gold, average=average)["f1"]

    results = {}
    results["accuracy"] = acc_m.compute(predictions=hard, references=gold)["accuracy"]
    results["precision"] = prec_m.compute(predictions=hard, references=gold, average="binary")["precision"]
    results["recall"] = rec_m.compute(predictions=hard, references=gold, average="binary")["recall"]
    results["f1"] = _f1("binary")
    results["f1_macro"] = _f1("macro")
    results["f1_weighted"] = _f1("weighted")
    results["roc_auc"] = roc_m.compute(references=gold, prediction_scores=prob_pos)["roc_auc"]
    results["mcc"] = matthews_corrcoef(gold, hard)
    results["balanced_accuracy"] = balanced_accuracy_score(gold, hard)
    results["brier"] = brier_score_loss(gold, prob_pos)
    return results

## Processing

### Data Processing

#### Tokenize Data

Tokenize the review text. The `is_spoiler` column was already renamed to `labels` during data setup so that the transformer model recognizes it as the target (y) value.

In [None]:
def _tokenize_for_torch(dataset):
    """Tokenize ``dataset``, drop the raw ``text`` column and switch its format to torch."""
    encoded = dataset.map(tokenize, batched=True).remove_columns(["text"])
    encoded.set_format(type='torch')
    return encoded


# Same pipeline for the train/eval pool and for the test set.
tokenized_train_eval = _tokenize_for_torch(train_dataset)
tokenized_test = _tokenize_for_torch(test_dataset)

Map:   0%|          | 0/459130 [00:00<?, ? examples/s]

Map:   0%|          | 0/459130 [00:00<?, ? examples/s]

In [None]:
# Spot-check the first tokenized example: print the type and the value of its label.
first = tokenized_train_eval[0]
print(type(first['labels']), first['labels']) # with set_format('torch'), this is a torch.Tensor

<class 'torch.Tensor'> tensor(0)


In [None]:
# The collator created here is the same object later passed to the Trainer as `data_collator`.
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)
# Pull one batch of 16 examples through a DataLoader to check what the collator produces.
loader = DataLoader(tokenized_train_eval, batch_size=16, collate_fn=collator)
batch = next(iter(loader))
print(batch['labels'].dtype, batch['labels'].shape) # should be torch.int64 (Long) and shape [batch]

torch.int64 torch.Size([16])


#### Split Train and Eval Data

In [None]:
# Hold out 20% of the tokenized training pool for evaluation; the seed fixes the split.
split_datasets = tokenized_train_eval.train_test_split(test_size=0.2, seed=42)
tokenized_train, tokenized_eval = split_datasets['train'], split_datasets['test']

## Modeling

### DistilBERT

#### Model Initialization

In [None]:
# Configuration of the baseline fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,  # adjust based on GPU memory
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",  # checkpoints are written on the same schedule as evaluation
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,  # enables mixed precision on GPU
    dataloader_num_workers=2,  # speed up input pipeline
    logging_steps=200,
    report_to="none",
)
trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # `processing_class` is the current name of the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


#### Train Model

In [None]:
# Run fine-tuning with the Trainer configured above; the return value is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,F1 Weighted,Roc Auc,Mcc,Balanced Accuracy,Brier
1,0.4718,0.46274,0.790441,0.674669,0.39958,0.501903,0.684605,0.770757,0.792977,0.400951,0.665192,0.149842
2,0.428,0.460555,0.792009,0.662841,0.43317,0.523941,0.695439,0.776308,0.797307,0.413015,0.677022,0.148199
3,0.3814,0.488765,0.783504,0.618185,0.472448,0.535579,0.697216,0.773435,0.789091,0.40381,0.683828,0.156749


TrainOutput(global_step=34437, training_loss=0.4272095398346673, metrics={'train_runtime': 12999.5658, 'train_samples_per_second': 84.765, 'train_steps_per_second': 2.649, 'total_flos': 1.4596741618783027e+17, 'train_loss': 0.4272095398346673, 'epoch': 3.0})

#### Save Model

In [None]:
# Persist the fine-tuned model to Drive so it can be reloaded without retraining.
# NOTE(review): the path ends in ".h5", yet it is later passed to from_pretrained()
# as a model location rather than opened as an HDF5 file — consider renaming it.
model_save_path = '/content/drive/MyDrive/CS3244/CS3244_Project/distilbert_base_trained.h5'
trainer.save_model(model_save_path)

## Evaluate Model

Evaluate the model on unseen test data.

### DistilBERT

#### Test Predicting by Loading Saved Model

In [None]:
# Reload the fine-tuned weights from Drive to confirm the saved model is usable.
model_loaded = AutoModelForSequenceClassification.from_pretrained(model_save_path)

# A Trainer built without `args` falls back to default TrainingArguments. Those
# defaults can start experiment-tracker integrations (e.g. an interactive W&B
# login prompt) and use the default evaluation batch size, so both are stated
# explicitly here.
eval_args = TrainingArguments(
    output_dir="./results/eval",
    per_device_eval_batch_size=32,
    report_to="none",
)
trainer_loaded = Trainer(model=model_loaded, args=eval_args)
test_results = trainer_loaded.predict(tokenized_test)

#### Predict Test Data

In [None]:
# Run inference on the tokenized test set. The cells below read the logits
# (`.predictions`), the reference labels (`.label_ids`) and `.metrics` from the result.
predictions = trainer.predict(tokenized_test)
# Process predictions to determine spoiler/non-spoiler

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msandrinaagnesnatalie[0m ([33msandrinaagnesnatalie-nus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


#### Evaluate Predictions

In [None]:
print("Test Metrics:", predictions.metrics)

logits = predictions.predictions
labels = predictions.label_ids

# Hard class decision: index of the largest logit for each example.
predicted_class_ids = np.argmax(logits, axis=-1)

# Reuse the F1 metric object loaded in "Metrics Setup" instead of loading it again.
f1_score = f1_m.compute(predictions=predicted_class_ids, references=labels, average="weighted")
print(f"F1 Score on test set: {f1_score}")

Test Metrics: {'test_loss': 0.39549562335014343, 'test_model_preparation_time': 0.0029, 'test_runtime': 1660.6511, 'test_samples_per_second': 276.476, 'test_steps_per_second': 34.56}
F1 Score on test set: {'f1': 0.8175329790832467}


In [None]:
logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1)

# Probabilities for the positive class (index 1). The row maximum is subtracted
# before exponentiating so the softmax stays numerically stable.
e_x = np.exp(logits - logits.max(axis=1, keepdims=True))
probs = e_x / e_x.sum(axis=1, keepdims=True)
prob_pos = probs[:, 1]

# Evaluate metrics with the metric objects loaded once in "Metrics Setup"
# (the same objects compute_metrics uses) instead of reloading each one.
accuracy = acc_m.compute(predictions=preds, references=labels)["accuracy"]
precision = prec_m.compute(predictions=preds, references=labels, average="binary")["precision"]
recall = rec_m.compute(predictions=preds, references=labels, average="binary")["recall"]
f1_binary = f1_m.compute(predictions=preds, references=labels, average="binary")["f1"]
f1_macro = f1_m.compute(predictions=preds, references=labels, average="macro")["f1"]
f1_weighted = f1_m.compute(predictions=preds, references=labels, average="weighted")["f1"]
roc_auc = roc_m.compute(references=labels, prediction_scores=prob_pos)["roc_auc"]

# Extra (sklearn)
mcc = matthews_corrcoef(labels, preds)
balanced_acc = balanced_accuracy_score(labels, preds)
brier = brier_score_loss(labels, prob_pos)
cm = confusion_matrix(labels, preds, labels=[0, 1])
# labels=[0, 1] pins the class order for target_names, matching the confusion
# matrix above, and keeps the report well-formed if one class never occurs.
report = classification_report(
    labels,
    preds,
    labels=[0, 1],
    target_names=["non_spoiler", "spoiler"],
    digits=4,
)

print("Test Metrics:")
print(f"- accuracy: {accuracy:.4f}")
print(f"- precision (binary): {precision:.4f}")
print(f"- recall (binary): {recall:.4f}")
print(f"- f1 (binary): {f1_binary:.4f}")
print(f"- f1 (macro): {f1_macro:.4f}")
print(f"- f1 (weighted): {f1_weighted:.4f}")
print(f"- ROC-AUC: {roc_auc:.4f}")
print(f"- MCC: {mcc:.4f}")
print(f"- balanced_accuracy: {balanced_acc:.4f}")
print(f"- Brier score: {brier:.4f}")
print("Confusion matrix [[TN, FP], [FN, TP]]:")
print(cm)
print("Classification report:")
print(report)

Test Metrics:
- accuracy: 0.8294
- precision (binary): 0.7612
- recall (binary): 0.5132
- f1 (binary): 0.6131
- f1 (macro): 0.7518
- f1 (weighted): 0.8175
- ROC-AUC: 0.8579
- MCC: 0.5252
- balanced_accuracy: 0.7278
- Brier score: 0.1242
Confusion matrix [[TN, FP], [FN, TP]]:
[[318777  19468]
 [ 58844  62041]]
Classification report:
              precision    recall  f1-score   support

 non_spoiler     0.8442    0.9424    0.8906    338245
     spoiler     0.7612    0.5132    0.6131    120885

    accuracy                         0.8294    459130
   macro avg     0.8027    0.7278    0.7518    459130
weighted avg     0.8223    0.8294    0.8175    459130



Notes:

ROC-AUC must be computed from the positive-class probability (`prob_pos`), not from the hard class predictions.
If you prefer a different positive class, change which column of `probs` you take.

## Finetune Model

### Hyperparameter Tuning

In [None]:
 def objective(trial):
     """Fine-tune a fresh classifier for one Optuna trial and return its validation loss.

     Args:
         trial: Optuna trial used to sample ``learning_rate`` (log scale between
             1e-5 and 5e-5) and ``batch_size`` (one of 8, 16, 32).

     Returns:
         The ``eval_loss`` of the best checkpoint of this trial (lower is better).
     """
     import gc

     # Hyperparameters
     learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
     batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])

     # Fresh model per trial
     model = AutoModelForSequenceClassification.from_pretrained(
         model_name,
         num_labels=num_labels,
     )

     # Unique output directory per trial
     out_dir = f"./results/optuna/trial-{trial.number}"
     run_name = f"distilbert-lr{learning_rate:.2e}-bs{batch_size}-trial{trial.number}"

     training_args = TrainingArguments(
         output_dir=out_dir,
         run_name=run_name,  # avoids W&B naming clashes if W&B is enabled
         learning_rate=learning_rate,
         per_device_train_batch_size=batch_size,
         num_train_epochs=3,
         weight_decay=0.01,
         eval_strategy="epoch",  # preferred argument name
         save_strategy="epoch",
         load_best_model_at_end=True,
         metric_for_best_model="eval_loss",
         greater_is_better=False,
         overwrite_output_dir=True,
         save_total_limit=1,
         report_to="none",  # disable W&B; change to ["wandb"] if you want to log
         seed=42,
         logging_steps=50,
     )

     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=tokenized_train,
         eval_dataset=tokenized_eval,
     )

     try:
         trainer.train()
         # The best validation loss is tracked during training because
         # metric_for_best_model="eval_loss" is set, so a second full pass over
         # the evaluation set is only needed if that value is unavailable.
         best_loss = trainer.state.best_metric
         if best_loss is None:
             best_loss = trainer.evaluate()["eval_loss"]
         return best_loss
     finally:
         # Drop this trial's model and trainer before the next trial allocates
         # its own, and hand cached GPU memory back to the driver.
         del trainer, model
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
# Seed the sampler so the sequence of suggested hyperparameters is reproducible
# across runs (TPE is Optuna's default sampler; only the seed is new).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=2)
print(study.best_params)

[I 2025-11-21 07:36:35,815] A new study created in memory with name: no-name-f7e97d9c-6530-480c-bb98-683129bce0f4
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6178,0.580298
2,0.5856,0.57538
3,0.5414,0.592657


[I 2025-11-21 10:00:29,353] Trial 0 finished with value: 0.5753803253173828 and parameters: {'learning_rate': 3.2628961959979934e-05, 'batch_size': 8}. Best is trial 0 with value: 0.5753803253173828.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5579,0.550308


### Distillation and Pruning (Quantization)



In [None]:
# NOTE(review): ORTQuantizer and QuantizationConfig are not imported anywhere in the
# visible notebook — presumably they come from optimum.onnxruntime; confirm and add
# the import (and the matching install) before running this cell.
# NOTE(review): this targets the "distilbert-base-uncased" checkpoint, not the
# fine-tuned model saved at model_save_path — confirm that is intended.
quantizer = ORTQuantizer.from_pretrained("distilbert-base-uncased")
quantizer.quantize(
    save_dir="./quantized_model",  # where the quantized model is written
    quantization_config=QuantizationConfig(is_static=False),  # presumably dynamic quantization — TODO confirm required fields
)