# Setup

In [1]:
# Imports
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    BertTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    TrainingArguments,
    Trainer,
)
import evaluate
import numpy as np


In [2]:
import sys; print(sys.executable)
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    print("CUDA device name:", torch.cuda.get_device_name(0))

C:\Users\Rasmus\anaconda3\envs\AML4NLP\python.exe
Using device: cuda
CUDA device name: NVIDIA GeForce RTX 4060


In [3]:
# Load dataset
dataset = load_dataset("stanfordnlp/imdb")

train_validation_dataset = dataset["train"].train_test_split(test_size=0.1)  
train_dataset = train_validation_dataset["train"]
validation_dataset = train_validation_dataset["test"]
test_dataset = dataset["test"]          

print("Train size:", len(train_dataset))
print("Validation size:", len(validation_dataset))
print("Test size:", len(test_dataset))

Train size: 22500
Validation size: 2500
Test size: 25000


In [4]:
# Load tokenizer and model
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(
    model_name
)


In [5]:
def preprocess_datasets(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=256
    )

In [6]:
# Encode splits and remove column "text"
encoded_train = train_dataset.map(preprocess_datasets, batched=True)
encoded_validation = validation_dataset.map(preprocess_datasets, batched=True)
encoded_test = test_dataset.map(preprocess_datasets, batched=True)

Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [7]:
encoded_train = encoded_train.remove_columns(["text"])
encoded_validation = encoded_validation.remove_columns(["text"])
encoded_test = encoded_test.remove_columns(["text"])

In [8]:
encoded_train = encoded_train.with_format("torch")
encoded_validation = encoded_validation.with_format("torch")
encoded_test = encoded_test.with_format("torch")

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
        "precision": precision.compute(predictions=preds, references=labels)["precision"],
        "recall": recall.compute(predictions=preds, references=labels)["recall"]
    }


In [12]:
import wandb
import datatime
for run in range(5):
    print(f"Starting run {run+1}")
    training_args = TrainingArguments(
        output_dir=f"./bert_uncased_output{run+1}",
        eval_strategy="epoch",  
        save_strategy="epoch",           
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_steps=100,
        logging_first_step=True,
        load_best_model_at_end=True,
        report_to="wandb", #set to 'none' if no report
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
    )
    now = datetime.datetime.now()
    run = wandb.init(
        project="AML4NLPMiniProjectGroupX",    # Your project name (shows in wandb dashboard)
        name=f"{model_name}:{now.timer()}",       # Run name
        config={
            "model": model_name,
            "dataset": "IMDb",
            "task": "Sentiment Analysis",
            "train_size": len(train_dataset),
            "val_size": len(validation_dataset),
            "test_size": len(test_dataset),
            "learning_rate": training_args.learning_rate,
            "batch_train": training_args.per_device_train_batch_size,
            "batch_eval": training_args.per_device_eval_batch_size,
            "epochs": training_args.num_train_epochs,
            "weight_decay": training_args.weight_decay,
            "optimizer": "AdamW",
            "scheduler": "linear",   # Trainer default
            "device": "cuda" if torch.cuda.is_available() else "cpu",
        }
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_train, 
        eval_dataset=encoded_validation,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,  
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    trainer.train()
    run.finish()

Starting run 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2749,0.3363,0.8968,0.896715,0.872163,0.928341
2,0.2266,0.434036,0.8908,0.890601,0.857038,0.936393
3,0.0953,0.522591,0.9012,0.901195,0.892041,0.911433


0,1
eval/accuracy,▅▁█
eval/f1,▅▁█
eval/loss,▁▅█
eval/precision,▄▁█
eval/recall,▆█▁
eval/runtime,█▁▁
eval/samples_per_second,▁██
eval/steps_per_second,▁██
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/global_step,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██

0,1
eval/accuracy,0.9012
eval/f1,0.9012
eval/loss,0.52259
eval/precision,0.89204
eval/recall,0.91143
eval/runtime,18.7212
eval/samples_per_second,133.538
eval/steps_per_second,8.386
total_flos,8873699897512320.0
train/epoch,3


Starting run 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2802,0.32087,0.896,0.896001,0.891547,0.900161
2,0.2718,0.399039,0.8948,0.894696,0.868322,0.929147
3,0.1374,0.503208,0.9008,0.900704,0.874247,0.934783


0,1
eval/accuracy,▂▁█
eval/f1,▃▁█
eval/loss,▁▄█
eval/precision,█▁▃
eval/recall,▁▇█
eval/runtime,▁█▂
eval/samples_per_second,█▁▇
eval/steps_per_second,█▁▇
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇█
train/global_step,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██

0,1
eval/accuracy,0.9008
eval/f1,0.9007
eval/loss,0.50321
eval/precision,0.87425
eval/recall,0.93478
eval/runtime,18.6063
eval/samples_per_second,134.363
eval/steps_per_second,8.438
total_flos,8873699897512320.0
train/epoch,3


Starting run 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2802,0.32087,0.896,0.896001,0.891547,0.900161
2,0.2571,0.42146,0.892,0.891792,0.857353,0.938808
3,0.1218,0.612643,0.8892,0.888941,0.851933,0.940419


0,1
eval/accuracy,█▄▁
eval/f1,█▄▁
eval/loss,▁▃█
eval/precision,█▂▁
eval/recall,▁██
eval/runtime,█▁▂
eval/samples_per_second,▁█▇
eval/steps_per_second,▁█▇
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇███

0,1
eval/accuracy,0.8892
eval/f1,0.88894
eval/loss,0.61264
eval/precision,0.85193
eval/recall,0.94042
eval/runtime,18.6379
eval/samples_per_second,134.135
eval/steps_per_second,8.424
total_flos,8873699897512320.0
train/epoch,3


Starting run 4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2802,0.32087,0.896,0.896001,0.891547,0.900161
2,0.2718,0.399039,0.8948,0.894696,0.868322,0.929147
3,0.1244,0.561591,0.8932,0.892934,0.854545,0.946055


0,1
eval/accuracy,█▅▁
eval/f1,█▅▁
eval/loss,▁▃█
eval/precision,█▄▁
eval/recall,▁▅█
eval/runtime,▁█▆
eval/samples_per_second,█▁▃
eval/steps_per_second,█▁▃
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇███

0,1
eval/accuracy,0.8932
eval/f1,0.89293
eval/loss,0.56159
eval/precision,0.85455
eval/recall,0.94605
eval/runtime,18.6893
eval/samples_per_second,133.766
eval/steps_per_second,8.401
total_flos,8873699897512320.0
train/epoch,3


Starting run 5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2638,0.345382,0.8884,0.888139,0.851204,0.939614
2,0.2459,0.387233,0.8992,0.899103,0.872741,0.933172
3,0.0954,0.519802,0.9036,0.903601,0.901363,0.904992


0,1
eval/accuracy,▁▆█
eval/f1,▁▆█
eval/loss,▁▃█
eval/precision,▁▄█
eval/recall,█▇▁
eval/runtime,▄█▁
eval/samples_per_second,▅▁█
eval/steps_per_second,▅▁█
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.9036
eval/f1,0.9036
eval/loss,0.5198
eval/precision,0.90136
eval/recall,0.90499
eval/runtime,18.615
eval/samples_per_second,134.3
eval/steps_per_second,8.434
total_flos,8873699897512320.0
train/epoch,3


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [13]:
trainer.save_model("bert_uncased_model")

In [None]:
trainer.evaluate(encoded_test)