In [1]:
# -----------------------------
# BASELINE EXPERIMENT – DISTILBERT IMDB
# Tränar, utvärderar och sparar resultat i results.csv
# -----------------------------

from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import csv, os


# -------------------------------------------------------------------------
# 1. Funktion för att spara resultat i CSV
# -------------------------------------------------------------------------

def save_results(attack_type, attack_rate, accuracy, f1, train_size, confusion_matrix, filename="results/logs/baseline.csv"):
    
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    
    file_exists = os.path.isfile(filename)

    with open(filename, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)

        if not file_exists:
            writer.writerow(["attack_type", "attack_rate", "accuracy", "f1", "train_size", "confusion_matrix"])

        writer.writerow([
            attack_type,
            attack_rate,
            accuracy,
            f1,
            train_size,
            confusion_matrix.tolist()
        ])

    print(f"✔ Resultat sparat i {filename}")


# -------------------------------------------------------------------------
# 2. Ladda dataset
# -------------------------------------------------------------------------

dataset = load_dataset("imdb")

train = dataset["train"].shuffle(seed=42).select(range(500))
val   = dataset["test"].shuffle(seed=42).select(range(250))
test  = dataset["test"].shuffle(seed=42).select(range(250))

print("Dataset loaded:", len(train), len(val), len(test))


# -------------------------------------------------------------------------
# 3. Tokenizer + tokenisering
# -------------------------------------------------------------------------

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

train_tok = train.map(tokenize, batched=True)
val_tok   = val.map(tokenize, batched=True)
test_tok  = test.map(tokenize, batched=True)

train_tok = train_tok.rename_column("label", "labels")
val_tok   = val_tok.rename_column("label", "labels")
test_tok  = test_tok.rename_column("label", "labels")

train_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


# -------------------------------------------------------------------------
# 4. Modell + trainer
# -------------------------------------------------------------------------

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

args = TrainingArguments(
    output_dir="baseline_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,

    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,

    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    seed=42
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics
)


# -------------------------------------------------------------------------
# 5. Träna baseline
# -------------------------------------------------------------------------

trainer.train()


# -------------------------------------------------------------------------
# 6. Utvärdera baseline
# -------------------------------------------------------------------------

print("\nEvaluating on test set...")
test_results = trainer.evaluate(test_tok)
print(test_results)

test_accuracy = test_results["eval_accuracy"]
test_f1 = test_results["eval_f1"]


# -------------------------------------------------------------------------
# 7. Confusion matrix
# -------------------------------------------------------------------------

pred_out = trainer.predict(test_tok)
logits = pred_out.predictions
y_pred = np.argmax(logits, axis=-1)
y_true = pred_out.label_ids

cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix (baseline):")
print(cm)


# -------------------------------------------------------------------------
# 8. Spara resultat
# -------------------------------------------------------------------------

save_results(
    attack_type="baseline",
    attack_rate=0.0,
    accuracy=test_accuracy,
    f1=test_f1,
    train_size=len(train),
    confusion_matrix=cm
)

print("\n✔ BASELINE KLAR!")


  from .autonotebook import tqdm as notebook_tqdm


Dataset loaded: 500 250 250


Map: 100%|██████████| 500/500 [00:00<00:00, 3456.19 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5578,0.422667,0.84,0.84



Evaluating on test set...




{'eval_loss': 0.4226665198802948, 'eval_accuracy': 0.84, 'eval_f1': 0.84, 'eval_runtime': 28.0145, 'eval_samples_per_second': 8.924, 'eval_steps_per_second': 0.286, 'epoch': 1.0}





Confusion Matrix (baseline):
[[105  24]
 [ 16 105]]
✔ Resultat sparat i results/logs/baseline.csv

✔ BASELINE KLAR!
