In [None]:
# ------------------------------------------------------
# LABEL FLIP ATTACK – flip rate configured via `attack_rate`
# Fine-tunes DistilBERT on IMDB and appends the results to a CSV log
# ------------------------------------------------------

from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import csv, os, random


# -------------------------------------------------------------------------
# 1. Save results funktion (samma som baseline)
# -------------------------------------------------------------------------

def save_results(attack_type, attack_rate, accuracy, f1, train_size, confusion_matrix, filename="results/logs/flip.csv"):
    """Append one experiment result row to a CSV log.

    The parent directory is created when the path has one, and the header
    row is written when the file is new or empty.

    Args:
        attack_type: Identifier of the attack; written as-is.
        attack_rate: Rate of the attack; written as-is.
        accuracy: Accuracy value to log.
        f1: F1 value to log.
        train_size: Number of training examples used.
        confusion_matrix: Matrix to log. Objects exposing ``tolist()``
            (e.g. NumPy arrays) are converted with it; any other iterable
            is converted with ``list()``. Note that inside this function
            the parameter shadows any module-level name ``confusion_matrix``.
        filename: Path of the CSV file to append to.
    """
    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is given.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # A file that already exists but is empty still needs the header row.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    to_list = getattr(confusion_matrix, "tolist", None)
    matrix_rows = to_list() if callable(to_list) else list(confusion_matrix)

    with open(filename, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)

        if needs_header:
            writer.writerow(["attack_type", "attack_rate", "accuracy", "f1", "train_size", "confusion_matrix"])

        writer.writerow([
            attack_type,
            attack_rate,
            accuracy,
            f1,
            train_size,
            matrix_rows,
        ])

    print(f"✔ Resultat sparat i {filename}")


# Fraction of training labels to flip (0.50 flips half of them); passed to
# flip_labels() and logged by save_results() further down.
attack_rate = 0.50
# -------------------------------------------------------------------------
# 2. Flip-label funktion (korrekt version)
# -------------------------------------------------------------------------

def flip_labels(dataset, percentage=0.1, seed=None):
    """Return a copy of *dataset* with a random fraction of binary labels flipped.

    Args:
        dataset: HuggingFace-style dataset supporting ``len()`` and
            ``map(fn, with_indices=True)``, whose examples carry a 0/1
            ``"label"`` field.
        percentage: Fraction (0.0-1.0) of examples whose label is flipped
            (1 -> 0, 0 -> 1).
        seed: Optional seed for a private RNG so the selection is
            reproducible. ``None`` draws from the module-level ``random``
            state, exactly as before.

    Returns:
        A tuple ``(poisoned, flip_idx)``: the dataset produced by ``map``
        and the list of indices whose labels were flipped.

    Raises:
        ValueError: If *percentage* is outside ``[0, 1]``.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be within [0, 1], got {percentage!r}")

    n = len(dataset)
    k = int(n * percentage)

    rng = random if seed is None else random.Random(seed)
    flip_idx = rng.sample(range(n), k)
    # Set lookup keeps the per-example membership test O(1); testing against
    # the list made the whole pass O(n * k).
    flip_set = set(flip_idx)

    def flip(example, idx):
        if idx in flip_set:
            example["label"] = 1 - example["label"]
        return example

    # map() builds a new dataset, so no explicit copy of the input is needed.
    poisoned = dataset.map(flip, with_indices=True)
    return poisoned, flip_idx


# -------------------------------------------------------------------------
# 3. Load dataset (500 train / 250 validation / 250 test)
# -------------------------------------------------------------------------

dataset = load_dataset("imdb")

train = dataset["train"].shuffle(seed=42).select(range(500))

# Validation and test must be disjoint. Selecting range(250) twice from the
# same seeded shuffle yields the identical 250 examples for both sets, so the
# checkpoint chosen on "validation" would effectively be chosen on the test
# set. Shuffle once and take non-overlapping slices instead.
shuffled_test = dataset["test"].shuffle(seed=42)
val = shuffled_test.select(range(250))
test = shuffled_test.select(range(250, 500))

print("Dataset loaded:", len(train), len(val), len(test))


# -------------------------------------------------------------------------
# 4. Create the poisoned training data (flip fraction = attack_rate)
# -------------------------------------------------------------------------


# flipped_idx holds the indices of the training examples whose label was flipped.
poisoned_train, flipped_idx = flip_labels(train, percentage=attack_rate)
print(f"Antal flippade exempel: {len(flipped_idx)}")


# -------------------------------------------------------------------------
# 5. Tokenizer + tokenization
# -------------------------------------------------------------------------

# Fast tokenizer for the "distilbert-base-uncased" checkpoint; used by tokenize().
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize the ``"text"`` field of *batch* with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 256 tokens.
    """
    options = {"truncation": True, "padding": "max_length", "max_length": 256}
    return tokenizer(batch["text"], **options)

# Run the same three steps on every split: tokenize in batches, rename the
# "label" column to "labels", and expose only the listed columns in torch format.
tokenized_splits = []
for split in (poisoned_train, val, test):
    encoded = split.map(tokenize, batched=True).rename_column("label", "labels")
    encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    tokenized_splits.append(encoded)

train_tok, val_tok, test_tok = tokenized_splits


# -------------------------------------------------------------------------
# 6. Model + trainer
# -------------------------------------------------------------------------

# No num_labels is passed, so the library default applies — presumably 2,
# matching the binary labels; confirm against the transformers docs.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

def compute_metrics(pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted)
    return metrics

# Evaluate, log and checkpoint once per epoch; after training the checkpoint
# with the highest validation accuracy is reloaded.
args = TrainingArguments(
    # Derive the directory from the configured rate (0.10 -> "label_flip_10_output")
    # so runs with different rates do not share one checkpoint directory.
    output_dir=f"label_flip_{round(attack_rate * 100)}_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,

    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,

    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    seed=42
)

# Trains on the poisoned split and validates on the clean validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics
)


# -------------------------------------------------------------------------
# 7. Train the model
# -------------------------------------------------------------------------

# Runs the fine-tuning loop configured in `args` on train_tok.
trainer.train()


# -------------------------------------------------------------------------
# 8. Evaluate the model
# -------------------------------------------------------------------------

print("\nEvaluating on test set...")
test_results = trainer.evaluate(test_tok)
print(test_results)

# The metrics returned by compute_metrics are read back with an "eval_" prefix.
test_accuracy = test_results["eval_accuracy"]
test_f1 = test_results["eval_f1"]


# -------------------------------------------------------------------------
# 9. Confusion matrix
# -------------------------------------------------------------------------

pred_out = trainer.predict(test_tok)
logits = pred_out.predictions
y_pred = np.argmax(logits, axis=-1)
y_true = pred_out.label_ids

# Pin the label order so the matrix is always 2x2 with rows/columns [0, 1],
# even if one class never occurs in the true or predicted labels.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
# Report the rate that was actually used instead of a hard-coded "10%".
print(f"\nConfusion Matrix (label flip {attack_rate * 100:g}%):")
print(cm)


# -------------------------------------------------------------------------
# 10. Save results
# -------------------------------------------------------------------------

save_results(
    attack_type="label_flip",
    attack_rate=attack_rate,
    accuracy=test_accuracy,
    f1=test_f1,
    train_size=len(train),
    confusion_matrix=cm
)

# attack_rate is a fraction; convert it to a percentage so the message reads
# "50%" rather than "0.5 %".
print(f"\n✔ LABEL FLIP {attack_rate * 100:g}% KLAR!")


Dataset loaded: 500 250 250


Map: 100%|██████████| 500/500 [00:00<00:00, 11702.40 examples/s]

Antal flippade exempel: 250



Map: 100%|██████████| 500/500 [00:00<00:00, 4271.98 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6969,0.692957,0.516,0.366492



Evaluating on test set...




{'eval_loss': 0.692956805229187, 'eval_accuracy': 0.516, 'eval_f1': 0.36649214659685864, 'eval_runtime': 31.8284, 'eval_samples_per_second': 7.855, 'eval_steps_per_second': 0.251, 'epoch': 1.0}





Confusion Matrix (label flip 10%):
[[94 35]
 [86 35]]
✔ Resultat sparat i results_flip_500.csv

✔ LABEL FLIP  0.5 % KLAR!
