In [1]:
# ===========================================
# Tiny detoxifiers on ParaDeHate (GPU-ready)
# Models: DistilBART, T5-small, T5-base
# Metrics: StyleAcc, BERTScore-F1, Fluency (inv PPL), BLEU
# ===========================================

!pip install -U transformers peft accelerate
!pip install -U evaluate bert-score

import numpy as np, pandas as pd, torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification, GPT2LMHeadModel, GPT2TokenizerFast,
    DataCollatorForSeq2Seq, Trainer, TrainingArguments
)
import evaluate, bert_score

# -------------------------
# Device
# -------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA available? ", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
SEED = 42
np.random.seed(SEED); torch.manual_seed(SEED)

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_

2025-11-26 14:12:08.426842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764166328.628150      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764166328.688245      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


CUDA available?  True
GPU: Tesla T4


<torch._C.Generator at 0x79c542d4e6b0>

In [2]:
# -------------------------
# Load ParaDeHate & rename columns
# (works even if columns are "Original Text"/"Converted ...")
# -------------------------
dataset = load_dataset("ScaDSAI/ParaDeHate")

def rename_cols(ds: DatasetDict):
    split0 = list(ds.keys())[0]
    cols = ds[split0].column_names
    # Try canonical first
    src = "toxic" if "toxic" in cols else next((c for c in cols if c.lower().startswith("original")), None)
    tgt = "detoxified" if "detoxified" in cols else next((c for c in cols if c.lower().startswith("converted")), None)
    if src is None or tgt is None:
        raise KeyError(f"Could not find source/target columns. Found: {cols}")
    if src != "toxic": ds = ds.rename_column(src, "toxic")
    if tgt != "detoxified": ds = ds.rename_column(tgt, "detoxified")
    return ds

dataset = rename_cols(dataset)

# Ensure we have train/val/test
if "test" not in dataset:
    if "validation" in dataset:
        vt = dataset["validation"].train_test_split(test_size=0.5, seed=SEED)
        dataset = DatasetDict({"train": dataset["train"], "validation": vt["train"], "test": vt["test"]})
    else:
        tr_te = dataset["train"].train_test_split(test_size=0.2, seed=SEED)
        va_te = tr_te["test"].train_test_split(test_size=0.5, seed=SEED)
        dataset = DatasetDict({"train": tr_te["train"], "validation": va_te["train"], "test": va_te["test"]})
print(dataset)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8276 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['toxic', 'detoxified'],
        num_rows: 6620
    })
    validation: Dataset({
        features: ['toxic', 'detoxified'],
        num_rows: 828
    })
    test: Dataset({
        features: ['toxic', 'detoxified'],
        num_rows: 828
    })
})


In [3]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# -------------------------
# Metrics
# -------------------------
tox_name = "unitary/toxic-bert"
tox_tok  = AutoTokenizer.from_pretrained(tox_name)
tox_clf  = AutoModelForSequenceClassification.from_pretrained(tox_name).to(DEVICE).eval()

def style_accuracy(pred_texts):
    outs = []
    with torch.no_grad():
        for i in range(0, len(pred_texts), 32):
            batch = pred_texts[i:i+32]
            enc = tox_tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=256).to(DEVICE)
            probs = torch.softmax(tox_clf(**enc).logits, dim=-1)
            outs.append((probs[:,1] < 0.5).float().cpu().numpy())  # assume idx 1 = toxic
    return float(np.concatenate(outs).mean())

def content_preservation(src_texts, pred_texts):
    P,R,F1 = bert_score.score(pred_texts, src_texts, lang="en", rescale_with_baseline=True, verbose=False)
    return float(F1.mean().item())


gpt2_tok = GPT2TokenizerFast.from_pretrained("gpt2")
gpt2_tok.pad_token = gpt2_tok.eos_token
gpt2_lm  = GPT2LMHeadModel.from_pretrained("gpt2").to(DEVICE).eval()

def fluency_inverse_perplexity(pred_texts):
    ppls = []
    with torch.no_grad():
        for i in range(0, len(pred_texts), 16):
            batch = pred_texts[i:i+16]
            enc = gpt2_tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=256).to(DEVICE)

            if enc["input_ids"].shape[-1] == 0:
                continue
            
            loss = gpt2_lm(**enc, labels=enc["input_ids"]).loss
            ppls.append(torch.exp(loss).item())

    # Handle edge case: all were empty
    if not ppls:
        return 0.0
        
    mean_ppl = float(np.mean(ppls))
    return 1.0 / (1.0 + mean_ppl)  # higher = better


def compute_baseline_ppl(ref_texts):
    ppls = []
    with torch.no_grad():
        for i in range(0, len(ref_texts), 16):
            enc = gpt2_tok(ref_texts[i:i+16],
                           return_tensors="pt",
                           padding=True,
                           truncation=True,
                           max_length=256).to(DEVICE)
            loss = gpt2_lm(**enc, labels=enc["input_ids"]).loss
            ppls.append(torch.exp(loss).item())
    return float(np.mean(ppls))

baseline_ppl = compute_baseline_ppl(dataset["validation"]["detoxified"])
print("Baseline GPT-2 perplexity:", baseline_ppl)


def fluency_normalized(pred_texts, baseline_ppl=20):
    ppls = []
    with torch.no_grad():
        for i in range(0, len(pred_texts), 16):
            enc = gpt2_tok(pred_texts[i:i+16], return_tensors="pt", padding=True, truncation=True).to(DEVICE)

            if enc["input_ids"].shape[-1] == 0:
                continue
            loss = gpt2_lm(**enc, labels=enc["input_ids"]).loss
            ppls.append(torch.exp(loss).item())

    # Handle edge case: all were empty
    if not ppls:
        return 0.0
    mean_ppl = np.mean(ppls)
    return baseline_ppl / mean_ppl  # closer to 1 for fluent text

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Baseline GPT-2 perplexity: 1396.2207829402043


In [4]:
# -------------------------
# Models (tiny / compact seq2seq)
# -------------------------
MODELS = [
    "sshleifer/distilbart-cnn-12-6",  # DistilBART
    "t5-base",
]

MAX_SRC = 128
MAX_TGT = 128
BATCH   = 8
EPOCHS  = 3   # bump to 3–5 for better scores

results = {}

for ckpt in MODELS:
    print(f"\n==== Fine-tuning {ckpt} ====")
    tok = AutoTokenizer.from_pretrained(ckpt, use_fast=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(ckpt).to(DEVICE)

    def preprocess(batch):
        model_inputs = tok(batch["toxic"], max_length=MAX_SRC, truncation=True)
        with tok.as_target_tokenizer():
            labels = tok(batch["detoxified"], max_length=MAX_TGT, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    remove_cols = dataset["train"].column_names
    tokenized = dataset.map(preprocess, batched=True, remove_columns=remove_cols)
    collator  = DataCollatorForSeq2Seq(tok, model=model)

    args = Seq2SeqTrainingArguments(
        output_dir=f"./checkpoints/{ckpt.replace('/','_')}",
        learning_rate=3e-5,
        per_device_train_batch_size=BATCH,
        per_device_eval_batch_size=BATCH,
        num_train_epochs=EPOCHS,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=50,
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),
        report_to=[]
    )

    trainer = Trainer(
        model=model, args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        data_collator=collator,
        tokenizer=tok
    )

    trainer.train()
    trainer.save_model(f"./checkpoints/{ckpt.replace('/', '_')}")
    save_dir = f"/kaggle/working/{ckpt.replace('/', '_')}_finetuned"
    model.save_pretrained(save_dir)
    tok.save_pretrained(save_dir)
    print(f"✅ Saved fine-tuned model to {save_dir}")


==== Fine-tuning sshleifer/distilbart-cnn-12-6 ====


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Map:   0%|          | 0/6620 [00:00<?, ? examples/s]



Map:   0%|          | 0/828 [00:00<?, ? examples/s]

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.3828,1.317295
2,0.9968,1.264255
3,0.7513,1.299215




✅ Saved fine-tuned model to /kaggle/working/sshleifer_distilbart-cnn-12-6_finetuned

==== Fine-tuning t5-base ====


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/6620 [00:00<?, ? examples/s]



Map:   0%|          | 0/828 [00:00<?, ? examples/s]

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.5826,1.400162
2,1.4669,1.328606
3,1.4114,1.312156


✅ Saved fine-tuned model to /kaggle/working/t5-base_finetuned


In [5]:
import pandas as pd
from tqdm import tqdm

results_summary = {}  # overall averages for each model

N_SAMPLES = 828  # number of test examples to evaluate

for ckpt in MODELS:
    print(f"\n==== Evaluating {ckpt} on {N_SAMPLES} samples ====")
    path = f"./checkpoints/{ckpt.replace('/', '_')}"
    tok = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSeq2SeqLM.from_pretrained(path).to(DEVICE)
    model.eval()

    src_full = dataset["test"]["toxic"]
    ref_full = dataset["test"]["detoxified"]

    # Only take first N_SAMPLES
    src = src_full[:N_SAMPLES]
    ref = ref_full[:N_SAMPLES]
    preds = []

    with torch.no_grad():
        for i in range(0, len(src), BATCH):
            batch = tok(src[i:i+BATCH], return_tensors="pt",
                        padding=True, truncation=True,
                        max_length=MAX_SRC).to(DEVICE)
            out = model.generate(**batch,
                                 max_new_tokens=96,
                                 num_beams=4,
                                 early_stopping=True,
                                 length_penalty=1.2)
            preds.extend(tok.batch_decode(out, skip_special_tokens=True))

    # ---- Per-example metric computation ----
    rows = []
    style_scores, content_scores, fluency_scores = [], [], []

    print(f"Computing metrics for {len(preds)} examples...")
    for s, r, p in tqdm(zip(src, ref, preds), total=len(preds)):
        style = style_accuracy([p])
        content = content_preservation([s], [p])
        fluency = fluency_inverse_perplexity([p])

        rows.append({
            "Original (toxic)": s,
            "Reference (detoxified)": r,
            "Model output": p,
            "Style Accuracy": round(style, 3),
            "Content Preservation": round(content, 3),
            "Fluency": round(fluency, 3)
        })

        style_scores.append(style)
        content_scores.append(content)
        fluency_scores.append(fluency)

    # ---- Create DataFrame ----
    df = pd.DataFrame(rows)

    csv_path = f"./{ckpt.replace('/', '_')}_detailed_results.csv"
    df.to_csv(csv_path, index=False)
    print(f"✅ Saved detailed results to {csv_path}")
    print("Done.")
    


==== Evaluating sshleifer/distilbart-cnn-12-6 on 828 samples ====




Computing metrics for 828 examples...


  0%|          | 0/828 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1/828 [00:09<2:05:18,  9.09s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 2/828 [00:10<1:00:54,  4.42s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 3/828 [00:11<39:24,  2.87s/it]  Some weights of RobertaMo

✅ Saved detailed results to ./sshleifer_distilbart-cnn-12-6_detailed_results.csv
Done.

==== Evaluating t5-base on 828 samples ====
Computing metrics for 828 examples...


  0%|          | 0/828 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1/828 [00:00<13:26,  1.03it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 2/828 [00:01<13:21,  1.03it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 3/828 [00:02<13:17,  1

✅ Saved detailed results to ./t5-base_detailed_results.csv
Done.



