In [103]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, default_data_collator
from datasets import Dataset, load_dataset, Features, Value, ClassLabel
from evaluate import load
from accelerate import Accelerator
from tqdm import tqdm
from sklearn.metrics import f1_score

from collections import Counter
import torch.nn.functional as F

In [None]:
# Check device availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

features = Features({
    "id":    Value("string"),
    "img":   Value("string"),
    "label": ClassLabel(names=["clean", "hateful"]),
    "text":  Value("string"),
})

raw = load_dataset(
    "json",
    data_files={
        "train":      "../data/train.jsonl",
        "validation": "../data/dev.jsonl",
        "test":       "../data/test.jsonl",
    },
    split=None,
    features=features,    


train_it  = raw["train"]                  
valid_it  = raw['validation']
test_it   = raw['test']

labels = train_it["label"]            # list of 0/1
counts = Counter(labels)
total   = counts[0] + counts[1]
freqs   = [counts[0] / total, counts[1] / total]

class_weights = torch.tensor([1/f for f in freqs], dtype=torch.float32, device=device)
print("Using class‑weights:", class_weights.tolist())

tokzr = AutoTokenizer.from_pretrained("GroNLP/hatebert")

def preprocess(batch):
    enc = tokzr(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,           # memes are short
        return_attention_mask=True,
        # return_tensors="pt"
    )
    enc["labels"] = batch["label"]
    return enc

tokenised_train = train_it.map(preprocess, remove_columns=["id", "img", "text","label"])
tokenised_val   = valid_it.map(preprocess, remove_columns=["id", "img", "text","label"])
tokenised_test  = test_it.map(preprocess, remove_columns=["id", "img", "text","label"])

train_loader = DataLoader(tokenised_train, batch_size=8, collate_fn=default_data_collator)
val_loader   = DataLoader(tokenised_val,   batch_size=8, collate_fn=default_data_collator)
test_loader  = DataLoader(tokenised_test,  batch_size=8, collate_fn=default_data_collator)

model = AutoModelForSequenceClassification.from_pretrained(
            "GroNLP/hatebert", num_labels=2)

optimizer = AdamW(model.parameters(), lr=2e-5)
accelerator = Accelerator(mixed_precision="fp16")
model, optimizer, train_loader, val_loader = accelerator.prepare(
    model, optimizer, train_loader, val_loader)

SyntaxError: '(' was never closed (330623600.py, line 11)

In [101]:
num_epochs      = 3

for epoch in range(num_epochs):
    # ───────────────── TRAIN ──────────────────
    model.train()
    train_bar = tqdm(train_loader,  # iterate over the loader itself
                     desc=f"Epoch {epoch+1} – training",
                     total=None)    # unknown length is fine

    for batch in train_bar:
        logits = model(**batch).logits
        loss    = F.cross_entropy(
            logits,
            batch['labels'],
            weight=class_weights
        )
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        train_bar.set_postfix(loss=float(loss))
    
    model.eval()
    model.eval()
    preds, golds = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} – validation"):
            logits = model(**batch).logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            golds.extend(batch["labels"].cpu().tolist())

    macro_f1 = f1_score(golds, preds, average="macro")

Epoch 1 – training:   3%|▎         | 30/1063 [00:38<21:49,  1.27s/it, loss=0.525] 


KeyboardInterrupt: 

In [None]:
macro_f1 = f1_score(golds, preds, average="macro")

In [None]:
print(f"\nEpoch {epoch+1} – Val macro‑F1: {macro_f1:.3f}\n")


Epoch 1 – Val macro‑F1: 0.363



In [None]:

def count_loader(loader, max_batches=None):
    """
    Iterate through a (streaming) DataLoader once and return:
      • total number of samples
      • Counter mapping class‑id → count
    If max_batches is given, stop after that many batches.
    """
    total  = 0
    label_counts = Counter()

    iterable = loader
    if max_batches is not None:
        iterable = iter(loader)
        iterable = (next(iterable) for _ in range(max_batches))

    for batch in tqdm(iterable, desc="Counting"):
        labels = batch["labels"].cpu().tolist()
        label_counts.update(labels)
        total += len(labels)

    return total, label_counts

In [94]:
train_total, train_split = count_loader(train_loader)
print("Train samples:", train_total)
print("Class split:", train_split)    # e.g. Counter({0: 8600, 1: 4320})

Counting: 100%|██████████| 1063/1063 [00:02<00:00, 440.25it/s]

Train samples: 8500
Class split: Counter({0: 5450, 1: 3050})





In [None]:
6913 / (6913+5974)

0.5364320633196245

In [95]:
val_total, val_split = count_loader(val_loader)
print("Validation samples:", val_total)
print("Class split:", val_split)

Counting: 100%|██████████| 63/63 [00:00<00:00, 406.89it/s]

Validation samples: 500
Class split: Counter({0: 253, 1: 247})





In [97]:
8500//8

1062