In [25]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import string
import re
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
)

In [26]:
seed = 42
torch.random.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"

batch_size = 4
max_len = 128

epochs = 1
learning_rate = 2e-5

root_path = "/home/stefan/ioai-prep/kits/roai-2025/toxic"

In [27]:
nltk.download("stopwords", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("wordnet", quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Data preparation

In [None]:
class CommentDataset(Dataset):
    """Torch Dataset for BERT fine-tuning."""

    def __init__(self, texts, labels, tokenizer: BertTokenizerFast, max_len: int = 128):
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        item = {key: val.squeeze(0) for key, val in inputs.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx].values.astype(float))
        
        return item

In [29]:
def clean_text(text: str):
    text = text.lower()

    # remove short forms
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"im", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    text = re.sub(r"http\S+|www\S+|https\S+", ' WEB ', text) # URLs
    text = re.sub(r"@\w+|#\w+", ' USER ', text) # mentions and hashtags
    text = re.sub(r"<.*?>", "", text)  # HTML tags
    text = text.translate(str.maketrans("", "", string.punctuation))  # punctuation
    text = re.sub(r"\d+", "", text)  # numbers
    text = re.sub(r"\s+", " ", text).strip()  # extra whitespace

    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

    return ' '.join(tokens)

def prep_df(df: pd.DataFrame):
    df = df.drop(["id"], errors='ignore')

    X = df["comment_text"].apply(clean_text)

    if "toxic" in df.columns:
        y = df[['toxic', 'severe_toxic', 'obscene', 'insult']]
        return X, y
    return X

In [43]:
df = pd.read_csv(f"{root_path}/train_data.csv")
label_names = ["toxic", "severe_toxic", "obscene", "insult"]

X, y = prep_df(df)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y["severe_toxic"], random_state=seed)

In [None]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    problem_type="multi_label_classification",
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
train_dataset = CommentDataset(X_train, y_train, tokenizer, max_len)
val_dataset = CommentDataset(X_test, y_test, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [33]:
# sanity check
batch = next(iter(train_loader))
[{k:batch[k].shape} for k in batch.keys()]

[{'input_ids': torch.Size([4, 128])},
 {'token_type_ids': torch.Size([4, 128])},
 {'attention_mask': torch.Size([4, 128])},
 {'labels': torch.Size([4, 4])}]

# Finetuning

In [34]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer, eta_min=1e-6, T_0=5)

In [None]:
def train_epoch(epoch: int):
    model.train()
    running_loss = 0.0
    
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # stats        
        running_loss += loss.item()
    scheduler.step()
    
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}; train_loss={avg_loss:.4f}")

In [None]:
def val_epoch(epoch: int):
    model.eval()
    running_loss = 0.0
    preds, true_labels = [], []
    
    for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
        running_loss += outputs.loss.item()

        preds.append(outputs.logits.cpu())
        true_labels.append(labels.cpu())

    # prepare data    
    preds = F.sigmoid(torch.cat(preds, dim=0)).cpu().numpy()
    preds_binary = (preds > 0.5).astype(int)
    
    # compute f1 score
    true_labels = np.concatenate(true_labels, axis=0)
    f1 = f1_score(true_labels, preds_binary, average="macro")

    avg_loss = running_loss / len(val_loader)
    print(f"Epoch {epoch+1}; val_loss={avg_loss:.4f}; val_f1={f1:.4f}")

In [None]:
for epoch in range(epochs):
    train_epoch(epoch)
    val_epoch(epoch)

                                                                   

Epoch 1; val_loss=0.1218; val_f1=0.5716




In [None]:
# save Model
output_dir = "./bert-toxic-finetuned"
os.makedirs(output_dir, exist_ok=True)
print(f"Saving model to {output_dir}…")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./bert-toxic-finetuned…


('./bert-toxic-finetuned/tokenizer_config.json',
 './bert-toxic-finetuned/special_tokens_map.json',
 './bert-toxic-finetuned/vocab.txt',
 './bert-toxic-finetuned/added_tokens.json',
 './bert-toxic-finetuned/tokenizer.json')

# Evaluation

In [53]:
model.eval()
all_probs, all_labels = [], []
macro = 0

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating", leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs  = torch.sigmoid(logits)

        all_probs.append(probs.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

probs  = np.vstack(all_probs) # shape (N, 4)
all_labels = np.vstack(all_labels) # shape (N, 4)

                                                           

In [54]:
# find best threshold for each label
thresholds = {}
for i, name in enumerate(label_names):
    p, r, t = precision_recall_curve(all_labels[:, i], probs[:, i])
    f1 = 2*p*r/(p+r+1e-8)
    idx = np.nanargmax(f1[:-1])
    thresholds[name] = t[idx]
    print(f"{name}: best_thresh={thresholds[name]:.3f}")

toxic: best_thresh=0.802
severe_toxic: best_thresh=0.429
obscene: best_thresh=0.816
insult: best_thresh=0.755


In [55]:
preds = np.zeros_like(probs, dtype=int)
for i, name in enumerate(thresholds):
    preds[:, i] = (probs[:, i] >= thresholds[name]).astype(int)
    print(f"{name}: {f1_score(all_labels[:,i], preds[:,i]):.4f}")

print(f"Macro F1: {np.mean([f1_score(all_labels[:,i], preds[:,i]) for i in range(probs.shape[1])]):.4f}")

toxic: 0.8000
severe_toxic: 0.5000
obscene: 0.8571
insult: 0.7586
Macro F1: 0.7289


# Submission

In [56]:
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {k: v.squeeze(0) for k, v in inputs.items()}

In [57]:
df_test = pd.read_csv(f"{root_path}/test_data.csv")
X_submit = prep_df(df_test)

test_ds = TestDataset(X_submit, tokenizer, max_len)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [58]:
model.eval()
all_probs = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        logits = model(input_ids=batch["input_ids"].to(device),
                       attention_mask=batch["attention_mask"].to(device)).logits
        all_probs.append(torch.sigmoid(logits).cpu().numpy())

probs = np.vstack(all_probs)

th_array = np.array([thresholds[name] for name in thresholds])
preds_submit = (probs >= th_array).astype(int)

Predicting: 100%|██████████| 1131/1131 [00:27<00:00, 41.19it/s]


In [59]:
submission = pd.DataFrame({
    "datapointID": df_test["id"],
    "answer": preds_submit.tolist(),
    "subtaskID": 1
})

submission.head()

Unnamed: 0,datapointID,answer,subtaskID
0,00091c35fa9d0465,"[1, 0, 0, 0]",1
1,0071940212267fea,"[1, 0, 0, 0]",1
2,0072b9c3697ab8cc,"[1, 0, 0, 0]",1
3,0081b14d79f54b31,"[1, 0, 0, 1]",1
4,00950f0fae33869f,"[1, 0, 1, 1]",1


In [60]:
submission.to_csv("submission.csv", index=False)