In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import string
import re
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.amp import autocast, GradScaler

from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import (
    AutoConfig,
    BertTokenizerFast,
    BertForSequenceClassification,
)

In [2]:
seed = 42
torch.random.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"

batch_size = 8
max_len = 128

epochs = 10
learning_rate = 1e-5

root_path = "/home/stefan/ioai-prep/kits/roai-2025/toxic"

In [3]:
nltk.download("stopwords", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("wordnet", quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Data preparation

In [4]:
class CommentDataset(Dataset):
    """Torch Dataset for BERT fine-tuning."""

    def __init__(self, texts, labels, tokenizer: BertTokenizerFast, max_len: int = 128):
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        item = {key: val.squeeze(0) for key, val in inputs.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx].values.astype(float))

        return item

In [5]:
def clean_text(text: str):
    text = text.lower()

    # remove short forms
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"im", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    text = re.sub(r"http\S+|www\S+|https\S+", ' WEB ', text) # URLs
    text = re.sub(r"@\w+|#\w+", ' USER ', text) # mentions and hashtags
    text = re.sub(r"<.*?>", "", text)  # HTML tags
    text = text.translate(str.maketrans("", "", string.punctuation))  # punctuation
    text = re.sub(r"\d+", "", text)  # numbers
    text = re.sub(r"\s+", " ", text).strip()  # extra whitespace

    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

    return ' '.join(tokens)

def prep_df(df: pd.DataFrame):
    df = df.drop(["id"], errors='ignore')

    X = df["comment_text"].apply(clean_text)

    if "toxic" in df.columns:
        y = df[['toxic', 'severe_toxic', 'obscene', 'insult']]
        return X, y
    return X

In [6]:
df = pd.read_csv(f"{root_path}/train_data.csv")
label_names = ["toxic", "severe_toxic", "obscene", "insult"]

X, y = prep_df(df)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y["severe_toxic"], random_state=seed)

In [7]:
lengths = X.map(lambda x: len(x))

lengths.mean(), lengths.std()

(np.float64(234.6633), np.float64(386.88616562981275))

In [8]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [9]:
train_dataset = CommentDataset(X_train, y_train, tokenizer, max_len)
val_dataset = CommentDataset(X_test, y_test, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [10]:
# sanity check
batch = next(iter(train_loader))
[{k:batch[k].shape} for k in batch.keys()]

[{'input_ids': torch.Size([8, 128])},
 {'token_type_ids': torch.Size([8, 128])},
 {'attention_mask': torch.Size([8, 128])},
 {'labels': torch.Size([8, 4])}]

# Model

In [11]:
class WeightedBertForMultiLabelClassification(BertForSequenceClassification):
    def __init__(self, config, pos_weight=None):
        super().__init__(config)
        if pos_weight is not None:
            self.loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        else:
            self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = self.bert(
            input_ids=input_ids, attention_mask=attention_mask, **kwargs
        )
        pooled_output = self.dropout(outputs.pooler_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits, labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [12]:
def compute_pos_weights(y, label_names, device='cpu'):
    total_samples = len(y)
    pos_counts = [y[label].sum() for label in label_names]
    pos_weights = [(total_samples - p) / p for p in pos_counts]
    return torch.tensor(pos_weights, dtype=torch.float32, device=device)

pos_weight = compute_pos_weights(y, label_names, device=device)

[len(y[y[l] == 1]) for l in label_names], label_names

([7884, 825, 4355, 4077], ['toxic', 'severe_toxic', 'obscene', 'insult'])

In [13]:
config = AutoConfig.from_pretrained(
    model_name, num_labels=len(label_names), problem_type="multi_label_classification"
)

model = WeightedBertForMultiLabelClassification.from_pretrained(
    model_name, config=config, pos_weight=pos_weight
).to(device)

Some weights of WeightedBertForMultiLabelClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'loss_fct.pos_weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Finetuning

In [14]:
scaler = GradScaler(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer, T_0=5, eta_min=1e-6)

In [15]:
def train_epoch(epoch: int):
    model.train()
    running_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # forward pass
        with autocast(device):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # stats
        running_loss += loss.item()
    scheduler.step()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}; train_loss={avg_loss:.4f}")

In [16]:
def val_epoch(epoch: int):
    model.eval()
    running_loss = 0.0
    preds, true_labels = [], []

    for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast(device):
          with torch.no_grad():
              outputs = model(input_ids=input_ids,
                              attention_mask=attention_mask,
                              labels=labels)
        running_loss += outputs.loss.item()

        preds.append(outputs.logits.cpu())
        true_labels.append(labels.cpu())

    # prepare data
    preds = F.sigmoid(torch.cat(preds, dim=0)).cpu().numpy()
    preds_binary = (preds > 0.5).astype(int)

    # compute f1 score
    true_labels = np.concatenate(true_labels, axis=0)
    f1 = f1_score(true_labels, preds_binary, average="macro")

    avg_loss = running_loss / len(val_loader)
    print(f"Epoch {epoch+1}; val_loss={avg_loss:.4f}; val_f1={f1:.4f}")
    return f1

In [17]:
def save_model():
  output_dir = "./bert-finetuned"
  os.makedirs(output_dir, exist_ok=True)
  model.save_pretrained(output_dir)
  tokenizer.save_pretrained(output_dir)

def load_model(path="./bert-finetuned", device="cuda" if torch.cuda.is_available() else "cpu"):
    tokenizer = BertTokenizerFast.from_pretrained(path)
    model = BertForSequenceClassification.from_pretrained(path).to(device)
    return tokenizer, model

In [18]:
best_so_far = 0

for epoch in range(epochs):
    train_epoch(epoch)
    f1 = val_epoch(epoch)

    if f1 > best_so_far:
      best_so_far = f1
      save_model()
      print(f"Saving model with F1={f1} for epoch={epoch+1}")

Training Epoch 1: 100%|██████████| 4000/4000 [06:40<00:00,  9.98it/s]


Epoch 1; train_loss=0.4011


Validating Epoch 1: 100%|██████████| 1000/1000 [00:22<00:00, 44.56it/s]


Epoch 1; val_loss=0.3164; val_f1=0.6849
Saving model with F1=0.6848778002790561 for epoch=1


Training Epoch 2: 100%|██████████| 4000/4000 [06:36<00:00, 10.09it/s]


Epoch 2; train_loss=0.2710


Validating Epoch 2: 100%|██████████| 1000/1000 [00:19<00:00, 51.11it/s]


Epoch 2; val_loss=0.2925; val_f1=0.6593


Training Epoch 3: 100%|██████████| 4000/4000 [06:05<00:00, 10.95it/s]


Epoch 3; train_loss=0.2118


Validating Epoch 3: 100%|██████████| 1000/1000 [00:19<00:00, 50.99it/s]


Epoch 3; val_loss=0.3108; val_f1=0.6729


Training Epoch 4: 100%|██████████| 4000/4000 [06:05<00:00, 10.94it/s]


Epoch 4; train_loss=0.1650


Validating Epoch 4: 100%|██████████| 1000/1000 [00:21<00:00, 45.93it/s]


Epoch 4; val_loss=0.3863; val_f1=0.7161
Saving model with F1=0.716131179212883 for epoch=4


Training Epoch 5: 100%|██████████| 4000/4000 [06:31<00:00, 10.21it/s]


Epoch 5; train_loss=0.1369


Validating Epoch 5: 100%|██████████| 1000/1000 [00:19<00:00, 52.33it/s]


Epoch 5; val_loss=0.4364; val_f1=0.7239
Saving model with F1=0.7238592710731182 for epoch=5


Training Epoch 6: 100%|██████████| 4000/4000 [06:12<00:00, 10.73it/s]


Epoch 6; train_loss=0.1579


Validating Epoch 6: 100%|██████████| 1000/1000 [00:20<00:00, 48.46it/s]


Epoch 6; val_loss=0.4727; val_f1=0.7184


Training Epoch 7: 100%|██████████| 4000/4000 [06:57<00:00,  9.59it/s]


Epoch 7; train_loss=0.1441


Validating Epoch 7: 100%|██████████| 1000/1000 [00:22<00:00, 45.42it/s]


Epoch 7; val_loss=0.5010; val_f1=0.7215


Training Epoch 8: 100%|██████████| 4000/4000 [06:35<00:00, 10.12it/s]


Epoch 8; train_loss=0.1154


Validating Epoch 8: 100%|██████████| 1000/1000 [00:21<00:00, 47.48it/s]


Epoch 8; val_loss=0.6824; val_f1=0.7308
Saving model with F1=0.7307519284349971 for epoch=8


Training Epoch 9: 100%|██████████| 4000/4000 [06:35<00:00, 10.11it/s]


Epoch 9; train_loss=0.0889


Validating Epoch 9: 100%|██████████| 1000/1000 [00:20<00:00, 48.91it/s]


Epoch 9; val_loss=0.6337; val_f1=0.7307


Training Epoch 10: 100%|██████████| 4000/4000 [06:29<00:00, 10.27it/s]


Epoch 10; train_loss=0.0730


Validating Epoch 10: 100%|██████████| 1000/1000 [00:20<00:00, 49.68it/s]


Epoch 10; val_loss=0.7137; val_f1=0.7342
Saving model with F1=0.7342290676075167 for epoch=10


In [37]:
tokenizer, model = load_model()

# Evaluation

In [38]:
model.eval()
all_probs, all_labels = [], []
macro = 0

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating", leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast(device):
          with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            probs  = torch.sigmoid(logits)

        all_probs.append(probs.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

probs  = np.vstack(all_probs) # shape (N, 4)
all_labels = np.vstack(all_labels) # shape (N, 4)

                                                              

In [39]:
# find best threshold for each label
thresholds = {}
for i, name in enumerate(label_names):
    p, r, t = precision_recall_curve(all_labels[:, i], probs[:, i])
    f1 = 2*p*r/(p+r+1e-8)
    idx = np.nanargmax(f1[:-1])
    thresholds[name] = t[idx]
    print(f"{name}: best_thresh={thresholds[name]:.3f}")

toxic: best_thresh=0.825
severe_toxic: best_thresh=0.984
obscene: best_thresh=0.875
insult: best_thresh=0.314


In [40]:
preds = np.zeros_like(probs, dtype=int)
for i, name in enumerate(thresholds):
    preds[:, i] = (probs[:, i] >= thresholds[name]).astype(int)
    print(f"{name}: {f1_score(all_labels[:,i], preds[:,i]):.4f}")

print(f"Macro F1: {np.mean([f1_score(all_labels[:,i], preds[:,i]) for i in range(probs.shape[1])]):.4f}")

toxic: 0.8368
severe_toxic: 0.5239
obscene: 0.8535
insult: 0.7701
Macro F1: 0.7461


# Submission

In [41]:
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {k: v.squeeze(0) for k, v in inputs.items()}

In [42]:
df_test = pd.read_csv(f"{root_path}/test_data.csv")
X_submit = prep_df(df_test)

test_ds = TestDataset(X_submit, tokenizer, max_len)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [47]:
model.eval()
all_probs = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        logits = model(input_ids=batch["input_ids"].to(device),
                       attention_mask=batch["attention_mask"].to(device)).logits
        all_probs.append(torch.sigmoid(logits).cpu().numpy())

probs = np.vstack(all_probs)

th_array = np.array([thresholds[name] for name in thresholds])
preds_submit = (probs >= th_array).astype(int)

Predicting: 100%|██████████| 566/566 [00:28<00:00, 19.94it/s]


In [48]:
submission = pd.DataFrame({
    "datapointID": df_test["id"],
    "answer": preds_submit.tolist(),
    "subtaskID": 1
})

submission.head()

Unnamed: 0,datapointID,answer,subtaskID
0,00091c35fa9d0465,"[1, 0, 0, 0]",1
1,0071940212267fea,"[1, 0, 0, 0]",1
2,0072b9c3697ab8cc,"[1, 0, 0, 1]",1
3,0081b14d79f54b31,"[1, 0, 0, 0]",1
4,00950f0fae33869f,"[1, 1, 1, 1]",1


In [49]:
submission.to_csv("submission.csv", index=False)