In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import re
import unicodedata

from transformers import BertTokenizer, BertForSequenceClassification

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim

import torchmetrics

In [2]:
seed = 42
torch.random.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"

test_split = 0.33
batch_size = 4
epochs = 4

root_path = "E:\\IOAI\\kits\\nitro-nlp-24"

# Data preparation

In [3]:
class TextDataset(Dataset):
    def __init__(self, csv_path):
        self.df = pd.read_csv(f"{root_path}\\{csv_path}")

        if not "test" in csv_path:
            self.df = self.df.sample(frac=0.1, random_state=seed)

        self.df[["title", "content"]] = self.df[["title", "content"]].fillna(value="")
        self.df["text"] = self.df["title"]# + " " + self.df["content"]
        self.df["text"] = self.df["text"].apply(self.clean_text)

        if "class" in self.df:
            self.df["class"] = self.df["class"].astype(np.uint8)

        # self.max_length = max(self.df["text"].apply(lambda s: len(s)))
        self.max_length = 64

        self.tokenizer = BertTokenizer.from_pretrained(
            "dumitrescustefan/bert-base-romanian-cased-v1",
            strip_accents=False, lower_case=True
        )

    def clean_text(self, text: str):
        text = re.sub(r"http[s]?://\S+|www\.\S+", " ", text)
        text = re.sub(r"<[^>]+>", " ", text)
        text = re.sub(r"[@#]\w+", " ", text)
        text = re.sub(r"[\$€£¥₹]", " moneda ", text)
        text = str(unicodedata.normalize('NFC', text).encode("ascii", "ignore"))
        text = re.sub(r"[^\w\s]", " ", text)

        def _reduce_repeats(match):
            char = match.group(1)
            return char * 2

        text = re.sub(r"(\w)\1{2,}", _reduce_repeats, text)
        text = text.lower()
        text = re.sub(r"\s{2,}", " ", text).strip()

        return text
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        encoding = self.tokenizer(
            row["text"],
            padding="max_length",
            truncation=True, 
            max_length=self.max_length,
            return_tensors="pt",
            return_attention_mask=True,
            return_token_type_ids=False, 
        )

        if "class" in self.df:
            return {
                'id': row["id"],
                'input_ids': encoding['input_ids'].squeeze(0), # (seq_len)
                'attention_mask': encoding['attention_mask'].squeeze(0), # (seq_len)
                'label': torch.tensor(row["class"], dtype=torch.long) # (1)
            }
        return {
            "id": row["id"],
            "input_ids": encoding["input_ids"].squeeze(0),  # (seq_len)
            "attention_mask": encoding["attention_mask"].squeeze(0),  # (seq_len)
        }

    def __len__(self):
        return len(self.df)

In [4]:
dataset = TextDataset("train.csv")

dataset_size = len(dataset)
test_size = int(test_split * dataset_size)
train_size = dataset_size - test_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

len(train_loader), len(test_loader)

(1183, 583)

In [5]:
# sanity check
batch = next(iter(train_loader))

[v.shape for v in batch.values()]

[torch.Size([4]), torch.Size([4, 64]), torch.Size([4, 64]), torch.Size([4])]

# Model selection

In [6]:
model = BertForSequenceClassification.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1", num_labels=2)
model = model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=1e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dumitrescustefan/bert-base-romanian-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
labels = batch["label"].to(device)

outputs = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
)

outputs.loss

tensor(0.8526, device='cuda:0', grad_fn=<NllLossBackward0>)

In [8]:
def evaluate():
    balanced_accuracy = torchmetrics.Accuracy(
        task="multiclass", num_classes=2, average="macro"
    ).to(device)

    model.eval()

    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            preds = outputs.logits.argmax(dim=-1)
            balanced_accuracy.update(preds, labels)

    return balanced_accuracy.compute()

In [None]:
def train():
    for epoch in range(1, epochs+1):
        model.train()
        running_loss = 0.0
        for batch in tqdm(train_loader):
            input_ids, attention_mask, labels = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["label"].to(device),
            )

            # forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

            # backward pass
            model.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch}; loss={(running_loss/len(train_loader)):.3f}")
    torch.save(model.state_dict(), "bert_sarcasm.pth")

should_train = True
if should_train:
    train()
    # 1/0 # to pause notebook execution

100%|██████████| 1183/1183 [01:28<00:00, 13.38it/s]


Epoch 1; loss=0.267


100%|██████████| 1183/1183 [01:30<00:00, 13.05it/s]


Epoch 2; loss=0.116


100%|██████████| 1183/1183 [01:28<00:00, 13.44it/s]


Epoch 3; loss=0.038


100%|██████████| 1183/1183 [01:28<00:00, 13.44it/s]


Epoch 4; loss=0.016


ZeroDivisionError: division by zero

In [10]:
evaluate()

100%|██████████| 583/583 [00:10<00:00, 57.17it/s]


tensor(0.9224, device='cuda:0')

In [11]:
model.load_state_dict(torch.load("bert_sarcasm.pth"))
model = model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Submission

In [12]:
dataset = TextDataset("test.csv")
val_loader = DataLoader(dataset, batch_size=1)
len(val_loader)

36669

In [13]:
subtask, ids = [], []

for batch in tqdm(val_loader):
    input_ids, attention_mask = (
        batch["input_ids"].to(device),
        batch["attention_mask"].to(device),
    )

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )

    pred = torch.argmax(outputs.logits).item()

    subtask.append(pred)
    ids.append(batch["id"].item())

100%|██████████| 36669/36669 [06:49<00:00, 89.48it/s]


In [15]:
submission = pd.DataFrame({"id": ids, "class": subtask})

submission["class"].value_counts()

class
0    25766
1    10903
Name: count, dtype: int64

In [16]:
submission.to_csv("submission.csv", index=False)