In [None]:
import os
import random
import time

import gqr
import numpy as np
import pandas as pd
import torch
import wandb
from datasets import Dataset
from dotenv import load_dotenv
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [None]:
# Load variables from a local .env file (if any) into the process environment.
# NOTE(review): presumably this supplies credentials such as the W&B API key
# used by `wandb.init` below — confirm against the project's .env file.
load_dotenv()


def set_seed(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch keeps its own generators; without seeding them, weight
    # initialisation and dropout stay non-reproducible.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this only takes effect in processes spawned after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)


# Fix the random seed before any model or data work happens.
SEED = 22
set_seed(SEED)

# Tell the `tokenizers` library not to use its own parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Checkpoint to fine-tune. The original author also noted
# "answerdotai/ModernBERT-base" as an alternative value.
model_name = "google-bert/bert-base-multilingual-cased"
# Directory passed to the Trainer as `output_dir`; checkpoints are read back from it later.
save_dir = "models-bert-multilingual-cased"

In [None]:
# Open a Weights & Biases run in the "ood-bert" project; the TrainingArguments
# further down set `report_to="wandb"`.
run_tags = ["bert", "classification"]
wandb.init(project="ood-bert", tags=run_tags)

In [None]:
# Fetch the training and evaluation splits from `gqr`. The cells below index
# both with "text" and "label" columns and pass them to pandas helpers.
train_data, eval_data = gqr.load_train_dataset()

In [None]:
# One-hot encode the evaluation labels into float targets for the multi-label head.
# The intermediate frame gets its own name (as the training cell does with
# `train`) so `eval_data` is never overwritten: the original reassignment dropped
# the "label" column, which made this cell fail with a KeyError when re-run.
eval_onehot = pd.get_dummies(eval_data["label"], prefix="class")
# Selecting the three class columns explicitly fails loudly if a class is
# missing from the evaluation split.
eval_frame = pd.concat([eval_data, eval_onehot], axis=1)[
    ["text", "class_0", "class_1", "class_2"]
]
eval_labels = eval_onehot.astype(np.float32).values.tolist()
eval_texts = eval_frame["text"].tolist()

In [None]:
# One-hot encode the training labels into float targets for the multi-label head.
train_onehot = pd.get_dummies(train_data["label"], prefix="class")

# Text plus the three class indicator columns; the explicit selection raises a
# KeyError if any of the expected classes is absent.
train = pd.concat([train_data, train_onehot], axis=1)
train = train[["text", "class_0", "class_1", "class_2"]]

# Plain Python lists are what `Dataset.from_dict` receives below.
train_labels = train_onehot.astype(np.float32).values.tolist()
train_texts = train["text"].tolist()

In [None]:
# Tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the raw texts and their one-hot targets in Hugging Face datasets.
train_dataset = Dataset.from_dict(dict(text=train_texts, labels=train_labels))
eval_dataset = Dataset.from_dict(dict(text=eval_texts, labels=eval_labels))


def tokenize_function(examples: dict) -> dict:
    """Tokenize a batch of examples for ``Dataset.map(..., batched=True)``.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here: the ``DataCollatorWithPadding`` created in this cell pads each batch
    to its own longest sequence, so short texts are no longer padded out to
    512 tokens for every training and evaluation step.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw strings.

    Returns:
        The tokenizer output for the batch (unpadded, variable length).
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)


# Tokenize both splits, then drop the raw text column.
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["text"])
eval_dataset = eval_dataset.remove_columns(["text"])

# Format as CPU tensors. The Trainer moves every collated batch to the model's
# device itself, so keeping the dataset tensors on the GPU only costs memory.
train_dataset.set_format(type="torch")
eval_dataset.set_format(type="torch")

# Pads each batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# `reference_compile` is a ModernBERT option, so it is forwarded only for
# ModernBERT checkpoints.
# NOTE(review): other architectures (e.g. BERT) appear to reject unknown
# keyword arguments with a TypeError — confirm with the installed transformers.
extra_model_kwargs = (
    {"reference_compile": False} if "modernbert" in model_name.lower() else {}
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=3,
    **extra_model_kwargs,
)

model = model.to("cuda")

In [None]:
# Fine-tuning configuration, grouped by concern.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_options = dict(
    # Where checkpoints are written, and whether to publish them.
    output_dir=save_dir,
    push_to_hub=False,
    # Optimisation.
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    # Evaluate and checkpoint on the same 250-step cadence.
    evaluation_strategy="steps",
    eval_steps=250,
    save_strategy="steps",
    save_steps=250,
    load_best_model_at_end=True,
    # Log every single step to Weights & Biases.
    logging_strategy="steps",
    logging_steps=1,
    report_to="wandb",
)
training_args = TrainingArguments(**training_options)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

In [None]:
# Reload a saved checkpoint for inference.
# NOTE(review): the step number is hard-coded; this directory only exists if
# training ran for at least 2000 steps — confirm it is the intended checkpoint.
checkpoint_dir = f"{save_dir}/checkpoint-2000"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_dir,
    problem_type="multi_label_classification",
    num_labels=3,
)
model = model.to("cuda")

# In-domain and out-of-domain test sets scored in the next cells.
domain = gqr.load_id_test_dataset()
ood = gqr.load_ood_test_dataset()

In [None]:
# Score the in-domain test set one text at a time and store the per-class
# sigmoid probabilities next to each row.
model.eval()

all_probabilities = []
for text in tqdm(domain["text"].tolist(), desc="Domain"):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    inputs = inputs.to("cuda")
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.sigmoid(outputs.logits)
    # A single text yields a batch of one; keep just that row.
    all_probabilities.append(probabilities[0].tolist())
domain["pred"] = all_probabilities

# `to_csv` does not create missing parent directories; make sure the target
# exists so the finished inference run is not lost at the save step.
os.makedirs("data/results", exist_ok=True)
domain.to_csv("data/results/bert_domain_results.csv", index=False)

In [None]:
# Score the out-of-domain test set one text at a time and store the per-class
# sigmoid probabilities next to each row.
# Set inference mode here too, so this cell does not rely on an earlier cell
# having done it.
model.eval()

all_probabilities = []
for text in tqdm(ood["text"].values.tolist()):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    inputs = inputs.to("cuda")

    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.sigmoid(outputs.logits)
    # A single text yields a batch of one; keep just that row.
    all_probabilities.append(probabilities[0].tolist())
ood["pred"] = all_probabilities

# `to_csv` does not create missing parent directories; make sure the target
# exists so the finished inference run is not lost at the save step.
os.makedirs("data/results", exist_ok=True)
ood.to_csv("data/results/bert_ood_results.csv", index=False)

In [None]:
def batch_inference(model: AutoModelForSequenceClassification, texts: list) -> list:
    """Return per-class sigmoid probabilities for a batch of texts.

    The texts are tokenized together with the module-level ``tokenizer``
    (padded to the longest text in the batch, truncated), run through the
    model in a single forward pass without gradients, and the logits are
    passed through a sigmoid.

    Args:
        model: Sequence-classification model. It is switched to eval mode and
            the inputs are moved to whichever device it lives on.
        texts: Raw input strings forming one batch.

    Returns:
        One list of float probabilities per input text; an empty list when
        ``texts`` is empty.
    """
    model.eval()
    # Nothing to tokenize or score for an empty batch.
    if len(texts) == 0:
        return []

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    # Follow the model's device rather than assuming it is on CUDA.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    return torch.sigmoid(logits).to("cpu").numpy().tolist()

In [None]:
def load_batch_data() -> list:
    """Read the benchmark prompts from ``batch_data.csv``.

    Returns:
        The values of the ``prompt`` column as a plain Python list, in file
        order.
    """
    prompt_column = pd.read_csv("batch_data.csv")["prompt"]
    prompts = prompt_column.values.tolist()
    return prompts
# Batch sizes to time, from a single prompt up to 256 prompts per call.
batch_sizes = [1, 32, 64, 128, 256]
# Prompts for the benchmark, read once up front.
batch_data = load_batch_data()

In [None]:
# Time `batch_inference` over the whole prompt list for every batch size.
# One record is stored per batch: its predictions and wall-clock duration.
batch_results = []
for batch_size in tqdm(batch_sizes):
    # Slice batches lazily instead of materialising the full list of batches.
    for offset in range(0, len(batch_data), batch_size):
        batch = batch_data[offset : offset + batch_size]

        # The measured span covers tokenization, the forward pass and the
        # copy of the probabilities back to the CPU.
        start_time = time.perf_counter()
        preds = batch_inference(model, batch)
        time_taken = time.perf_counter() - start_time

        batch_results.append(
            {
                "batch_size": batch_size,
                "results": preds,
                "model_name": "bert-base-multilingual-cased",
                "time_taken": time_taken,
            }
        )
batch_results_df = pd.DataFrame(batch_results)

# `to_csv` does not create missing parent directories; make sure the target
# exists so the benchmark results are not lost at the save step.
os.makedirs("data/results/batch", exist_ok=True)
batch_results_df.to_csv("data/results/batch/bert_batch_results.csv")