In [2]:
import pandas as pd
import torch
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import os

# Expose only GPUs 1 and 2 to this process and enable expandable allocator segments.
# NOTE: these variables must be in place before CUDA is first initialised in this
# kernel, so keep this at the very top of the cell (before any CUDA call).
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
os.environ["PYTORCH_CUDA_ALLOC_CONF"]="expandable_segments:True"

# Load the dataset and drop rows with missing responses
df = pd.read_csv("./data/mrbench_v3_devset_train_data.csv").dropna(subset=["response"])

# Convert string labels to integer indices.
# Sorting gives a stable class order; both directions of the mapping are kept so
# predictions can be decoded back to the original label strings.
label_list = sorted(df["label"].unique())
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Replace string labels with integer labels in the DataFrame
df["label"] = df["label"].map(label_to_id)

# Split by conversation_id so that all rows of one conversation land on the same
# side of the split (no conversation is shared between train and test).
conversation_ids = df["conversation_id"].unique()
train_ids, test_ids = train_test_split(conversation_ids, test_size=0.2, random_state=42)

train_df = df[df["conversation_id"].isin(train_ids)].reset_index(drop=True)
test_df = df[df["conversation_id"].isin(test_ids)].reset_index(drop=True)

# Each model input is the conversation history followed by the response to classify.
train_texts = [f"Conversation: {c}\n\nResponse: {r}" for c, r in zip(train_df["conversation_history"].tolist(), train_df["response"].tolist())]
test_texts = [f"Conversation: {c}\n\nResponse: {r}" for c, r in zip(test_df["conversation_history"].tolist(), test_df["response"].tolist())]

# Build Hugging Face Dataset objects from the prepared texts and integer labels
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_df["label"].tolist()})
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_df["label"].tolist()})

# Initialize tokenizer & model.
# The label mappings are handed to the config at load time so that every checkpoint
# written during training (not only the final save) stores the real class names
# instead of the generic LABEL_0..LABEL_n placeholders.
model_name = "google/bigbird-roberta-large"
tokenizer = BigBirdTokenizer.from_pretrained(model_name)
model = BigBirdForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 1024 tokens, so all
    encoded examples share one fixed length.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 1024,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize both splits the same way (train first, then test); the raw "text"
# column is dropped once it has been encoded.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True, remove_columns=["text"])
    for split in (train_dataset, test_dataset)
)

# Compute metrics function for evaluation
def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy and macro-F1.

    `eval_pred` unpacks into (logits, labels); the predicted class of each
    example is the index of its largest logit along the last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    macro_f1 = f1_score(gold, predicted, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluates at the end of each epoch
    save_strategy="epoch",          # saves checkpoint at each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

# Initialize Trainer.
# `processing_class` replaces the deprecated `tokenizer` argument (the old name
# triggers a FutureWarning and is scheduled for removal in transformers 5.0).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
torch.cuda.empty_cache()
trainer.train()

# Evaluate the model on the held-out split
predictions = trainer.predict(tokenized_test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

# Always report on the full label set. Without an explicit `labels=` list,
# classification_report infers the classes from the data, and a class that is
# absent from both the test split and the predictions would make the
# `target_names` list below mismatch in length and raise a ValueError.
all_label_ids = list(range(len(label_list)))

# Generate detailed classification report (using integer labels)
print("Detailed Classification Report (Integer Labels):")
print(classification_report(true_labels, preds, labels=all_label_ids, digits=4))

# Generate classification report with target names (map back to original string labels)
target_names = [id_to_label[i] for i in all_label_ids]
print("Detailed Classification Report (String Labels):")
print(classification_report(true_labels, preds, labels=all_label_ids, target_names=target_names, digits=4))

# --- Save the Fine-tuned Model ---
# Before saving, update the model configuration with label mappings so that id2label and label2id
# are stored in the saved config.json. This is necessary because categorical labels are not handled by default.
model.config.id2label = id_to_label
model.config.label2id = label_to_id

# Save the model and tokenizer; the config will be saved automatically
save_directory = "./saved_model"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1652/1652 [00:02<00:00, 583.94 examples/s]
Map: 100%|██████████| 415/415 [00:00<00:00, 973.95 examples/s]
  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.982575,0.192771,0.099816
2,No log,0.897466,0.73494,0.72687
3,1.614800,0.667516,0.773494,0.763321
4,1.614800,0.640691,0.795181,0.779161
5,0.411900,0.650935,0.8,0.790676




Detailed Classification Report (Integer Labels):
              precision    recall  f1-score   support

           0     0.8723    0.8200    0.8454        50
           1     0.7895    0.9000    0.8411        50
           2     0.8679    0.9200    0.8932        50
           3     0.8788    0.5800    0.6988        50
           4     0.6200    0.6200    0.6200        50
           5     0.7143    0.8000    0.7547        50
           6     0.6875    0.7333    0.7097        15
           7     0.8600    0.8600    0.8600        50
           8     0.8679    0.9200    0.8932        50

    accuracy                         0.8000       415
   macro avg     0.7954    0.7948    0.7907       415
weighted avg     0.8045    0.8000    0.7975       415

Detailed Classification Report (String Labels):
              precision    recall  f1-score   support

      Expert     0.8723    0.8200    0.8454        50
        GPT4     0.7895    0.9000    0.8411        50
      Gemini     0.8679    0.9200  

In [3]:
# Display the model object held by the trainer (the cell output is its module repr).
trainer.model

BigBirdForSequenceClassification(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 1024, padding_idx=0)
      (position_embeddings): Embedding(4096, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0-23): 24 x BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdBlockSparseAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwi

In [14]:
import numpy as np
def bootstrap_ci(y_test, y_pred, calculate_metric, confidence=0.9, n_resamples=1000, sample_fraction=0.95, random_state=None):
    """
    Compute a percentile-bootstrap confidence interval for a metric function.

    Parameters:
    - y_test: array-like, true labels.
    - y_pred: array-like, predicted labels (same length as y_test).
    - calculate_metric: function, computes a scalar metric given (y_test, y_pred).
      It always receives NumPy arrays, both for the resamples and for the point
      estimate on the full data.
    - confidence: float in [0, 1], confidence level (default 0.9).
    - n_resamples: int, number of bootstrap resamples (default 1000).
    - sample_fraction: float > 0, fraction of data to sample in each resample (default 0.95).
    - random_state: int or None, random seed for reproducibility.

    Returns:
    - dict with two keys:
        "metric": the metric on the full data, rounded to 4 decimals;
        "confidence_interval": (lower_bound, upper_bound), each rounded to 4 decimals.

    Raises:
    - ValueError: if the inputs are empty or differ in length, or if confidence,
      n_resamples or sample_fraction are outside their valid range.
    """
    # Convert once, up front: the point estimate must see the same array types as the
    # resamples. With raw lists an `==`-based metric would compare the two lists as a
    # whole (one bool) instead of element-wise.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    n = len(y_test)
    if n == 0:
        raise ValueError("y_test must contain at least one element")
    if len(y_pred) != n:
        raise ValueError(f"y_test and y_pred must have the same length, got {n} and {len(y_pred)}")
    if not 0 <= confidence <= 1:
        raise ValueError(f"confidence must be within [0, 1], got {confidence}")
    if n_resamples < 1:
        raise ValueError(f"n_resamples must be at least 1, got {n_resamples}")
    if sample_fraction <= 0:
        raise ValueError(f"sample_fraction must be positive, got {sample_fraction}")

    rng = np.random.default_rng(random_state)
    # Never let truncation produce an empty resample on very small inputs.
    sample_size = max(1, int(n * sample_fraction))

    metrics = []
    for _ in range(n_resamples):
        # Sample row positions with replacement and score the resampled pairs.
        indices = rng.choice(n, size=sample_size, replace=True)
        metrics.append(calculate_metric(y_test[indices], y_pred[indices]))

    # Two-sided interval: cut (1 - confidence) / 2 from each tail of the resampled metrics.
    alpha = (1 - confidence) / 2
    lower, upper = np.percentile(metrics, [100 * alpha, 100 * (1 - alpha)])

    # Plain Python floats keep the displayed result free of NumPy scalar wrappers.
    point_estimate = float(calculate_metric(y_test, y_pred))
    return {
        "metric": round(point_estimate, 4),
        "confidence_interval": (round(float(lower), 4), round(float(upper), 4)),
    }

# def calculate_metric(y_true, y_pred):
#     # Example: Mean Absolute Error
#     return np.mean(np.abs(np.array(y_true) - np.array(y_pred)))
# y_test = [1, 2, 3, 4, 5]
# y_pred = [1.1, 1.9, 3.2, 3.8, 5.1]

# print(bootstrap_ci(y_test, y_pred, calculate_metric, confidence=0.9))


{'metric': 0.14, 'confidence_interval': (0.1, 0.175)}


In [15]:
def accuracy_metric(y_true, y_pred):
    """Return the fraction of positions at which y_true and y_pred agree.

    Both inputs may be any array-like (list, tuple, NumPy array). They are
    converted to arrays first so the comparison is element-wise; comparing two
    plain lists with `==` would yield a single bool and a wrong accuracy.

    Raises:
    - ValueError: if the two inputs do not have the same shape (which would
      otherwise broadcast silently).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(f"shape mismatch: {y_true.shape} vs {y_pred.shape}")
    return np.mean(y_true == y_pred)
# Seed the resampling (same seed as the train/test split) so the reported interval
# is reproducible on a fresh kernel run.
bootstrap_ci(true_labels, preds, accuracy_metric, random_state=42)

{'metric': 0.8, 'confidence_interval': (0.7665, 0.8325)}