In [1]:
import ast
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
import torch


In [2]:

gold = pd.read_csv("data/ct_dev.tsv", sep="\t")
# The labels column is read as text and parsed into an indexable sequence
# (positions 0, 1 and 2 are read below). ast.literal_eval accepts Python
# literals only, so a malformed or hostile cell raises an error instead of
# executing arbitrary code the way eval() would.
gold["labels"] = gold["labels"].apply(ast.literal_eval)

# Unpack true labels into one integer column per category
gold["cat1_true"] = gold["labels"].apply(lambda x: int(x[0]))
gold["cat2_true"] = gold["labels"].apply(lambda x: int(x[1]))
gold["cat3_true"] = gold["labels"].apply(lambda x: int(x[2]))

# Per-model prediction files; evaluate_predictions reads their
# cat1_pred / cat2_pred / cat3_pred columns after the merge.
preds_bert = pd.read_csv("output/predictions_debert.csv")
preds_llama = pd.read_csv("output/predictions_llama.csv")

# Merge with gold by 'index'. pd.merge defaults to an inner join, so rows
# whose index is missing from either side are dropped silently.
df_bert = pd.merge(gold, preds_bert, on="index")
df_llama = pd.merge(gold, preds_llama, on="index")


In [3]:
def evaluate_predictions(df, model_name="Model"):
    """Compute and print per-category and macro-averaged metrics.

    Args:
        df: DataFrame holding ``cat{1,2,3}_true`` and ``cat{1,2,3}_pred``
            columns (the merged gold/prediction frame).
        model_name: Label used in the printed header.

    Returns:
        dict mapping ``"<cat>_acc"``, ``"<cat>_prec"``, ``"<cat>_rec"`` and
        ``"<cat>_f1"`` for each category, plus ``"macro_f1"``, to floats.

    Raises:
        KeyError: if any of the expected columns is missing from ``df``.
    """
    categories = ("cat1", "cat2", "cat3")
    true_cols = [f"{cat}_true" for cat in categories]
    pred_cols = [f"{cat}_pred" for cat in categories]

    metrics = {}
    for cat, true_col, pred_col in zip(categories, true_cols, pred_cols):
        y_true = df[true_col]
        y_pred = df[pred_col]

        metrics[f"{cat}_acc"] = accuracy_score(y_true, y_pred)
        metrics[f"{cat}_prec"] = precision_score(y_true, y_pred)
        metrics[f"{cat}_rec"] = recall_score(y_true, y_pred)
        metrics[f"{cat}_f1"] = f1_score(y_true, y_pred)

    # Macro F1 across all 3 categories: the three label columns are passed
    # together as one 2-D array per side, with average="macro".
    metrics["macro_f1"] = f1_score(
        df[true_cols].to_numpy(),
        df[pred_cols].to_numpy(),
        average="macro",
    )

    print(f"\n {model_name} Metrics:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")
    return metrics


In [4]:
# Score the DeBERTa predictions; the call prints the metrics and returns
# them as a dict.
bert_metrics = evaluate_predictions(df=df_bert, model_name="DeBERTa")
# Same evaluation for the LLaMA predictions.
llama_metrics = evaluate_predictions(df=df_llama, model_name="LLaMA")


 DeBERTa Metrics:
cat1_acc: 0.9197
cat1_prec: 0.7419
cat1_rec: 0.8846
cat1_f1: 0.8070
cat2_acc: 0.9197
cat2_prec: 0.7778
cat2_rec: 0.8077
cat2_f1: 0.7925
cat3_acc: 0.9197
cat3_prec: 0.8108
cat3_rec: 0.8824
cat3_f1: 0.8451
macro_f1: 0.8148

 LLaMA Metrics:
cat1_acc: 0.9197
cat1_prec: 0.7586
cat1_rec: 0.8462
cat1_f1: 0.8000
cat2_acc: 0.9197
cat2_prec: 0.7778
cat2_rec: 0.8077
cat2_f1: 0.7925
cat3_acc: 0.9197
cat3_prec: 0.8108
cat3_rec: 0.8824
cat3_f1: 0.8451
macro_f1: 0.8125


In [14]:
# Load and tokenize the test set (without labels)
# NOTE(review): `dl`, `trainer` and `annotate_test_dataframe` are not defined
# in any cell visible here; presumably they come from earlier training cells.
# Confirm this cell survives Restart Kernel -> Run All.
print("Load and tokenize test data")
test_data = pd.read_csv("data/ct_test.tsv", sep='\t')  # Adjust file path if needed

# Add dummy labels for compatibility. Build a separate list for every row:
# `[[0.0, 0.0, 0.0]] * n` would store the *same* list object in all rows, so
# an in-place change to one row's labels would silently change every row.
test_data["labels"] = [[0.0, 0.0, 0.0] for _ in range(len(test_data))]

# Save to a temporary file so the DataLoader can read it with labels
# (the file is left on disk after the cell finishes)
temp_test_path = "temp_ct_test_with_labels.tsv"
test_data.to_csv(temp_test_path, sep="\t", index=False)

# Reuse the data loader; only the first element of its return value is used
test_ds, _ = dl.get_dataset(temp_test_path)

# Predict on the test set
print("Predicting on test set")
test_pred_output = trainer.predict(test_ds)

# Annotate test predictions
test_df = annotate_test_dataframe(test_data, test_pred_output)

# Save predictions: keep only the row index and the three prediction columns
submission_test_df = test_df[["index", "cat1_pred", "cat2_pred", "cat3_pred"]]
submission_test_df.to_csv("test_predictions.csv", index=False)
print("Test predictions saved to test_predictions.csv")


Load and tokenize test data


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Predicting on test set


Test predictions saved to test_predictions.csv
