In [1]:
import json
import re
from sklearn.metrics import precision_score, recall_score, f1_score
from bert_score import score as bertscore
from collections import Counter

In [2]:
import pandas as pd
# Load the test split; later cells read its Entities, Label and Answer columns.
test_df = pd.read_csv(filepath_or_buffer="../../dataset_for_hf/test.csv")


## Estimate input token counts

In [3]:
import json
import tiktoken

# Choose a tokenizer (gpt-3.5-turbo is similar to deepseek-chat).
# The counts below are therefore an estimate, not an exact DeepSeek token count.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Approximate chat-format overhead, in tokens.
TOKENS_PER_MESSAGE = 4    # role + message structure, approx. ~4 tokens per message
REPLY_PRIMING_TOKENS = 2  # ChatML priming, added once per request

total_tokens = 0
input_tokens_per_sample = []

with open("../../samples_for_eval/zero_shot/deepseek/batch_input.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        sample = json.loads(line)
        messages = sample["body"]["messages"]

        # For chat messages, tokenize each message's content and add the
        # per-message overhead.
        num_tokens = 0
        for msg in messages:
            num_tokens += TOKENS_PER_MESSAGE
            num_tokens += len(enc.encode(msg["content"]))

        # Account for message formatting overhead.
        num_tokens += REPLY_PRIMING_TOKENS
        total_tokens += num_tokens
        input_tokens_per_sample.append(num_tokens)

# Report the real number of samples read instead of a hard-coded count, and
# avoid dividing by zero when the file contained no samples.
num_samples = len(input_tokens_per_sample)
print(f"Total input tokens (all {num_samples} samples): {total_tokens}")
if num_samples:
    print(f"Average tokens per sample: {total_tokens / num_samples:.2f}")
else:
    print("Average tokens per sample: n/a (no samples found)")


Total input tokens (all 168 samples): 34563
Average tokens per sample: 205.73


## Evaluation functions

In [3]:
def normalize_text(text):
    """Normalize text for lexical comparison.

    Lowercases ``text``, removes every character that is neither a word
    character (letter, digit, underscore) nor whitespace, and collapses each
    run of whitespace into a single space, with no leading or trailing
    whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    # Stripping punctuation can leave doubled spaces ("a - b" -> "a  b");
    # collapse them so comparisons are not sensitive to spacing.
    return " ".join(text.split())

def exact_match(pred, gold):
    """Return 1 if ``pred`` and ``gold`` are equal after normalization, else 0."""
    normalized_pred = normalize_text(pred)
    normalized_gold = normalize_text(gold)
    return 1 if normalized_pred == normalized_gold else 0

def lexical_f1(pred, gold):
    """Token-overlap F1 between a prediction and a gold answer.

    Both strings are normalized and split on whitespace. Overlap is counted
    as a multiset intersection, so a repeated token is matched at most as
    many times as it occurs on both sides.

    Returns:
        0.0 when the two token lists share nothing, otherwise the harmonic
        mean of token precision and token recall.
    """
    predicted = Counter(normalize_text(pred).split())
    reference = Counter(normalize_text(gold).split())
    overlap = sum((predicted & reference).values())
    if not overlap:
        return 0.0
    # The counter totals equal the lengths of the respective token lists.
    precision = overlap / sum(predicted.values())
    recall = overlap / sum(reference.values())
    return (2 * precision * recall) / (precision + recall)

In [4]:

def evaluate_all(pred_path, gold_dict):
    """Score a JSONL file of model responses against gold answers and print metrics.

    Each non-blank line of ``pred_path`` is parsed as a JSON object holding a
    ``response`` string and a ``custom_id``. The predicted answer is taken
    from the first ``ANSWER:`` marker (case-insensitive): whitespace after the
    marker is skipped and the rest of that line is used. A response without
    the marker is scored as an empty prediction. Records whose ``custom_id``
    has no non-empty entry in ``gold_dict`` are excluded from every metric.

    Prints the mean lexical exact match, mean lexical F1 and mean BERTScore F1.

    Args:
        pred_path: Path to the JSONL predictions file.
        gold_dict: Mapping from ``custom_id`` to the gold answer string.

    Raises:
        ValueError: If no record in ``pred_path`` has a gold answer in
            ``gold_dict`` (for example when the key format does not match).
    """
    preds = []
    golds = []
    ems = []
    f1s = []
    skipped = 0  # records dropped because gold_dict has no usable answer

    with open(pred_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines such as a trailing newline at end of file.
                continue
            obj = json.loads(line)
            pred = obj['response']
            cid = obj['custom_id']
            # Assume answer format is:
            # "REASON: ... \nANSWER: <answer>"
            match = re.search(r'ANSWER:\s*(.+)', pred, re.IGNORECASE)
            pred_answer = match.group(1).strip() if match else ""
            gold_answer = gold_dict.get(cid, "").strip()

            if not gold_answer:
                skipped += 1
                continue

            preds.append(pred_answer)
            golds.append(gold_answer)
            ems.append(exact_match(pred_answer, gold_answer))
            f1s.append(lexical_f1(pred_answer, gold_answer))

    if not preds:
        # Fail with an actionable message instead of a ZeroDivisionError below.
        raise ValueError(
            f"No record in {pred_path!r} has a gold answer in gold_dict; "
            "check that the custom_id format matches the gold_dict keys."
        )
    if skipped:
        # Surface silently dropped records: they change the denominator of
        # every metric printed below.
        print(f"Warning: skipped {skipped} record(s) with no gold answer in gold_dict.")

    # Compute BERTScore; only the F1 component is reported.
    _, _, F1_bert = bertscore(preds, golds, lang='en', verbose=False)

    print(f"Lexical Exact Match: {sum(ems) / len(ems):.4f}")
    print(f"Lexical F1 Score:    {sum(f1s) / len(f1s):.4f}")
    print(f"BERTScore F1:        {F1_bert.mean().item():.4f}")

## Zero-shot

In [6]:

# Gold answers keyed by "<Entities>-<Label>"; evaluate_all looks each record's
# custom_id up in this mapping.
gold_dict_zs = dict(
    (f"{row.Entities}-{row.Label}", row.Answer.strip())
    for row in test_df.itertuples()
)
# Mistral, zero-shot setting.
evaluate_all(
    pred_path="../../samples_for_eval/zero_shot/mistral/batch_results.jsonl",
    gold_dict=gold_dict_zs,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.0161
Lexical F1 Score:    0.0650
BERTScore F1:        0.7974


In [7]:
# Mistral, upper-bound setting.
evaluate_all(
    pred_path="../../samples_for_eval/upper_bound/mistral/batch_results.jsonl",
    gold_dict=gold_dict_zs,
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.0484
Lexical F1 Score:    0.3457
BERTScore F1:        0.8445




In [9]:

# Gold answers keyed by "<Entities>_<Label>:zero_shot"; evaluate_all looks each
# record's custom_id up in this mapping.
gold_dict_zs = dict(
    (f"{row.Entities}_{row.Label}:zero_shot", row.Answer.strip())
    for row in test_df.itertuples()
)

In [10]:
# TxGemma, zero-shot setting (responses_new.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/zero_shot/txgemma/responses_new.jsonl",
    gold_dict=gold_dict_zs,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.0714
Lexical F1 Score:    0.1514
BERTScore F1:        0.7622




In [9]:
# TxGemma, zero-shot setting (responses.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/zero_shot/txgemma/responses.jsonl",
    gold_dict=gold_dict_zs,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.0179
Lexical F1 Score:    0.1298
BERTScore F1:        0.7730




In [10]:
# DeepSeek, zero-shot setting (responses_r1.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/zero_shot/deepseek/responses_r1.jsonl",
    gold_dict=gold_dict_zs,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.3333
Lexical F1 Score:    0.3998
BERTScore F1:        0.8771


In [13]:
# Qwen, zero-shot setting (responses_new.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/zero_shot/qwen/responses_new.jsonl",
    gold_dict=gold_dict_zs,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.2151
Lexical F1 Score:    0.2498
BERTScore F1:        0.5142




In [11]:
# Qwen, zero-shot setting (responses.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/zero_shot/qwen/responses.jsonl",
    gold_dict=gold_dict_zs,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.0893
Lexical F1 Score:    0.1116
BERTScore F1:        0.2366




In [11]:
# DeepSeek, zero-shot setting (responses.jsonl).
# Score against the zero-shot gold dict: `gold_dict` is only defined further
# down (with ':gold_injected' keys), so referencing it here raises NameError
# on a fresh top-to-bottom run.
# NOTE(review): assumes this file's custom_ids carry the ':zero_shot' suffix
# like the other zero-shot response files — confirm against the data.
evaluate_all("../../samples_for_eval/zero_shot/deepseek/responses.jsonl", gold_dict_zs)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.2500
Lexical F1 Score:    0.3180
BERTScore F1:        0.8569


In [6]:
# Llama, zero-shot setting (responses.jsonl).
# Score against the zero-shot gold dict: `gold_dict` is only defined further
# down (with ':gold_injected' keys), so referencing it here raises NameError
# on a fresh top-to-bottom run.
# NOTE(review): assumes this file's custom_ids carry the ':zero_shot' suffix
# like the other zero-shot response files — confirm against the data.
evaluate_all("../../samples_for_eval/zero_shot/llama/responses.jsonl", gold_dict_zs)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.2738
Lexical F1 Score:    0.3433
BERTScore F1:        0.8548




## Upper bound (gold-injected)


In [6]:

# Gold answers keyed by "<Entities>_<Label>:gold_injected"; evaluate_all looks
# each record's custom_id up in this mapping.
# A commented-out alternative in an earlier version of this cell used the
# ":upper_bound" suffix instead of ":gold_injected".
gold_dict = dict(
    (f"{row.Entities}_{row.Label}:gold_injected", row.Answer.strip())
    for row in test_df.itertuples()
)

In [11]:
# TxGemma, upper-bound setting (responses_new.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/upper_bound/txgemma/responses_new.jsonl",
    gold_dict=gold_dict,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.1369
Lexical F1 Score:    0.1479
BERTScore F1:        0.2909




In [12]:
# DeepSeek, upper-bound setting (responses_r1.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/upper_bound/deepseek/responses_r1.jsonl",
    gold_dict=gold_dict,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.9048
Lexical F1 Score:    0.9048
BERTScore F1:        0.9779


In [16]:
# Qwen, upper-bound setting (responses.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/upper_bound/qwen/responses.jsonl",
    gold_dict=gold_dict,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.7976
Lexical F1 Score:    0.7976
BERTScore F1:        0.8794




In [9]:
# Llama, upper-bound setting (responses.jsonl).
evaluate_all(
    pred_path="../../samples_for_eval/upper_bound/llama/responses.jsonl",
    gold_dict=gold_dict,
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Lexical Exact Match: 0.7143
Lexical F1 Score:    0.7143
BERTScore F1:        0.9319
