In [1]:
# Extract STATUS from nested assistant message content (stringified JSON) for first 3 examples
import json

# Inspect the first three prediction/gold pairs. The two JSONL files are paired
# purely by line position, so line k of one file is compared with line k of the other.
with open('tac_outputs_test.jsonl', 'r', encoding='utf-8') as f_pred, open('tac2017_adrs_test.jsonl', 'r', encoding='utf-8') as f_gold:
    for i, (pred_line, gold_line) in enumerate(zip(f_pred, f_gold)):
        pred_obj = json.loads(pred_line)
        gold_obj = json.loads(gold_line)

        # `status` is the raw value shown to the reader; `pred_label` is the
        # boolean actually used for the correctness check. They are kept apart
        # because the raw value may be a string ("True"/"False"), None, or an
        # error message, none of which compare meaningfully against a bool.
        status = None
        pred_label = False
        for msg in pred_obj.get('messages', []):
            if msg.get('role') == 'assistant':
                try:
                    # The assistant content is expected to be stringified JSON.
                    content_json = json.loads(msg['content'])
                    status = content_json.get('STATUS', None)
                except Exception as e:
                    # Surface the parse problem in the printout; the prediction
                    # itself stays negative, as in the full evaluation.
                    status = f"ERROR: {e}"
                else:
                    if isinstance(status, str):
                        # A textual "false"/"no"/... must not be treated as truthy.
                        pred_label = status.strip().lower() not in ('', 'false', 'no', '0', 'none', 'null')
                    else:
                        pred_label = bool(status)
                # Only the first assistant message is considered.
                break

        gold_adr_list = gold_obj.get('ADR_list', [])
        gold_label = bool(gold_adr_list)
        print(f"Example {i+1}")
        print(f"  Assistant STATUS: {status}")
        print(f"  Gold ADR_list: {gold_adr_list}")
        print(f"  Gold label (True if ADR_list non-empty): {gold_label}")
        # Compare normalised booleans so this line agrees with the full evaluation.
        print(f"  Prediction correct? {pred_label == gold_label}")
        print('-'*40)
        if i >= 2:
            break

Example 1
  Assistant STATUS: True
  Gold ADR_list: []
  Gold label (True if ADR_list non-empty): False
  Prediction correct? False
----------------------------------------
Example 2
  Assistant STATUS: True
  Gold ADR_list: []
  Gold label (True if ADR_list non-empty): False
  Prediction correct? False
----------------------------------------
Example 3
  Assistant STATUS: True
  Gold ADR_list: []
  Gold label (True if ADR_list non-empty): False
  Prediction correct? False
----------------------------------------


In [3]:
# Complete evaluation: extract STATUS from assistant message, compare to gold, compute metrics
import json

def extract_statuses_and_gold(pred_path, gold_path):
    """Read line-aligned prediction and gold JSONL files and return boolean labels.

    Each non-blank line of ``pred_path`` is a JSON object with a ``messages``
    list; the first message whose ``role`` is ``'assistant'`` carries a
    ``content`` field holding JSON (stringified, or already a dict) with a
    ``STATUS`` entry. Each non-blank line of ``gold_path`` is a JSON object
    whose ``ADR_list`` entry is read; the gold label is True when that list
    is non-empty.

    NOTE(review): the gold label relies on a top-level ``ADR_list`` key; a
    missing key is treated as an empty list. Confirm this against the gold
    file's schema.

    Args:
        pred_path: Path to the predictions JSONL file.
        gold_path: Path to the gold JSONL file.

    Returns:
        A tuple ``(pred_statuses, gold_labels)`` of two equally long lists of
        bools, one entry per example, in file order.

    Raises:
        ValueError: If the two files contain a different number of non-blank
            lines (the pairing is positional, so metrics would be meaningless),
            or if a line is not valid JSON (``json.JSONDecodeError``).
    """
    # Spellings a model may emit for a negative STATUS when it writes a string
    # instead of a JSON boolean. Plain bool() would treat "False" as True.
    false_strings = {'', 'false', 'no', '0', 'none', 'null'}

    def _to_bool(status):
        # Normalise a raw STATUS value (bool, str, None, ...) to a bool.
        if isinstance(status, str):
            return status.strip().lower() not in false_strings
        return bool(status)

    def _assistant_status(pred_obj):
        # Return the raw STATUS of the first assistant message, or None.
        for msg in pred_obj.get('messages', []):
            if msg.get('role') == 'assistant':
                try:
                    content = msg['content']
                    if not isinstance(content, dict):
                        content = json.loads(content)
                    return content.get('STATUS', None)
                except (KeyError, TypeError, ValueError, AttributeError):
                    # Missing, non-JSON or non-object content counts as
                    # "no prediction", which becomes a negative label.
                    return None
        return None

    with open(pred_path, 'r', encoding='utf-8') as f_pred, open(gold_path, 'r', encoding='utf-8') as f_gold:
        # Blank lines (e.g. a trailing newline at end of file) are not records.
        pred_lines = [line for line in f_pred if line.strip()]
        gold_lines = [line for line in f_gold if line.strip()]

    # zip() would silently drop the surplus records of the longer file.
    if len(pred_lines) != len(gold_lines):
        raise ValueError(
            f"Prediction/gold line count mismatch: {len(pred_lines)} in "
            f"{pred_path!r} vs {len(gold_lines)} in {gold_path!r}"
        )

    pred_statuses = []
    gold_labels = []
    for pred_line, gold_line in zip(pred_lines, gold_lines):
        pred_obj = json.loads(pred_line)
        gold_obj = json.loads(gold_line)
        pred_statuses.append(_to_bool(_assistant_status(pred_obj)))
        gold_labels.append(bool(gold_obj.get('ADR_list', [])))
    return pred_statuses, gold_labels

# Input files: model predictions and gold annotations, aligned line by line.
pred_path = 'tac_outputs_test.jsonl'
gold_path = 'tac2017_adrs_test.jsonl'

pred_statuses, gold_labels = extract_statuses_and_gold(pred_path, gold_path)

# Confusion-matrix counts over the (prediction, gold) pairs.
TP = sum(1 for p, g in zip(pred_statuses, gold_labels) if p and g)
TN = sum(1 for p, g in zip(pred_statuses, gold_labels) if not p and not g)
FP = sum(1 for p, g in zip(pred_statuses, gold_labels) if p and not g)
FN = sum(1 for p, g in zip(pred_statuses, gold_labels) if g and not p)

# Each metric falls back to 0.0 when its denominator would be zero.
if TP + FP:
    precision = TP / (TP + FP)
else:
    precision = 0.0

if TP + FN:
    recall = TP / (TP + FN)
else:
    recall = 0.0

if precision + recall:
    f1 = 2 * precision * recall / (precision + recall)
else:
    f1 = 0.0

if gold_labels:
    accuracy = (TP + TN) / len(gold_labels)
else:
    accuracy = 0.0

# Report
print(f"Total examples: {len(gold_labels)}")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")

Total examples: 11
Accuracy:  0.0000
Precision: 0.0000
Recall:    0.0000
F1-score:  0.0000
