In [1]:
import nltk
from nltk.metrics.scores import precision, recall, f_measure

# --- 1. Define the Ground Truth and Predicted Relations ---
# Relations are encoded as "<source> -> <target>" strings. Entries are listed
# one per line and grouped by whether they occur in both sets.

# Ground Truth Relations (Gold Set)
gold_set = {
    # Present in the predicted set as well
    "BBNJ_M043 -> C1",
    "BBNJ_M044 -> C1",
    "BBNJ_M045 -> C1",
    "BBNJ_M047 -> C1",
    "I1 -> C1",
    # Present only in the gold set
    "BBNJ_M035 -> C1",
    "BBNJ_M046 -> C1",
    "BBNJ_M136 -> C1",
    "I2 -> C1",
}

# LLM Generated Relations (Predicted/Test Set)
pred_set = {
    # Present in the gold set as well
    "BBNJ_M043 -> C1",
    "BBNJ_M044 -> C1",
    "BBNJ_M045 -> C1",
    "BBNJ_M047 -> C1",
    "I1 -> C1",
    # Present only in the predicted set
    "BBNJ_M050 -> C1",
    "I3 -> C1",
}

# --- 2. Compute Metrics using NLTK ---
# NLTK's scorers take (reference, test). They return None instead of a number
# when the relevant set is empty (precision: empty test set, recall: empty
# reference set, f_measure: either). `or 0.0` maps that to 0.0 so the ":.4f"
# formatting in the report below cannot raise TypeError.

# Precision: How many of the predicted items are in the gold set (TP / (TP + FP))
p_nltk = precision(gold_set, pred_set) or 0.0

# Recall: How many of the gold items are captured by the predicted set (TP / (TP + FN))
r_nltk = recall(gold_set, pred_set) or 0.0

# F-Measure: Harmonic mean of precision and recall
f1_nltk = f_measure(gold_set, pred_set) or 0.0

# --- 3. Manual Verification (For Comparison) ---
TP = len(gold_set.intersection(pred_set))  # True Positives (5 for the sample data)
FP = len(pred_set.difference(gold_set))     # False Positives (2 for the sample data)
FN = len(gold_set.difference(pred_set))     # False Negatives (4 for the sample data)

# Guard the denominators so an empty gold or predicted set yields 0.0 rather
# than ZeroDivisionError.
p_manual = TP / (TP + FP) if (TP + FP) else 0.0
r_manual = TP / (TP + FN) if (TP + FN) else 0.0

# Turn the side-by-side comparison into an actual check instead of relying on
# someone eyeballing the printed numbers.
assert abs(p_manual - p_nltk) < 1e-12 and abs(r_manual - r_nltk) < 1e-12, (
    "manual precision/recall disagree with NLTK"
)

print("--- Counts ---")
print(f"TP: {TP}, FP: {FP}, FN: {FN}")

print("\n--- NLTK Results ---")
print(f"Precision (NLTK): {p_nltk:.4f}")
print(f"Recall (NLTK): {r_nltk:.4f}")
print(f"F1-Score (NLTK): {f1_nltk:.4f}")

print("\n--- Manual Verification ---")
print(f"Precision (Manual): {p_manual:.4f}")
print(f"Recall (Manual): {r_manual:.4f}")

--- Counts ---
TP: 5, FP: 2, FN: 4

--- NLTK Results ---
Precision (NLTK): 0.7143
Recall (NLTK): 0.5556
F1-Score (NLTK): 0.6250

--- Manual Verification ---
Precision (Manual): 0.7143
Recall (Manual): 0.5556


In [1]:
from nltk.metrics.scores import precision, recall, f_measure

# ---------- Example gold/pred edge lists in structured JSON form ----------
# Every sample edge has target "C1" and relation "support", so each list is
# generated from its source node IDs; the resulting records keep the
# from / to / relation key layout and the original ordering.
gold_edges = [
    {"from": source, "to": "C1", "relation": "support"}
    for source in (
        "BBNJ_M043",
        "I1",
        "BBNJ_M044",
        "BBNJ_M047",
        "BBNJ_M045",
        "BBNJ_M035",
        "BBNJ_M046",
        "BBNJ_M136",
        "I2",
    )
]

pred_edges = [
    {"from": source, "to": "C1", "relation": "support"}
    for source in (
        "BBNJ_M043",
        "I1",
        "BBNJ_M044",
        "BBNJ_M047",
        "BBNJ_M045",
        "BBNJ_M050",
        "I3",
    )
]

# ---------- Normalization: dict edge -> hashable tuple ----------
def edge_to_tuple(e: dict) -> tuple:
    """Convert an edge record into a hashable ``(from, to, relation)`` tuple.

    Args:
        e: Mapping with required keys ``"from"`` and ``"to"`` and an optional
            ``"relation"`` key.

    Returns:
        A 3-tuple of strings. Both endpoints are stringified and stripped of
        surrounding whitespace; the relation is stripped and lower-cased, and
        defaults to ``"support"`` when the key is absent.

    Raises:
        KeyError: If ``"from"`` or ``"to"`` is missing from ``e``.
    """
    # Endpoints are only trimmed; their casing is left untouched.
    source, target = (str(e[key]).strip() for key in ("from", "to"))
    relation = str(e.get("relation", "support")).strip().lower()
    return (source, target, relation)

# Normalize both edge lists into sets of hashable tuples so they can be
# compared with plain set operations.
gold_set = {edge_to_tuple(e) for e in gold_edges}
pred_set = {edge_to_tuple(e) for e in pred_edges}

# ---------- Metrics (NLTK) ----------
# NLTK's scorers take (reference, test). They return None instead of a number
# when the relevant set is empty (precision: empty test set, recall: empty
# reference set, f_measure: either). `or 0.0` maps that to 0.0, matching the
# manual fallback below and keeping the ":.4f" formatting from raising TypeError.
p_nltk = precision(gold_set, pred_set) or 0.0
r_nltk = recall(gold_set, pred_set) or 0.0
f1_nltk = f_measure(gold_set, pred_set) or 0.0

# ---------- Manual verification ----------
TP = len(gold_set.intersection(pred_set))  # edges in both sets
FP = len(pred_set.difference(gold_set))    # predicted but not in gold
FN = len(gold_set.difference(pred_set))    # in gold but not predicted

# Empty denominators fall back to 0.0 instead of raising ZeroDivisionError.
p_manual = TP / (TP + FP) if (TP + FP) else 0.0
r_manual = TP / (TP + FN) if (TP + FN) else 0.0

# Turn the side-by-side comparison into an actual check instead of relying on
# someone eyeballing the printed numbers.
assert abs(p_manual - p_nltk) < 1e-12 and abs(r_manual - r_nltk) < 1e-12, (
    "manual precision/recall disagree with NLTK"
)

print("--- Counts ---")
print(f"TP: {TP}, FP: {FP}, FN: {FN}")

print("\n--- NLTK Results ---")
print(f"Precision (NLTK): {p_nltk:.4f}")
print(f"Recall (NLTK): {r_nltk:.4f}")
print(f"F1-Score (NLTK): {f1_nltk:.4f}")

print("\n--- Manual Verification ---")
print(f"Precision (Manual): {p_manual:.4f}")
print(f"Recall (Manual): {r_manual:.4f}")


--- Counts ---
TP: 5, FP: 2, FN: 4

--- NLTK Results ---
Precision (NLTK): 0.7143
Recall (NLTK): 0.5556
F1-Score (NLTK): 0.6250

--- Manual Verification ---
Precision (Manual): 0.7143
Recall (Manual): 0.5556


In [2]:
# Inspect the normalized gold edges as (from, to, relation) tuples.
gold_set

{('BBNJ_M035', 'C1', 'support'),
 ('BBNJ_M043', 'C1', 'support'),
 ('BBNJ_M044', 'C1', 'support'),
 ('BBNJ_M045', 'C1', 'support'),
 ('BBNJ_M046', 'C1', 'support'),
 ('BBNJ_M047', 'C1', 'support'),
 ('BBNJ_M136', 'C1', 'support'),
 ('I1', 'C1', 'support'),
 ('I2', 'C1', 'support')}

In [3]:
# Inspect the normalized predicted edges as (from, to, relation) tuples.
pred_set

{('BBNJ_M043', 'C1', 'support'),
 ('BBNJ_M044', 'C1', 'support'),
 ('BBNJ_M045', 'C1', 'support'),
 ('BBNJ_M047', 'C1', 'support'),
 ('BBNJ_M050', 'C1', 'support'),
 ('I1', 'C1', 'support'),
 ('I3', 'C1', 'support')}