In [1]:
# ============================================================
# 0) Imports + OpenRouter client setup
# ============================================================
import os, json, re
from pathlib import Path

import pandas as pd
from nltk.metrics.scores import precision, recall, f_measure
from openai import OpenAI

# OpenRouter API key.
# Prefer a key already exported in the environment (e.g. `export OPENROUTER_API_KEY=...`)
# so no secret has to be typed into the notebook. `setdefault` only installs the
# placeholder when the variable is absent; it never overwrites a real key.
os.environ.setdefault("OPENROUTER_API_KEY", "set your key")

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)

# Model identifier sent with every chat-completion request in this notebook.
MODEL_NAME = "google/gemini-2.5-pro"


In [2]:
# ============================================================
# 1) Paths
# ============================================================
# Project root. Defaults to the original checkout location; set the
# IRAC_PROJECT_ROOT environment variable to run the notebook from another
# machine/checkout without editing any of the paths below.
PROJECT_ROOT = Path(
    os.environ.get("IRAC_PROJECT_ROOT", "/Users/rohanpersonal/git_projs/LLM_arg_paper")
)

# Folder holding the system prompt, the ground truth and the per-question CSVs.
DATASET_DIR = PROJECT_ROOT / "question_specific_dataset"

SYSTEM_PROMPT_PATH = DATASET_DIR / "system_prompt.txt"
GROUND_TRUTH_PATH = DATASET_DIR / "ground_truth_irac.json"

# One "superset rows" CSV per question id.
Q_SUPERSET_CSV = {
    "Q1": DATASET_DIR / "Questions_superset_rows - Q1.csv",
    "Q2": DATASET_DIR / "Questions_superset_rows - Q2.csv",
}

# Output folder for prediction JSON files and metrics.csv; created if missing.
OUTDIR = PROJECT_ROOT / "irac_outputs"
OUTDIR.mkdir(parents=True, exist_ok=True)


In [3]:
# ============================================================
# 2) Load system prompt + ground truth JSON
# ============================================================
# System prompt: whole file as UTF-8 text, surrounding whitespace removed.
with SYSTEM_PROMPT_PATH.open(encoding="utf-8") as prompt_file:
    SYSTEM_PROMPT = prompt_file.read().strip()

# Ground truth: the JSON document parsed into Python objects.
with GROUND_TRUTH_PATH.open(encoding="utf-8") as truth_file:
    GROUND_TRUTH = json.load(truth_file)


In [4]:
# ============================================================
# 3) Load superset rows CSVs (NOW including Hohfeldian columns)
#    Required columns:
#      - Row_ID
#      - treatytext
#      - actorholder
#      - action
#      - actoraffected
# ============================================================
def load_superset_rows(csv_path: Path) -> pd.DataFrame:
    """Read one superset-rows CSV and keep only the columns the prompt needs.

    Args:
        csv_path: Location of the CSV file to read.

    Returns:
        A copy of the loaded frame restricted to, and ordered as,
        Row_ID, treatytext, actorholder, action, actoraffected.

    Raises:
        ValueError: If any of those columns is absent; the message names the
            file and lists the missing columns.
    """
    wanted = ("Row_ID", "treatytext", "actorholder", "action", "actoraffected")
    frame = pd.read_csv(csv_path)

    # Collect absent columns in the same order as the required list.
    absent = []
    for column in wanted:
        if column not in frame.columns:
            absent.append(column)

    if absent:
        raise ValueError(f"{csv_path.name} missing columns: {absent}")

    return frame.loc[:, list(wanted)].copy()

# Treaty-row table for each question, keyed by the same ids as Q_SUPERSET_CSV.
SUPERSETS = {
    question_id: load_superset_rows(csv_file)
    for question_id, csv_file in Q_SUPERSET_CSV.items()
}

In [5]:
# ============================================================
# 4) Fact patterns + C1/C2 text (fill these; later load from file)
# ============================================================
# Per-question inputs, keyed by question id. Each entry holds:
#   - "fact_pattern": the scenario text inserted into the user prompt
#   - "C1" / "C2":    the two competing conclusions the model must build an IRAC for
QUESTIONS = {
    # Q1: benefit-sharing for DSI derived from MGRs collected in ABNJ.
    "Q1": {
        "fact_pattern": "An Australian biotechnology company conducts a private expedition to collect marine genetic resources (MGRs) in areas beyond national jurisdiction (ABNJ). The collected materials are sequenced in-house, and the resulting digital sequence information (DSI) is published in open-access repositories. The same company later relies on this DSI to develop a commercially valuable enzyme. No prior informed consent (PIC) or mutually agreed terms (MAT) were established, and no monetary or non-monetary benefit-sharing has occurred. The company argues that because the material was collected in ABNJ and only DSI is being used, rather than the physical samples, no benefit-sharing obligations apply under current international law. The competent authority of Australia has not taken regulatory steps to monitor or require disclosure of the activity. This scenario raises questions about the extent to which companies and States bear substantive obligations to ensure benefit-sharing when MGRs are accessed in ABNJ and subsequently used in commercial applications through DSI, particularly in the absence of clear jurisdictional triggers, prior agreements, or internationally coordinated oversight mechanisms.",
        "C1": "Benefit-sharing obligations apply to the commercial utilisation of DSI derived from ABNJ MGRs in this scenario, and Australia must ensure fair and equitable benefit-sharing (including non-monetary and, where applicable, monetary benefits).",
        "C2": "No benefit-sharing obligations apply to the commercial utilisation of DSI derived from ABNJ MGRs in this scenario, so Australia has no duty (on these facts) to ensure benefit-sharing from the company’s use of the published DSI.",
    },
    # Q2: access to MGRs and associated TK inside a State's EEZ without IPLC consent.
    "Q2": {
        "fact_pattern": "A university-affiliated researcher from Brazil applies for access to marine genetic resources (MGRs) located within the exclusive economic zone (EEZ) of Argentina. Both Brazil and Argentina are Parties to the Convention on Biological Diversity (CBD) and the Nagoya Protocol. The target species are found in an area inhabited by an Indigenous community with longstanding custodianship of the surrounding marine environment and traditional knowledge (TK) associated with the biological properties of the organisms. Argentina’s national competent authority grants the researcher an access permit for academic sample collection. The researcher does not engage with the Indigenous community, and neither the permitting process nor any domestic procedure includes consultation or prior informed consent (PIC) from the TK holders. No mutually agreed terms (MAT) are established, and no plans are made for benefit-sharing with the community. The samples are collected and exported to Brazil for analysis without further engagement. This fact pattern raises questions about the legal adequacy of State-issued permits that omit community-level consent when TK is clearly associated with marine genetic resources within the granting State’s jurisdiction.",
        "C1": "Access to MGRs and associated TK located within national jurisdiction requires engagement with Indigenous Peoples and Local Communities (IPLCs), including PIC/approval and involvement and the establishment of MAT, where TK is associated with the genetic resources and where IPLCs have recognised rights.",
        "C2": "Access to MGRs and associated TK within national jurisdiction does not require engagement with IPLCs in this scenario, because State authority over genetic resources is sufficient and IPLC PIC/approval is not legally required unless specifically mandated under domestic law.",
    },
}


In [6]:
# ============================================================
# 5) Prompt builder + LLM call + JSON parse
#    (NOW includes actor_holder / action / actor_affected per row)
# ============================================================
def build_user_prompt(q: str, fact_pattern: str, df_rows: pd.DataFrame, c1: str, c2: str) -> str:
    """Assemble the user message for one question.

    Args:
        q: Question id, emitted on the ``question_number:`` line.
        fact_pattern: Scenario text placed under ``FACT PATTERN:``.
        df_rows: Treaty rows; the columns Row_ID, actorholder, action,
            actoraffected and treatytext are read from every row.
        c1: Text of conclusion C1.
        c2: Text of conclusion C2.

    Returns:
        The prompt text with leading/trailing whitespace stripped. Each treaty
        row is rendered as five lines (id, three labelled mapping fields, treaty
        text) and rows are separated by one blank line.
    """
    def as_text(value) -> str:
        # Every cell is stringified and trimmed before it enters the prompt.
        return str(value).strip()

    row_blocks = []
    for _, row in df_rows.iterrows():
        row_lines = (
            as_text(row["Row_ID"]),
            f"actorholder: {as_text(row['actorholder'])}",
            f"action: {as_text(row['action'])}",
            f"actoraffected: {as_text(row['actoraffected'])}",
            as_text(row["treatytext"]),
        )
        row_blocks.append("\n".join(row_lines))
    treaty_rows = "\n\n".join(row_blocks)

    sections = [
        f"question_number: {q}",
        "",
        "FACT PATTERN:",
        f"{fact_pattern}",
        "",
        "TREATY ROWS (ID + Hohfeldian mapping + exact treaty text):",
        f"{treaty_rows}",
        "",
        "CONCLUSIONS:",
        f"C1: {c1}",
        f"C2: {c2}",
    ]
    return "\n".join(sections).strip()

def extract_json(text: str) -> dict:
    """Parse the JSON object that starts at the first ``{`` in ``text``.

    Decoding stops at the end of that object, so anything the model writes
    around it (markdown fences, introductory prose, trailing notes that happen
    to contain braces) is ignored. Inputs the previous "first ``{`` to last
    ``}``" approach parsed successfully yield the same result.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If ``text`` is empty/``None`` or contains no ``{``.
        json.JSONDecodeError: (a ``ValueError`` subclass) if the text starting
            at the first ``{`` is not valid JSON, e.g. truncated output.
    """
    if not isinstance(text, str) or "{" not in text:
        raise ValueError("No JSON object found in model output.")

    # raw_decode parses exactly one JSON value beginning at the given index
    # and does not require the remainder of the string to be empty.
    parsed, _end = json.JSONDecoder().raw_decode(text, text.index("{"))
    return parsed

def call_llm(system_prompt: str, user_prompt: str) -> dict:
    """Send one system+user exchange to MODEL_NAME and return the parsed JSON reply.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.

    Returns:
        The JSON object extracted from the first choice's message content.

    Raises:
        ValueError: If the response carries no choices or the first choice has
            no text content, or if no valid JSON object can be extracted from
            the content.
    """
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        # Extra fields added to the request body for OpenRouter.
        # NOTE(review): seed/temperature presumably aim at repeatable output and
        # "exclude" at keeping reasoning text out of the reply - confirm against
        # the OpenRouter documentation.
        extra_body={
            "seed": 42,
            "provider": {"allow_fallbacks": False},
            "reasoning": {"effort": "high", "exclude": True},
        },
    )

    # Fail with a readable message instead of an IndexError/TypeError further down
    # when the provider answers without any usable text.
    choices = getattr(resp, "choices", None)
    if not choices:
        raise ValueError(f"Model {MODEL_NAME} returned no choices.")

    content = choices[0].message.content
    if not content:
        raise ValueError(f"Model {MODEL_NAME} returned an empty message.")

    return extract_json(content)


In [7]:
# ============================================================
# 6) Validation + evaluation (rules + edges)
# ============================================================
def _require(condition: bool, message: str) -> None:
    """Raise AssertionError(message) unless ``condition`` holds (also active under ``python -O``)."""
    if not condition:
        raise AssertionError(message)


def validate_pred(pred: dict):
    """Check that a model prediction has the structure the evaluation relies on.

    Verified structure:
      * top level: ``question_number``, ``irac_C1``, ``irac_C2``
      * each ``irac_*`` object: ``issue``, ``rules_selected``, ``analysis``,
        ``conclusion``, ``edges_support``
      * each ``rules_selected`` entry: a mapping with ``id`` and ``text``
      * each ``edges_support`` entry: a string containing ``|`` and no spaces

    Args:
        pred: Parsed JSON prediction.

    Raises:
        AssertionError: On the first violation found, with a message naming it.
            Explicit raises are used rather than ``assert`` statements so the
            checks are not stripped in optimized mode.
    """
    _require(isinstance(pred, dict), f"Prediction must be a JSON object, got {type(pred).__name__}.")
    _require("question_number" in pred, "Prediction is missing 'question_number'.")

    for k in ("irac_C1", "irac_C2"):
        _require(k in pred, f"Prediction is missing '{k}'.")
        obj = pred[k]
        _require(isinstance(obj, dict), f"'{k}' must be a JSON object.")

        for needed in ("issue", "rules_selected", "analysis", "conclusion", "edges_support"):
            _require(needed in obj, f"'{k}' is missing '{needed}'.")

        for rr in obj["rules_selected"]:
            # Must be a mapping: a plain string containing "id"/"text" would pass a
            # bare `in` test and then break the rr["id"] lookup during evaluation.
            _require(
                isinstance(rr, dict) and "id" in rr and "text" in rr,
                f"'{k}' rules_selected entry needs 'id' and 'text': {rr!r}",
            )

        for e in obj["edges_support"]:
            _require(
                isinstance(e, str) and "|" in e and " " not in e,
                f"Edge has spaces or wrong format: {e}",
            )

def prf(gold_set: set, pred_set: set):
    """Score a predicted set against a gold set.

    Args:
        gold_set: Reference items.
        pred_set: Predicted items.

    Returns:
        ``(precision, recall, f1)`` computed by the NLTK scorers, each called
        as ``scorer(gold_set, pred_set)``. Any falsy scorer result (``None`` or
        zero) is reported as ``0.0``.
    """
    scores = []
    for scorer in (precision, recall, f_measure):
        value = scorer(gold_set, pred_set)
        scores.append(value if value else 0.0)
    return tuple(scores)

def eval_one(q: str, pred: dict, gt: dict) -> pd.DataFrame:
    """Score one question's prediction against the ground truth.

    For each conclusion (C1, C2) two rows are produced: one comparing the
    selected rule ids and one comparing the support edges.

    Args:
        q: Question id used to index ``gt``.
        pred: Prediction containing ``irac_C1`` / ``irac_C2`` objects.
        gt: Ground truth indexed as ``gt[q]["C1" | "C2"]``, each holding
            ``rules_selected`` and ``edges_support`` collections.

    Returns:
        DataFrame with columns question_number, conclusion, metric, precision,
        recall, f1, gold_size, pred_size (four rows).
    """
    records = []
    for label in ("C1", "C2"):
        predicted = pred[f"irac_{label}"]
        expected = gt[q][label]

        # Predicted ids/edges are whitespace-trimmed; gold values are used as given.
        rule_ids_pred = {entry["id"].strip() for entry in predicted["rules_selected"]}
        rule_ids_gold = set(expected["rules_selected"])
        edges_pred = {edge.strip() for edge in predicted["edges_support"]}
        edges_gold = set(expected["edges_support"])

        comparisons = (
            ("rules_selected", rule_ids_gold, rule_ids_pred),
            ("edges_support", edges_gold, edges_pred),
        )
        for metric_name, gold_items, pred_items in comparisons:
            p_val, r_val, f1_val = prf(gold_items, pred_items)
            records.append({
                "question_number": q,
                "conclusion": label,
                "metric": metric_name,
                "precision": p_val,
                "recall": r_val,
                "f1": f1_val,
                "gold_size": len(gold_items),
                "pred_size": len(pred_items),
            })

    return pd.DataFrame(records)


In [8]:
# ============================================================
# 7) Run Q1 + Q2, save prediction files + metrics.csv
# ============================================================
# Per-question metric frames, concatenated after the loop.
all_metrics = []

for q in ("Q1", "Q2"):
    question = QUESTIONS[q]
    user_prompt = build_user_prompt(
        q,
        question["fact_pattern"],
        SUPERSETS[q],
        question["C1"],
        question["C2"],
    )

    # Query the model, then check the structure before anything is written.
    pred = call_llm(SYSTEM_PROMPT, user_prompt)
    validate_pred(pred)

    pred_path = OUTDIR / f"pred_{q}.json"
    pred_path.write_text(json.dumps(pred, indent=2), encoding="utf-8")

    all_metrics.append(eval_one(q, pred, GROUND_TRUTH))

# One combined table for all questions, written next to the prediction files.
metrics = pd.concat(all_metrics, ignore_index=True)
metrics.to_csv(OUTDIR / "metrics.csv", index=False)

print("Saved outputs to:", OUTDIR)
display(metrics)


Saved outputs to: /Users/rohanpersonal/git_projs/LLM_arg_paper/irac_outputs


Unnamed: 0,question_number,conclusion,metric,precision,recall,f1,gold_size,pred_size
0,Q1,C1,rules_selected,1.0,0.666667,0.8,6,4
1,Q1,C1,edges_support,0.571429,0.4,0.470588,10,7
2,Q1,C2,rules_selected,0.75,0.6,0.666667,5,4
3,Q1,C2,edges_support,0.6,0.6,0.6,5,5
4,Q2,C1,rules_selected,1.0,0.454545,0.625,11,5
5,Q2,C1,edges_support,0.75,0.375,0.5,16,8
6,Q2,C2,rules_selected,1.0,0.454545,0.625,11,5
7,Q2,C2,edges_support,0.625,0.454545,0.526316,11,8
