# Gold Dataset Construction (12 Trials)

## Objective
After selecting the best upstream method from Study 2, extract all atomic eligibility criteria and manually annotate every field according to the OpenAD schema.


In [9]:
# Optional: install missing dependencies directly in the notebook
! pip install --quiet seaborn matplotlib pandas numpy scipy scikit-learn



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import cohen_kappa_score
import warnings

# Notebook-wide defaults.
# NOTE: the filter below suppresses every warning category for the whole session.
warnings.filterwarnings(action="ignore")
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6), "font.size": 10})


In [11]:
import os
import json
import textwrap
from typing import Any, Dict, List

# Project root. Set the OPENAD_BASE_DIR environment variable to run the notebook
# on another machine; when it is unset the original local path is used unchanged.
BASE_DIR = os.environ.get("OPENAD_BASE_DIR", "/Users/guoshuyan/Desktop/OpenAD")
# Directory holding one `<trial_id>.json` file per trial.
RAW_DATA = os.path.join(BASE_DIR, "Raw_data")
# Directory that receives the CSV files written by this notebook.
OUT_DIR = os.path.join(BASE_DIR, "Study2_outputs")
SEED_OUTPUT_CSV = os.path.join(OUT_DIR, "gold_seed_kimi_12_trials.csv")
ANNOTATION_TEMPLATE_CSV = os.path.join(OUT_DIR, "gold_annotation_template_kimi.csv")

# Create the output directory up front so the later `to_csv` calls cannot fail on it.
os.makedirs(OUT_DIR, exist_ok=True)

# Trials (ClinicalTrials.gov NCT identifiers) that make up the gold dataset.
GOLD_TRIAL_IDS = [
    "NCT01767311",
    "NCT02008357",
    "NCT02477800",
    "NCT02484547",
    "NCT03443973",
    "NCT03444870",
    "NCT03887455",
    "NCT04437511",
    "NCT04770220",
    "NCT04777396",
    "NCT05026866",
    "NCT05108922",
]

print(f"Configured {len(GOLD_TRIAL_IDS)} trials for gold dataset seed extraction.")


Configured 12 trials for gold dataset seed extraction.


In [12]:
def load_json(path: str) -> Dict[str, Any]:
    """Read a UTF-8 encoded JSON file and return the decoded content.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If ``path`` does not exist; the path is the only
            exception argument.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.loads(handle.read())
    raise FileNotFoundError(path)


def extract_study_obj(data: Dict[str, Any]) -> Dict[str, Any]:
    """Locate the single study record inside a loaded trial JSON document.

    Three layouts are recognised, checked in this order:
      1. a non-empty ``"studies"`` list -> its first element;
      2. a top-level ``"protocolSection"`` key -> ``data`` itself;
      3. a ``"FullStudiesResponse"`` wrapper -> the first ``FullStudies``
         entry's ``"Study"`` value.

    Returns:
        The study object, or ``{}`` when ``data`` is empty, matches none of the
        layouts, or the ``FullStudiesResponse`` wrapper cannot be traversed.
    """
    if not data:
        return {}

    has_study_list = (
        "studies" in data
        and isinstance(data["studies"], list)
        and bool(data["studies"])
    )
    if has_study_list:
        return data["studies"][0]

    if "protocolSection" in data:
        return data

    if "FullStudiesResponse" not in data:
        return {}

    # Any lookup failure inside the wrapper is treated as "no study".
    try:
        return data["FullStudiesResponse"]["FullStudies"][0]["Study"]
    except Exception:
        return {}


def get_eligibility_text(study: Dict[str, Any]) -> str:
    """Return the eligibility-criteria text stored in a study object.

    The camelCase path (``protocolSection``/``eligibilityModule``/
    ``eligibilityCriteria``) is tried first, then the PascalCase equivalent.
    The first path that can be traversed wins, even if its value is empty.

    Returns:
        The stored text, or ``""`` when ``study`` is empty, the stored value is
        falsy, or neither path can be traversed.
    """
    if not study:
        return ""

    key_paths = (
        ("protocolSection", "eligibilityModule", "eligibilityCriteria"),
        ("ProtocolSection", "EligibilityModule", "EligibilityCriteria"),
    )
    for key_path in key_paths:
        try:
            node = study
            for key in key_path:
                node = node[key]
            return node or ""
        except Exception:
            # This layout is not present; fall through to the next one.
            continue
    return ""


def split_inclusion_exclusion(raw_text: str) -> Dict[str, str]:
    """Split eligibility text into its inclusion and exclusion sections.

    The sections are located by the first case-insensitive occurrence of the
    phrases "inclusion criteria" and "exclusion criteria". Each returned section
    starts at its phrase (the header is kept) and is stripped of surrounding
    whitespace. Text that precedes the first located header is not returned.

    Args:
        raw_text: Free-text eligibility criteria. ``None`` is treated as "".

    Returns:
        ``{"inclusion": str, "exclusion": str}``. When neither phrase occurs the
        whole text is returned as the inclusion section.
    """
    text = (raw_text or "").replace("\r\n", "\n")
    lower = text.lower()
    inc_idx = lower.find("inclusion criteria")
    exc_idx = lower.find("exclusion criteria")

    inclusion = ""
    exclusion = ""

    if inc_idx != -1 and exc_idx != -1:
        if inc_idx < exc_idx:
            # Usual layout: inclusion header first, exclusion header second.
            inclusion = text[inc_idx:exc_idx].strip()
            exclusion = text[exc_idx:].strip()
        elif text[text.rfind("\n", 0, inc_idx) + 1 : inc_idx].strip() == "":
            # Reversed layout: the inclusion phrase opens its own line, so it is
            # a header that follows the exclusion section. Slicing
            # text[inc_idx:exc_idx] here would be empty and the inclusion
            # criteria would end up inside the exclusion section.
            exclusion = text[exc_idx:inc_idx].strip()
            inclusion = text[inc_idx:].strip()
        else:
            # The inclusion phrase is only mentioned mid-line inside the
            # exclusion section (e.g. "does not meet inclusion criteria").
            exclusion = text[exc_idx:].strip()
    elif inc_idx != -1:
        inclusion = text[inc_idx:].strip()
    elif exc_idx != -1:
        exclusion = text[exc_idx:].strip()
    else:
        inclusion = text.strip()

    return {"inclusion": inclusion, "exclusion": exclusion}


def build_segmentation_prompt(trial_id: str, criterion_type: str, section_text: str) -> str:
    """Build the prompt that asks the model to list atomic eligibility criteria.

    The instruction template is dedented *before* the section text is inserted.
    Dedenting after interpolation is unreliable: any line of ``section_text``
    that starts at column 0 removes the common margin, so the instructions would
    keep their source-code indentation. Inserting afterwards also leaves
    ``section_text`` itself untouched.

    Args:
        trial_id: Trial identifier quoted in the instructions.
        criterion_type: Section type quoted in the instructions
            ("inclusion" or "exclusion").
        section_text: Raw text of that eligibility section.

    Returns:
        The prompt, stripped of leading and trailing whitespace.
    """
    template = textwrap.dedent(
        """
        You are an expert clinical trial information extraction system.

        Task: From the following eligibility section of trial {trial_id}, extract a list of
        atomic eligibility criteria of type "{criterion_type}".

        "Atomic" means:
          - Each criterion is a single logical condition.
          - Do NOT merge multiple logical conditions into one sentence.
          - Do NOT split a single logically unified sentence into tiny fragments.

        Output format:
          Return ONLY a valid JSON array, with NO extra text.
          Each element must be an object with fields:
            - "trial_id" (string)
            - "criterion_type" (string: "inclusion" or "exclusion")
            - "source_sentence" (string: the extracted sentence as it appears, with minimal edits)

        Important:
          - Preserve negations ("no history of stroke") and constraints.
          - Do not invent criteria; only use text actually present.
          - If the section is empty, return [].

        Here is the section text:

        \"\"\"{section_text}\"\"\"
        """
    )
    # str.format only interprets braces in the template, so braces that occur in
    # the inserted values are passed through verbatim.
    return template.format(
        trial_id=trial_id,
        criterion_type=criterion_type,
        section_text=section_text,
    ).strip()


from openai import OpenAI


def call_kimi(prompt: str) -> str | None:
    api_key = os.getenv("KIMI_API_KEY") or os.getenv("MOONSHOT_API_KEY")
    if not api_key:
        raise RuntimeError("KIMI_API_KEY or MOONSHOT_API_KEY not set.")

    client = OpenAI(api_key=api_key, base_url="https://api.moonshot.ai/v1")

    try:
        completion = client.chat.completions.create(
            model="kimi-k2-turbo-preview",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are Kimi, an AI assistant that extracts atomic eligibility criteria."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.0,
        )
        return completion.choices[0].message.content.strip()
    except Exception as exc:
        print(f"⚠️ Kimi API error: {exc}")
        return None


def extract_json_array(text: str):
    """Parse the JSON array embedded in a model reply.

    Leading and trailing backticks are removed when the reply opens with a code
    fence; the span from the first ``[`` to the last ``]`` is then decoded.

    Returns:
        The decoded value of that span.

    Raises:
        ValueError: If no ``[`` followed later by a ``]`` exists.
            ``json.JSONDecodeError`` (a ``ValueError`` subclass) is raised when
            the span is not valid JSON.
    """
    cleaned = text.strip()
    if cleaned[:3] == "```":
        cleaned = cleaned.strip("`")

    open_pos = cleaned.find("[")
    close_pos = cleaned.rfind("]")
    # Valid only when an opening bracket exists and a closing one comes after it.
    if not (0 <= open_pos < close_pos):
        raise ValueError("No JSON array found in model output.")

    return json.loads(cleaned[open_pos : close_pos + 1])


def llm_output_to_df(model: str, trial_id: str, criterion_type: str, raw_output: str) -> pd.DataFrame:
    """Convert a raw model reply into one DataFrame row per extracted criterion.

    Args:
        model: Label written to the ``model`` column of every row.
        trial_id: Used when an element has no (or an empty/null) ``trial_id``.
        criterion_type: Used when an element has no (or an empty/null)
            ``criterion_type``.
        raw_output: Model reply containing a JSON array of objects.

    Returns:
        A DataFrame with the columns ``model``, ``trial_id``,
        ``criterion_type`` (lower-cased) and ``source_sentence`` (stripped).
        The columns are present even when no usable element was found.
        Elements that are not objects, or whose ``source_sentence`` is missing,
        blank or not a string, are skipped.

    Raises:
        ValueError: If no JSON list can be extracted from ``raw_output``.
    """
    columns = ["model", "trial_id", "criterion_type", "source_sentence"]

    arr = extract_json_array(raw_output)
    if not isinstance(arr, list):
        raise ValueError("Expected a JSON list from model.")

    rows = []
    for obj in arr:
        if not isinstance(obj, dict):
            continue
        sent = obj.get("source_sentence")
        # A non-string sentence is as unusable as a missing one; skip just this
        # element instead of failing the whole section on `.strip()`.
        if not isinstance(sent, str) or not sent.strip():
            continue
        # `dict.get(key, default)` returns None for an explicit JSON null, so
        # fall back with `or` to keep the caller-supplied values in that case.
        ctype = obj.get("criterion_type") or criterion_type or ""
        rows.append(
            {
                "model": model,
                "trial_id": obj.get("trial_id") or trial_id,
                "criterion_type": str(ctype).lower(),
                "source_sentence": sent.strip(),
            }
        )
    # Passing `columns` keeps the schema stable when `rows` is empty, so
    # callers can concatenate and de-duplicate on these columns safely.
    return pd.DataFrame(rows, columns=columns)



In [None]:
def run_kimi_seed_extraction(trial_ids: List[str]) -> pd.DataFrame:
    """Extract atomic eligibility criteria for each trial with Kimi.

    For every trial the JSON file ``<RAW_DATA>/<trial_id>.json`` is loaded, its
    eligibility text is split into inclusion and exclusion sections, and each
    non-empty section is sent to Kimi. Progress and per-section failures are
    printed; a failed section is skipped and the run continues.

    Args:
        trial_ids: Trial identifiers to process, in order.

    Returns:
        A DataFrame with the columns ``model``, ``trial_id``,
        ``criterion_type`` and ``source_sentence``, de-duplicated on the last
        three. It is empty (with those columns) when nothing was extracted.

    Raises:
        FileNotFoundError: If a trial's JSON file does not exist.
    """
    seed_columns = ["model", "trial_id", "criterion_type", "source_sentence"]
    all_outputs: List[pd.DataFrame] = []

    for tid in trial_ids:
        json_path = os.path.join(RAW_DATA, f"{tid}.json")
        data = load_json(json_path)
        study = extract_study_obj(data)
        eligibility = get_eligibility_text(study)
        if not eligibility.strip():
            # Without this message the trial would be missing from the result
            # with no trace in the cell output.
            print(f"⚠️ {tid}: no eligibility text found in {json_path}")
            continue
        sections = split_inclusion_exclusion(eligibility)

        for ctype in ("inclusion", "exclusion"):
            section_text = sections.get(ctype, "").strip()
            if not section_text:
                continue

            prompt = build_segmentation_prompt(tid, ctype, section_text)
            raw_output = call_kimi(prompt)
            if not raw_output:
                print(f"⚠️ {tid} / {ctype}: no output returned by Kimi")
                continue

            try:
                df_out = llm_output_to_df("kimi", tid, ctype, raw_output)
            except Exception as exc:
                print(f"⚠️ {tid} / {ctype} parsing failed: {exc}")
                continue

            print(f"✓ {tid} / {ctype}: {len(df_out)} criteria")
            # Keep only frames that contain rows: an empty frame may carry no
            # columns, and if every frame were like that the `drop_duplicates`
            # call below would raise KeyError on the missing subset columns.
            if not df_out.empty:
                all_outputs.append(df_out)

    if not all_outputs:
        return pd.DataFrame(columns=seed_columns)

    combined = pd.concat(all_outputs, ignore_index=True)
    combined = combined.drop_duplicates(
        subset=["trial_id", "criterion_type", "source_sentence"]
    ).reset_index(drop=True)
    return combined


def create_annotation_template(seed_df: pd.DataFrame) -> pd.DataFrame:
    """Turn seed criteria into a blank sheet for manual annotation.

    A copy of ``seed_df`` receives a leading ``criterion_id`` column
    (``KIMI-0001``, ``KIMI-0002``, ... in row order) and one empty-string column
    per annotation field that is not already present. ``seed_df`` itself is not
    modified.

    Returns:
        A DataFrame restricted to ``criterion_id``, ``trial_id``,
        ``criterion_type``, ``source_sentence`` and the annotation fields, in
        that order.
    """
    annotation_fields = [
        "ad_domain",
        "clinical_concept",
        "operator",
        "value_lower",
        "value_upper",
        "units",
        "diagnostic_framework",
        "severity_stage",
        "temporal_scope",
        "evidence_type",
        "certainty",
    ]
    identity_columns = ["criterion_id", "trial_id", "criterion_type", "source_sentence"]

    template = seed_df.copy()
    criterion_ids = [f"KIMI-{number:04d}" for number in range(1, len(template) + 1)]
    template.insert(0, "criterion_id", criterion_ids)

    # Existing annotation columns are kept as they are; only absent ones are added.
    absent_fields = [name for name in annotation_fields if name not in template.columns]
    for name in absent_fields:
        template[name] = ""

    return template[identity_columns + annotation_fields]



In [17]:
# SECURITY: an API key must never be written into the notebook. The literal key
# that used to be assigned here has been removed; because it was saved in this
# file it should be treated as compromised and revoked with the provider.
#
# The key is taken from the environment when available; otherwise it is asked
# for interactively without echoing it or storing it in the notebook.
from getpass import getpass

KIMI_KEY = (
    os.getenv("KIMI_API_KEY")
    or os.getenv("MOONSHOT_API_KEY")
    or getpass("Kimi / Moonshot API key: ")
)
if not KIMI_KEY:
    raise RuntimeError("KIMI_API_KEY or MOONSHOT_API_KEY not set.")

os.environ["KIMI_API_KEY"] = KIMI_KEY
# Only confirm that a key is configured; never print the key itself.
print(" KIMI_API_KEY     =", "✔")

 KIMI_API_KEY     = ✔


In [18]:
# Run the seed extraction (one Kimi request per non-empty trial section) and
# write both CSV files. Re-running this cell repeats every API call.
kimi_seed_df = run_kimi_seed_extraction(GOLD_TRIAL_IDS)
kimi_seed_df.to_csv(SEED_OUTPUT_CSV, index=False)

# The annotation sheet does not carry the `model` column.
seed_without_model = kimi_seed_df.drop(columns=["model"], errors="ignore")
annotation_template_df = create_annotation_template(seed_without_model)
annotation_template_df.to_csv(ANNOTATION_TEMPLATE_CSV, index=False)

print(f"Saved seed extraction to {SEED_OUTPUT_CSV}")
print(f"Saved annotation template to {ANNOTATION_TEMPLATE_CSV}")


✓ NCT01767311 / inclusion: 30 criteria
✓ NCT01767311 / exclusion: 12 criteria
✓ NCT02008357 / inclusion: 5 criteria
✓ NCT02008357 / exclusion: 9 criteria
✓ NCT02477800 / inclusion: 7 criteria
✓ NCT02477800 / exclusion: 11 criteria
✓ NCT02484547 / inclusion: 7 criteria
✓ NCT02484547 / exclusion: 11 criteria
✓ NCT03443973 / inclusion: 9 criteria
✓ NCT03443973 / exclusion: 23 criteria
✓ NCT03444870 / inclusion: 9 criteria
✓ NCT03444870 / exclusion: 23 criteria
✓ NCT03887455 / inclusion: 28 criteria
✓ NCT03887455 / exclusion: 26 criteria
✓ NCT04437511 / inclusion: 5 criteria
✓ NCT04437511 / exclusion: 2 criteria
✓ NCT04770220 / inclusion: 6 criteria
✓ NCT04770220 / exclusion: 7 criteria
✓ NCT04777396 / inclusion: 7 criteria
✓ NCT04777396 / exclusion: 5 criteria
✓ NCT05026866 / inclusion: 6 criteria
✓ NCT05026866 / exclusion: 11 criteria
✓ NCT05108922 / inclusion: 8 criteria
✓ NCT05108922 / exclusion: 7 criteria
Saved seed extraction to /Users/guoshuyan/Desktop/OpenAD/Study2_outputs/gold_se