# Study 1: Direct Raw JSON Extraction

## Objective
Test which method can extract eligibility criteria as **raw text** and compare against downloaded JSON from ClinicalTrials.gov.

**No parsing into structured fields - just raw text extraction!**

In [1]:
# ================================================================
# Cell 0 ‚Äì Imports, base paths, trial IDs
# ================================================================
import os
import json
import time
import re
from typing import Dict, Any, List, Set
from collections import OrderedDict

import requests
import pandas as pd
from difflib import SequenceMatcher

import numpy as np
import matplotlib.pyplot as plt

pd.set_option("display.max_colwidth", 200)

# ---- Your trial IDs (20 AD clinical trials) ----
# Import centralized trial list
import sys
sys.path.append('/Users/guoshuyan/Desktop/OpenAD')
from trial_ids_20 import TRIAL_IDS_20

TRIAL_IDS = TRIAL_IDS_20  # All 20 AD clinical trials

# Alternative: Define directly if import doesn't work
# TRIAL_IDS = [
#     # Original 12 trials
#     "NCT01767311", "NCT02008357", "NCT02477800", "NCT02484547",
#     "NCT03443973", "NCT03444870", "NCT03887455", "NCT04437511",
#     "NCT04770220", "NCT04777396", "NCT05026866", "NCT05108922",
#     # Additional 8 trials (update after downloading)
#     "NCT04592341", "NCT04619420", "NCT04828122", "NCT04947636",
#     "NCT05014540", "NCT05269394", "NCT05310008", "NCT05531656"
# ]

# ---- Folders ----
BASE_DIR   = "/Users/guoshuyan/Desktop/OpenAD"
RAW_DATA   = os.path.join(BASE_DIR, "Raw_data")   # new v2 downloads
RAW_JSON   = os.path.join(BASE_DIR, "Raw_json")   # your existing "gold"

os.makedirs(RAW_DATA, exist_ok=True)
os.makedirs(RAW_JSON, exist_ok=True)

def text_similarity(a: str, b: str) -> float:
    """Character-level similarity using SequenceMatcher (0‚Äì1)."""
    if not a and not b:
        return 1.0
    return SequenceMatcher(None, a, b).ratio()


In [None]:
# ================================================================
# Cell 2 ‚Äì Imports, paths, trial IDs
# ================================================================
import os
import json
from typing import Any, Dict, List
from collections import OrderedDict

import numpy as np
import pandas as pd
from difflib import SequenceMatcher
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer

pd.set_option("display.max_colwidth", 200)

# --- Paths on your Mac ---
BASE_DIR  = "/Users/guoshuyan/Desktop/OpenAD"
RAW_JSON  = os.path.join(BASE_DIR, "Raw_json")   # your curated / gold JSONs
RAW_DATA  = os.path.join(BASE_DIR, "Raw_data")   # v2 API JSON downloads

# --- Make sure dirs exist (won't overwrite anything) ---
os.makedirs(RAW_JSON, exist_ok=True)
os.makedirs(RAW_DATA, exist_ok=True)

# --- Trial IDs you want to compare ---
TRIAL_IDS = [
    "NCT01767311",
    "NCT02008357",
    "NCT02477800",
    "NCT02484547",
    "NCT03443973",
    "NCT03444870",
    "NCT03887455",
    "NCT04437511",
    "NCT04770220",
    "NCT04777396",
    "NCT05026866",
    "NCT05108922",
]


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ================================================================
# Cell 3 ‚Äì Utilities: load JSON, flatten to keypath:text, etc.
# ================================================================
def load_json_if_exists(path: str) -> Dict[str, Any]:
    """Load JSON file or return {} if it doesn't exist / fails."""
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"‚ö†Ô∏è Error loading {path}: {e}")
        return {}


def flatten_json(obj: Any, prefix: str = "") -> List[str]:
    """Recursively flatten a JSON object into structural lines."""
    lines: List[str] = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            new_prefix = f"{prefix}.{k}" if prefix else k
            lines.extend(flatten_json(v, new_prefix))
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            new_prefix = f"{prefix}[{i}]"
            lines.extend(flatten_json(item, new_prefix))
    else:
        val = str(obj).replace("
", " ")
        lines.append(f"{prefix}: {val}")
    return lines


def json_to_struct_text(json_obj: Dict[str, Any]) -> str:
    """Convert a JSON object into a single structural text string."""
    if not json_obj:
        return ""
    return "
".join(flatten_json(json_obj))


def char_similarity(a: str, b: str) -> float:
    """Simple character-level similarity (0‚Äì1)."""
    if not a and not b:
        return 1.0
    return SequenceMatcher(None, a, b).ratio()


def _value_to_text(value: Any) -> str:
    """Best-effort conversion of nested JSON values into readable text."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        for key in ("textblock", "textBlock", "TextBlock"):
            text = value.get(key)
            if isinstance(text, str):
                return text
        parts: List[str] = []
        for v in value.values():
            text = _value_to_text(v)
            if text:
                parts.append(text)
        return " 
".join(parts)
    if isinstance(value, (list, tuple, set)):
        parts: List[str] = []
        for item in value:
            text = _value_to_text(item)
            if text:
                parts.append(text)
        return " 
".join(parts)
    return str(value)


def _normalize_text(text: str) -> str:
    text = (text or "").replace("
", "
")
    text = re.sub(r"
{3,}", "

", text)
    text = re.sub(r"[	 ]+", " ", text)
    return text.strip()


def extract_eligibility_textblock(json_obj: Dict[str, Any]) -> str:
    module = json_obj.get("protocolSection", {}).get("eligibilityModule", {})
    return _normalize_text(_value_to_text(module.get("eligibilityCriteria")))


def extract_eligibility_profile(json_obj: Dict[str, Any]) -> str:
    module = json_obj.get("protocolSection", {}).get("eligibilityModule", {})
    pieces: List[str] = []
    textblock = _normalize_text(_value_to_text(module.get("eligibilityCriteria")))
    if textblock:
        pieces.append(f"Eligibility criteria:
{textblock}")
    field_labels = {
        "minimumAge": "Minimum age",
        "maximumAge": "Maximum age",
        "sex": "Sex",
        "stdAges": "Standard ages",
        "acceptsHealthyVolunteers": "Accepts healthy volunteers",
        "studyPopulation": "Study population",
        "samplingMethod": "Sampling method",
    }
    for key, label in field_labels.items():
        value = _normalize_text(_value_to_text(module.get(key)))
        if value:
            pieces.append(f"{label}: {value}")
    return "

".join(pieces).strip()


EXTRACTION_METHODS = OrderedDict({
    "flattened_json": json_to_struct_text,
    "eligibility_textblock": extract_eligibility_textblock,
    "eligibility_profile": extract_eligibility_profile,
})

EXTRACTION_LABELS = {
    "flattened_json": "Flattened JSON",
    "eligibility_textblock": "Eligibility text block",
    "eligibility_profile": "Eligibility + profile fields",
}


In [None]:
# ================================================================
# Cell 5 ‚Äì Compute similarities for all trials and extraction methods
# ================================================================
rows: List[Dict[str, Any]] = []
method_order = list(EXTRACTION_METHODS.keys())

for tid in TRIAL_IDS:
    gold_path = os.path.join(RAW_JSON, f"{tid}.json")
    api_path = os.path.join(RAW_DATA, f"{tid}.json")

    gold_json = load_json_if_exists(gold_path)
    api_json = load_json_if_exists(api_path)

    if not gold_json and not api_json:
        print(f"‚ö†Ô∏è Skipping {tid}: both gold and API JSON are empty / missing.")
        continue

    gold_text = json_to_struct_text(gold_json)
    gold_len = len(gold_text)

    for method_name, extractor in EXTRACTION_METHODS.items():
        method_text = extractor(api_json)
        row: Dict[str, Any] = {
            "trial_id": tid,
            "method": method_name,
            "method_label": EXTRACTION_LABELS.get(method_name, method_name),
            "gold_len": gold_len,
            "method_len": len(method_text),
            "char_sim": char_similarity(gold_text, method_text),
        }
        for name, model in MODELS.items():
            e_gold = embed(model, gold_text)
            e_method = embed(model, method_text)
            row[f"{name}_cos"] = cosine(e_gold, e_method)
        rows.append(row)

sim_df = pd.DataFrame(rows)
if sim_df.empty:
    print("No trials compared ‚Äì check that JSON files exist in both folders.")
else:
    sim_df["len_ratio_method_over_gold"] = np.where(
        sim_df["gold_len"].astype(float) > 0,
        sim_df["method_len"].astype(float) / sim_df["gold_len"].astype(float),
        np.nan,
    )
    sim_df["method"] = pd.Categorical(sim_df["method"], categories=method_order, ordered=True)
    sim_df = sim_df.sort_values(["trial_id", "method"]).reset_index(drop=True)
    print("üîç Multi-model embedding similarity summary (per trial √ó method):")
    display(sim_df)


In [None]:
# ================================================================
# Cell 6 ‚Äì Visualize similarities across methods
# ================================================================
if sim_df.empty:
    print("No trials compared ‚Äì run the previous cell first.")
else:
    df = sim_df.copy()
    method_order = list(EXTRACTION_METHODS.keys())
    method_labels = [EXTRACTION_LABELS[m] for m in method_order]
    metrics = ["char_sim"] + [f"{name}_cos" for name in MODELS]
    metric_titles = {"char_sim": "Character-level similarity"}
    for name in MODELS:
        metric_titles[f"{name}_cos"] = f"Embedding cosine similarity ‚Äì {name}"

    fig, axes = plt.subplots(len(metrics), 1, figsize=(14, 4 * len(metrics)), sharex=True)
    if len(metrics) == 1:
        axes = [axes]

    for ax, metric in zip(axes, metrics):
        pivot = df.pivot(index="trial_id", columns="method", values=metric)
        pivot = pivot[method_order]
        pivot.columns = method_labels
        pivot.plot(kind="bar", ax=ax)
        ax.set_title(metric_titles.get(metric, metric), fontsize=14)
        ax.set_ylabel("Similarity")
        ax.set_ylim(0, 1)
        ax.legend(title="Extraction method")

    axes[-1].set_xlabel("Trial ID")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

    # optional: visualize method/gold length ratios
    length_pivot = df.pivot(index="trial_id", columns="method", values="len_ratio_method_over_gold")
    length_pivot = length_pivot[method_order]
    length_pivot.columns = method_labels

    fig, ax = plt.subplots(figsize=(14, 4))
    length_pivot.plot(kind="bar", ax=ax)
    ax.set_title("Method length vs. curated gold (ratio)", fontsize=14)
    ax.set_ylabel("Length ratio (method / gold)")
    ax.legend(title="Extraction method")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


üîç Multi-model embedding similarity summary:


Unnamed: 0,trial_id,gold_len,api_len,char_sim,miniLM_cos,mpnet_cos,bge_base_cos
0,NCT01767311,104027,104027,0.999981,1.0,1.0,1.0
1,NCT02008357,844418,844418,0.999998,1.0,1.0,1.0
2,NCT02477800,765203,765198,0.999994,1.0,1.0,1.0
3,NCT02484547,742903,742903,0.999997,1.0,1.0,1.0
4,NCT03443973,534879,534879,0.999996,1.0,1.0,1.0
5,NCT03444870,869861,869861,0.999998,1.0,1.0,1.0
6,NCT03887455,154889,154889,0.999987,1.0,1.0,1.0
7,NCT04437511,497698,497698,0.999996,1.0,1.0,1.0
8,NCT04770220,62594,351362,0.294983,1.0,0.999981,0.999992
9,NCT04777396,196339,196339,0.99999,1.0,1.0,1.0


In [None]:
# ================================================================
# Publication-style MARKDOWN table for sim_df
# ================================================================
import numpy as np
import pandas as pd


def _bar(value, max_value=1.0, width=10):
    """Unicode bar for Markdown (no colors)."""
    if pd.isna(value):
        return ""
    value = max(0.0, min(value, max_value))
    filled = int(round((value / max_value) * width))
    return "‚ñÆ" * filled + "‚ñØ" * (width - filled)


def _fmt(v, nd=3):
    return "" if pd.isna(v) else f"{v:.{nd}f}"


def _fmt_int(v):
    return "" if pd.isna(v) else f"{int(v):,}"


def make_similarity_markdown(sim_df: pd.DataFrame, max_rows: int | None = None) -> str:
    """
    Build a clean Markdown table string from sim_df.
    Columns:
    Rank | trial_id | method | gold_len | method_len | ratio | len_bar | char_sim | <embeds...> | mean_cos
    """
    df = sim_df.copy().sort_values(["trial_id", "method"]).reset_index(drop=True)

    embedding_cols = [c for c in df.columns if c.endswith("_cos")]
    if embedding_cols:
        df["mean_cos"] = df[embedding_cols].mean(axis=1)
    else:
        df["mean_cos"] = np.nan

    df["len_ratio_method_over_gold"] = np.where(
        df["gold_len"].astype(float) > 0,
        df["method_len"].astype(float) / df["gold_len"].astype(float),
        np.nan,
    )

    df["rank"] = (-df["mean_cos"]).rank(method="min").astype(int)

    header_cols = [
        "Rank",
        "Trial ID",
        "Method",
        "Gold len",
        "Method len",
        "Method/Gold",
        "Len bar",
        "Char sim",
    ] + [c.replace("_cos", "") for c in embedding_cols] + ["Mean cos"]

    rows = []
    it = df if max_rows is None else df.head(max_rows)
    for _, r in it.iterrows():
        ratio = r.get("len_ratio_method_over_gold", np.nan)
        row = [
            str(r["rank"]),
            str(r["trial_id"]),
            str(r.get("method_label", r["method"])),
            _fmt_int(r["gold_len"]),
            _fmt_int(r["method_len"]),
            _fmt(ratio, 3),
            _bar(min(ratio, 2.0) if pd.notna(ratio) else np.nan, max_value=2.0, width=10),
            _fmt(r.get("char_sim", np.nan), 3),
        ]
        for c in embedding_cols:
            row.append(_fmt(r[c], 3))
        row.append(_fmt(r["mean_cos"], 3))
        rows.append(row)

    aligns = ["---:", ":---", ":---", "---:", "---:", "---:", ":---:", "---:"]         + ["---:"] * len(embedding_cols) + ["---:"]

    md = []
    md.append("**Similarity between curated (Raw_json) and API-derived extractions (multiple methods)**  ")
    md.append("")
    md.append("| " + " | ".join(header_cols) + " |")
    md.append("| " + " | ".join(aligns) + " |")
    for row in rows:
        md.append("| " + " | ".join(row) + " |")

    md.append("
_Legend:_ Length bar caps at 2√ó gold length; multiple methods share the same gold reference text.")
    return "
".join(md)


md_table = make_similarity_markdown(sim_df)
print(md_table)

with open("similarity_table.md", "w") as f:
    f.write(md_table)
print("Saved: similarity_table.md")


**Similarity between curated (Raw_json) and API v2 (Raw_data) CT.gov JSON per trial**  

| Rank | Trial ID | Gold len | API len | API/Gold | Len bar | Char sim | miniLM | mpnet | bge_base | Mean cos |
| ---: | :--- | ---: | ---: | ---: | :---: | ---: | ---: | ---: | ---: | ---: |
| 3 | NCT01767311 | 104,027 | 104,027 | 1.000 | ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñØ‚ñØ‚ñØ‚ñØ‚ñØ | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| 3 | NCT02008357 | 844,418 | 844,418 | 1.000 | ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñØ‚ñØ‚ñØ‚ñØ‚ñØ | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| 1 | NCT02477800 | 765,203 | 765,198 | 1.000 | ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñØ‚ñØ‚ñØ‚ñØ‚ñØ | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| 9 | NCT02484547 | 742,903 | 742,903 | 1.000 | ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñØ‚ñØ‚ñØ‚ñØ‚ñØ | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| 5 | NCT03443973 | 534,879 | 534,879 | 1.000 | ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñØ‚ñØ‚ñØ‚ñØ‚ñØ | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| 9 | NCT03444870 | 869,861 | 869,861 | 1.000 | ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñÆ‚ñØ‚ñØ‚ñØ‚ñØ‚ñØ | 1.000 | 1.000 | 1.000 | 1.000 | 1

In [None]:
# ================================================================
# Clean numeric table + save to TSV/CSV (no markdown formatting)
# ================================================================
import numpy as np
import pandas as pd

df = sim_df.copy().sort_values(["trial_id", "method"]).reset_index(drop=True)
embedding_cols = [c for c in df.columns if c.endswith("_cos")]

df["mean_cos"] = df[embedding_cols].mean(axis=1) if embedding_cols else np.nan
df["len_ratio_method_over_gold"] = np.where(
    df["gold_len"].astype(float) > 0,
    df["method_len"].astype(float) / df["gold_len"].astype(float),
    np.nan,
)

df["rank"] = (
    df["mean_cos"].rank(ascending=False, method="first").astype(int)
)

summary_cols = (
    ["rank", "trial_id", "method", "method_label",
     "gold_len", "method_len", "len_ratio_method_over_gold", "char_sim"]
    + embedding_cols
    + ["mean_cos"]
)

summary_df = df[summary_cols].sort_values("rank").reset_index(drop=True)

float_fmt = lambda x: f"{x:.6f}"
print("```text")
print(summary_df.to_string(index=False, float_format=float_fmt))
print("```")

summary_df.to_csv(
    "similarity_summary.tsv",
    sep="	",
    index=False,
    float_format="%.6f",
)
summary_df.to_csv(
    "similarity_summary.csv",
    index=False,
    float_format="%.6f",
)
print("Saved similarity_summary.tsv and similarity_summary.csv")


```text
 rank    trial_id  gold_len  api_len  char_sim  miniLM_cos  mpnet_cos  bge_base_cos  mean_cos
    1 NCT02477800    765203   765198  0.999994    1.000000   1.000000      1.000000  1.000000
    2 NCT04437511    497698   497698  0.999996    1.000000   1.000000      1.000000  1.000000
    3 NCT01767311    104027   104027  0.999981    1.000000   1.000000      1.000000  1.000000
    4 NCT02008357    844418   844418  0.999998    1.000000   1.000000      1.000000  1.000000
    5 NCT03443973    534879   534879  0.999996    1.000000   1.000000      1.000000  1.000000
    6 NCT04777396    196339   196339  0.999990    1.000000   1.000000      1.000000  1.000000
    7 NCT05026866    130912   130912  0.999985    1.000000   1.000000      1.000000  1.000000
    8 NCT05108922    384732   384732  0.999995    1.000000   1.000000      1.000000  1.000000
    9 NCT02484547    742903   742903  0.999997    1.000000   1.000000      1.000000  1.000000
   10 NCT03444870    869861   869861  0.999998    1.

# Two other methods

In [None]:
# ================================================================
# Cell 1 ‚Äì Imports, paths, trial IDs
# ================================================================
import os
import json
from typing import Dict, Any, List
from collections import OrderedDict
from difflib import SequenceMatcher

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

# ---- 10 trial IDs ----
TRIAL_IDS = [
    "NCT01767311",
    "NCT02008357",
    "NCT02477800",
    "NCT02484547",
    "NCT03443973",
    "NCT03444870",
    "NCT03887455",
    "NCT04437511",
    "NCT04770220",
    "NCT04777396",
]

print("Using trials:", TRIAL_IDS)

# ---- Folder structure (your Mac) ----
BASE_DIR = "/Users/guoshuyan/Desktop/OpenAD"

RAW_JSON  = os.path.join(BASE_DIR, "Raw_json")     # ground truth
RAW_API_V2 = os.path.join(BASE_DIR, "Raw_data")    # API v2 JSON
RAW_HTML   = os.path.join(BASE_DIR, "Raw_html")    # raw HTML pages

os.makedirs(RAW_API_V2, exist_ok=True)
os.makedirs(RAW_HTML, exist_ok=True)

print("Folders ready.")


Using trials: ['NCT01767311', 'NCT02008357', 'NCT02477800', 'NCT02484547', 'NCT03443973', 'NCT03444870', 'NCT03887455', 'NCT04437511', 'NCT04770220', 'NCT04777396']
Folders ready.


In [None]:
# ================================================================
# Cell 2 ‚Äì Utility similarity + flattening functions
# ================================================================
def text_similarity(a: str, b: str) -> float:
    """Character-level similarity in [0, 1]."""
    a, b = (a or "").strip(), (b or "").strip()
    if not a and not b: return 1.0
    if not a or not b:  return 0.0
    return SequenceMatcher(None, a, b).ratio()


def flatten_json(obj: Any, prefix: str = "") -> Dict[str, str]:
    """Flatten nested JSON fully."""
    out = {}
    if isinstance(obj, dict):
        for k, v in obj.items():
            out.update(flatten_json(v, f"{prefix}.{k}" if prefix else k))
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            out.update(flatten_json(v, f"{prefix}[{i}]"))
    else:
        out[prefix or "root"] = "" if obj is None else str(obj)
    return out


def json_to_struct_text(obj: Dict[str, Any]) -> str:
    """Turn whole JSON into deterministic structured text."""
    flat = flatten_json(obj)
    lines = [f"{k}: {v}" for k, v in sorted(flat.items())]
    return "\n".join(lines)


In [None]:
# ================================================================
# Cell 3 ‚Äì Safe I/O
# ================================================================
def load_json_if_exists(path: str) -> Dict[str, Any]:
    if not os.path.exists(path): return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except:
        return {}

def load_text_if_exists(path: str) -> str:
    if not os.path.exists(path): return ""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except:
        return ""


In [None]:
# ================================================================
# Cell 4 ‚Äì Download both raw documents (API v2 + HTML)
# ================================================================
CTGOV_V2_BASE = "https://clinicaltrials.gov/api/v2/studies"

def fetch_api_v2_json(tid: str):
    url = f"{CTGOV_V2_BASE}/{tid}"
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        print(f"‚ö†Ô∏è API v2 JSON not found for {tid}")
        return {}
    return resp.json()

def fetch_html(tid: str):
    url = f"https://clinicaltrials.gov/study/{tid}"
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        print(f"‚ö†Ô∏è HTML not found for {tid}")
        return ""
    return resp.text

for tid in TRIAL_IDS:
    # API v2
    p_json = os.path.join(RAW_API_V2, f"{tid}.json")
    if not os.path.exists(p_json):
        j = fetch_api_v2_json(tid)
        if j:
            with open(p_json, "w", encoding="utf-8") as f:
                json.dump(j, f, indent=2, ensure_ascii=False)

    # HTML
    p_html = os.path.join(RAW_HTML, f"{tid}.html")
    if not os.path.exists(p_html):
        html = fetch_html(tid)
        if html:
            with open(p_html, "w", encoding="utf-8") as f:
                f.write(html)

print("Download step complete.")


Download step complete.


In [None]:
# ================================================================
# Cell 5 ‚Äì Whole-document loaders (NO eligibility slicing)
# ================================================================
def gold_text(tid: str) -> str:
    j = load_json_if_exists(os.path.join(RAW_JSON, f"{tid}.json"))
    return json_to_struct_text(j) if j else ""

def api_v2_text(tid: str) -> str:
    j = load_json_if_exists(os.path.join(RAW_API_V2, f"{tid}.json"))
    return json_to_struct_text(j) if j else ""

def html_text(tid: str) -> str:
    html = load_text_if_exists(os.path.join(RAW_HTML, f"{tid}.html"))
    if not html: return ""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text("\n", strip=True)


In [None]:
# ================================================================
# Cell 6 ‚Äì Only TWO extraction systems
# ================================================================
EXTRACTION_SYSTEMS = OrderedDict({
    "api_v2": {
        "label": "API v2 JSON",
        "loader": api_v2_text,
    },
    "html": {
        "label": "HTML",
        "loader": html_text,
    },
})

print("üåü Using two extraction methods:")
for k, cfg in EXTRACTION_SYSTEMS.items():
    print(f"- {k}: {cfg['label']}")


üåü Using two extraction methods:
- api_v2: API v2 JSON
- html: HTML


In [None]:
# ================================================================
# Cell 7 ‚Äì Embedding models (for similarity only)
# ================================================================
from sentence_transformers import SentenceTransformer

MODELS = OrderedDict({
    "miniLM": SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    "mpnet": SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
})

def embed(model, text):
    text = (text or "").strip()
    if not text:
        return np.zeros(model.get_sentence_embedding_dimension())
    return model.encode(text, convert_to_numpy=True)

def cosine(a, b):
    if a is None or b is None or a.size == 0 or b.size == 0: return np.nan
    na, nb = np.linalg.norm(a), np.linalg.norm(b)
    if na == 0 or nb == 0: return np.nan
    return float(np.dot(a, b) / (na * nb))


In [None]:
# ================================================================
# Cell 9 ‚Äì Summary table (ranked, cleaned)
# ================================================================
embedding_cols = [c for c in sim_df.columns if c.endswith("_cos")]

# mean cosine across all embedding models
sim_df["mean_cos"] = sim_df[embedding_cols].mean(axis=1)

# rank trials by best method similarity (higher mean_cos = better)
sim_df["rank"] = sim_df["mean_cos"].rank(ascending=False, method="first")

# keep only the columns we care about
cols_keep = [
    "rank", "trial_id", "method_label",
    "char_sim", "mean_cos",
    "gold_len", "method_len",
] + embedding_cols

summary_df = (
    sim_df[cols_keep]
    .sort_values(["rank", "trial_id", "method_label"])
    .reset_index(drop=True)
)

# nice display (rounded)
display(
    summary_df.style
    .format({
        "char_sim": "{:.3f}",
        "mean_cos": "{:.3f}",
        **{c: "{:.3f}" for c in embedding_cols}
    })
)

summary_path = os.path.join(BASE_DIR, "summary_api_vs_html.csv")
summary_df.to_csv(summary_path, index=False)
print("Saved:", summary_path)


Unnamed: 0,rank,trial_id,method_label,char_sim,mean_cos,gold_len,method_len,miniLM_cos,mpnet_cos,mean_cos.1
0,1.0,NCT03887455,API v2 JSON (whole),1.0,1.0,154889,154889,1.0,1.0,1.0
1,2.0,NCT01767311,API v2 JSON (whole),1.0,1.0,104027,104027,1.0,1.0,1.0
2,3.0,NCT02477800,API v2 JSON (whole),1.0,1.0,765203,765198,1.0,1.0,1.0
3,4.0,NCT02484547,API v2 JSON (whole),1.0,1.0,742903,742903,1.0,1.0,1.0
4,5.0,NCT03443973,API v2 JSON (whole),1.0,1.0,534879,534879,1.0,1.0,1.0
5,6.0,NCT03444870,API v2 JSON (whole),1.0,1.0,869861,869861,1.0,1.0,1.0
6,7.0,NCT04437511,API v2 JSON (whole),1.0,1.0,497698,497698,1.0,1.0,1.0
7,8.0,NCT04770220,API v2 JSON (whole),0.293,1.0,62594,351362,1.0,1.0,1.0
8,9.0,NCT04777396,API v2 JSON (whole),1.0,1.0,196339,196339,1.0,1.0,1.0
9,10.0,NCT02008357,API v2 JSON (whole),1.0,1.0,844418,844418,1.0,1.0,1.0


Saved: /Users/guoshuyan/Desktop/OpenAD/summary_api_vs_html.csv
