In [1]:
from pathlib import Path
import rich
import polars as pl
import json

In [2]:
# Console object obtained from rich; later cells call console.print on it for status messages.
console = rich.get_console()

In [3]:
# Directory holding the RadGraph-XL JSONL files; the relative path is resolved
# against the notebook's current working directory.
radgraphXL_path = Path("../data/radgraphXL")
# Fail fast if the data directory is missing, and report the absolute path that
# was checked so a wrong working directory is easy to spot.
assert radgraphXL_path.exists(), (
    f"RadGraph-XL data path does not exist: {radgraphXL_path.resolve()}"
)

In [4]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the ``.jsonl`` file (``str`` or ``Path``).

    Returns:
        list: One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a stray empty line at the end of the file),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]


mimic_data = load_jsonl(radgraphXL_path / "mimic-radgraph-XL.jsonl")
console.print(f"Loaded {len(mimic_data)} examples")

stanford_data = load_jsonl(radgraphXL_path / "stanford-radgraph-XL.jsonl")
console.print(f"Loaded {len(stanford_data)} examples")

# Combined list: all MIMIC records first, followed by all Stanford records.
all_data = mimic_data + stanford_data
console.print(f"Total examples: {len(all_data)}")

In [5]:
# Reduce every raw record to a flat dict with the five fields written out later.
# Only index 0 of 'sentences', 'ner' and 'relations' is read.
# NOTE(review): presumably each document holds exactly one sentence -- confirm,
# because anything stored past index 0 is ignored here.
cleaned_data = []
for item in all_data:
    # Records lacking these keys fall back to the placeholder "unknown".
    dataset = item.get("dataset", "unknown")
    doc_key = item.get("doc_key", "unknown")
    tokens = item["sentences"][0]
    # The unpacking enforces the arity of each entry: 3 fields per entity ...
    ner = [
        (span_start, span_end, tag)
        for span_start, span_end, tag in item["ner"][0]
    ]
    # ... and 5 fields per relation.
    relations = [
        (a_start, a_end, b_start, b_end, rel_label)
        for a_start, a_end, b_start, b_end, rel_label in item["relations"][0]
    ]
    cleaned_data.append(
        dict(
            dataset=dataset,
            doc_key=doc_key,
            tokens=tokens,
            ner=ner,
            relations=relations,
        )
    )

In [6]:
# Display `ner` as left in the namespace by the cleaning loop, i.e. the entity
# tuples of the record processed most recently (kernel state, not a per-record view).
ner

[(54, 54, 'Observation::definitely absent'),
 (55, 55, 'Observation::definitely absent'),
 (57, 57, 'Observation::definitely absent'),
 (59, 59, 'Observation::definitely absent'),
 (61, 61, 'Anatomy::definitely present'),
 (62, 62, 'Observation::definitely absent'),
 (66, 66, 'Anatomy::definitely present'),
 (67, 68, 'Observation::definitely absent'),
 (71, 71, 'Observation::definitely absent'),
 (72, 72, 'Anatomy::definitely present'),
 (73, 73, 'Observation::definitely absent'),
 (77, 77, 'Anatomy::definitely present'),
 (79, 79, 'Anatomy::definitely present'),
 (80, 80, 'Anatomy::definitely present'),
 (82, 82, 'Observation::definitely present'),
 (91, 93, 'Observation::definitely present'),
 (96, 96, 'Anatomy::definitely present'),
 (97, 97, 'Anatomy::definitely present'),
 (100, 100, 'Anatomy::definitely present'),
 (101, 101, 'Anatomy::definitely present'),
 (103, 103, 'Observation::definitely present'),
 (105, 105, 'Anatomy::definitely present'),
 (106, 106, 'Anatomy::definitely

In [6]:
# Write the cleaned records as JSON Lines next to the raw files. The location is
# derived from radgraphXL_path (the same directory the inputs were read from)
# rather than repeating the directory as a separate string literal.
cleaned_path = radgraphXL_path / "cleaned_data.jsonl"
with open(cleaned_path, "w", encoding="utf-8") as f:
    for item in cleaned_data:
        # One JSON document per line; tuples are serialized as JSON arrays.
        f.write(json.dumps(item))
        f.write("\n")
console.print(f"Saved {len(cleaned_data)} examples to cleaned_data.jsonl")