In [8]:
import dspy
import json
import langextract as lx
from pathlib import Path
from dotenv import load_dotenv
from rich.console import Console

# Load variables from a local .env file into the process environment.
# presumably this supplies the API credentials needed by the model call made
# later in the notebook — TODO confirm which variable is expected.
load_dotenv()

# Shared rich console used for printing in later cells.
cons = Console()

In [2]:
# Location of the cleaned RadGraph-XL export (one JSON record per line),
# relative to the notebook's working directory.
path = Path("../data/radgraphXL/cleaned_data.jsonl")
# Fail fast with an actionable message. is_file() also rejects a directory
# that happens to sit at this path, which exists() would let through only to
# fail later when the file is opened.
assert path.is_file(), f"Dataset file not found: {path.resolve()}"

In [11]:
# Parse the JSONL file: one JSON document per non-empty line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(path, "r", encoding="utf-8") as f:
    # Skip blank / whitespace-only lines (e.g. stray newlines at the end of
    # the file), which json.loads would reject.
    data = [json.loads(line) for line in f if line.strip()]
# Guard the data[0] lookups below: an empty file should fail with a clear
# message rather than an IndexError.
assert data, f"No records found in {path}"
print(f"Loaded {len(data)} reports from RadGraph-XL dataset.")
print(data[0].keys())
print(data[0]['ner'])

Loaded 2300 reports from RadGraph-XL dataset.
dict_keys(['dataset', 'doc_key', 'tokens', 'ner', 'relations'])
[[77, 77, 'Anatomy::definitely present'], [78, 78, 'Observation::definitely present'], [83, 83, 'Anatomy::definitely present'], [84, 84, 'Anatomy::definitely present'], [86, 86, 'Observation::definitely present'], [89, 89, 'Anatomy::definitely present'], [90, 90, 'Observation::definitely absent'], [92, 96, 'Anatomy::definitely present'], [97, 98, 'Observation::definitely present'], [102, 107, 'Observation::definitely present'], [112, 112, 'Anatomy::definitely present'], [114, 114, 'Anatomy::definitely present'], [115, 115, 'Observation::definitely absent'], [117, 119, 'Observation::definitely present'], [126, 128, 'Anatomy::definitely present'], [133, 133, 'Anatomy::definitely present'], [134, 134, 'Observation::definitely absent'], [137, 137, 'Observation::definitely absent'], [139, 139, 'Anatomy::definitely present'], [140, 140, 'Observation::definitely absent'], [142, 142, '

In [9]:
# Preview the first record: its 'tokens' field holds the report as a list of
# tokens, so rebuild a readable string by joining them with single spaces.
sample = data[0]
rad_text = str.join(" ", sample["tokens"])
cons.print(rad_text)

In [None]:
# Prompt text passed to the extraction call as `prompt_description`.
# Written as adjacent string literals (one per prompt line) so it can be read
# and diffed; the concatenated value is unchanged.
# NOTE(review): this looks like a DSPy adapter template (unfilled `{...}`
# placeholders and `[[ ## field ## ]]` markers) rather than a plain task
# description — confirm it is meant to be passed to langextract verbatim.
prompt = (
    'Your input fields are:\n'
    '1. `input_text` (str): radiology report text\n'
    'Your output fields are:\n'
    '1. `reasoning` (str): \n'
    '2. `extracted_entities` (list[str]): Extracted clinical entities from the report\n'
    'All interactions will be structured in the following way, with the appropriate values filled in.\n'
    '\n'
    '[[ ## input_text ## ]]\n'
    '{input_text}\n'
    '\n'
    '[[ ## reasoning ## ]]\n'
    '{reasoning}\n'
    '\n'
    '[[ ## extracted_entities ## ]]\n'
    '{extracted_entities}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}\n'
    '\n'
    '[[ ## completed ## ]]\n'
    'In adhering to this structure, your objective is: \n'
    '        Given the fields `input_text`, produce the fields `extracted_entities`.'
)

# Single few-shot example for the extractor: one short report snippet with two
# labelled spans, both of class "problem", each carrying its own attributes.
examples = [
    lx.data.ExampleData(
        text="STUDY : CT torso . HISTORY : Metastatic breast cancer and recently diagnosed squamous cell cancer .",
        extractions=[
            lx.data.Extraction(
                extraction_class="problem",
                extraction_text=span,
                attributes=attrs,
            )
            # (span text, attributes) pairs, in order of appearance in the text.
            for span, attrs in (
                ("Metastatic breast cancer", {"assertion": "historical", "urgency": "high"}),
                ("squamous cell cancer", {"assertion": "recent"}),
            )
        ],
    )
]

In [19]:
# Run the extraction on the second record's text (tokens re-joined with
# single spaces).
# NOTE(review): the input here is data[1], whereas the preview cell earlier in
# the notebook prints data[0] — confirm which report is intended.
result = lx.extract(
    # what to run and how it is instructed
    model_id="gemini-2.5-flash",
    prompt_description=prompt,
    examples=examples,
    # input text
    text_or_documents=" ".join(data[1]["tokens"]),
    # tuning knobs
    max_char_buffer=1000,   # smaller contexts, aiming for better accuracy
    extraction_passes=3,    # multiple passes, aiming for better recall
    max_workers=20,         # parallel processing for speed
)



In [20]:
# Display the object returned by lx.extract using the shared rich console.
cons.print(result)