In [16]:
import json
from pathlib import Path

import dspy
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

True

In [3]:
path = Path("../data/radgraphXL/cleaned_data.jsonl")
assert path.exists()

In [21]:
with open(path) as f:
    data = [json.loads(line) for line in f]
print(f"Loaded {len(data)} reports from RadGraph-XL dataset.")
print(data[0].keys())  # Print keys of the first report to understand its structure

Loaded 2300 reports from RadGraph-XL dataset.
dict_keys(['dataset', 'doc_key', 'tokens', 'ner', 'relations'])


In [14]:
class RadGraphExtractor(dspy.Signature):
    input_text: str = dspy.InputField(desc="radiology report text")
    extracted_entities: list[str] = dspy.OutputField(desc="Extracted clinical entities from the report")

In [15]:
radex = dspy.ChainOfThought(RadGraphExtractor)

In [26]:
lm = dspy.LM("gemini/gemini-2.5-pro")
dspy.configure(lm=lm)
lm("write a poem about extracting clinical entities from a radiology report")  # Test the model

['The report arrives, a wall of prose,\nA river dense with medical decree,\nWhere findings sleep and diagnoses dose,\nA patient\'s story, for the code to see.\n\nNo human eye, but gaze of script and light,\nBegins to parse, to segment, and to scan,\nA silent hunter in the field of white,\nAccording to a logic-driven plan.\n\nFirst, find the landmarks, anchor to the frame:\nThe *thoracic cage*, the *liver\'s* proper name,\nThe *femur\'s shaft*, the *cerebrum\'s* winding game,\nEach organ tagged, a target in the aim.\n\nThen, seek the signal, separate the chaff,\nIt tags the *lesion*, isolates the *mass*,\nThe *pleural effusion* in a paragraph,\nThe subtle *fracture* that a glance might pass.\n\nIt notes the texture, "*ground-glass opacity*,"\nThe shape, "*spiculated*," a descriptor stark.\nIt finds the size with cold tenacity,\n"*Seven millimeters*," a numeric mark.\n\nBut just as keen, it learns what isn\'t there,\nA crucial capture, elegantly won:\n"*No evidence of metastatic spread*,

In [33]:
sample = data[0]
rad_text = " ".join(sample['tokens'])
print(rad_text)

STUDY : CT torso . HISTORY : Metastatic breast cancer and recently diagnosed squamous cell cancer . Restaging . COMPARISONS : _ _ _ . TECHNIQUE : Following the uneventful administration of intravenous contrast , MDCT images were acquired from the thoracic inlet to the pubic symphysis and displayed in axial , coronal and sagittal reconstructions . Three - minute delayed images through the abdomen were also acquired . CT OF THE CHEST WITH IV CONTRAST : Left mastectomy is again evident . Heart size is normal . No pericardial effusion . Right upper paratracheal and prevascular lymph nodes measure up to 5 mm , within normal range . There is no axillary or hilar adenopathy . A few subpleural blebs are again evident at the right lung apex . There are no pulmonary nodules . No consolidation or pleural effusion . Bibasilar dependent atelectasis . CT OF THE ABDOMEN WITH IV CONTRAST : There are numerous low - attenuation lesions throughout the liver , many of which are new compared to the CT of _

In [34]:
pred = radex(input_text=rad_text)

In [39]:
for e in pred.extracted_entities:
    print(e)

Metastatic breast cancer
squamous cell cancer
Left mastectomy
subpleural blebs
Bibasilar dependent atelectasis
liver metastases
gallstones
sigmoid diverticula
Extensive osseous metastasis
multiple left posterior rib fractures
Disease progression
Cholelithiasis
osseous metastatic disease


In [37]:
print(sample['ner'])

[[77, 77, 'Anatomy::definitely present'], [78, 78, 'Observation::definitely present'], [83, 83, 'Anatomy::definitely present'], [84, 84, 'Anatomy::definitely present'], [86, 86, 'Observation::definitely present'], [89, 89, 'Anatomy::definitely present'], [90, 90, 'Observation::definitely absent'], [92, 96, 'Anatomy::definitely present'], [97, 98, 'Observation::definitely present'], [102, 107, 'Observation::definitely present'], [112, 112, 'Anatomy::definitely present'], [114, 114, 'Anatomy::definitely present'], [115, 115, 'Observation::definitely absent'], [117, 119, 'Observation::definitely present'], [126, 128, 'Anatomy::definitely present'], [133, 133, 'Anatomy::definitely present'], [134, 134, 'Observation::definitely absent'], [137, 137, 'Observation::definitely absent'], [139, 139, 'Anatomy::definitely present'], [140, 140, 'Observation::definitely absent'], [142, 142, 'Anatomy::definitely present'], [143, 144, 'Observation::definitely present'], [156, 160, 'Observation::definit

In [42]:
lm.history[-1]['messages'][0]['content']  # View the prompt sent to the model

'Your input fields are:\n1. `input_text` (str): radiology report text\nYour output fields are:\n1. `reasoning` (str): \n2. `extracted_entities` (list[str]): Extracted clinical entities from the report\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\n[[ ## input_text ## ]]\n{input_text}\n\n[[ ## reasoning ## ]]\n{reasoning}\n\n[[ ## extracted_entities ## ]]\n{extracted_entities}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}\n\n[[ ## completed ## ]]\nIn adhering to this structure, your objective is: \n        Given the fields `input_text`, produce the fields `extracted_entities`.'