In [1]:
import json
import os

import pandas as pd
from openai import OpenAI

- The "system" role sets the context or guidelines for the conversation. It can provide background information or instructions that guide how the AI should respond.

In [2]:
# Read the API key from the environment instead of hardcoding it, so the secret
# never ends up in the saved notebook or in version control.
# If OPENAI_API_KEY is unset this is None; the OpenAI client then performs its
# own environment lookup and raises a clear error when no key is available.
openai_key = os.environ.get("OPENAI_API_KEY")

# Instructions for the model. Kept separate from the report text so the report
# travels as a "user" message rather than being spliced into the instructions.
# The prompt must mention JSON explicitly: JSON mode (response_format
# "json_object") rejects requests whose messages never mention JSON.
_SYSTEM_PROMPT = """
You are a specialist Radiologist who specialize in diagnosing and treating diseases and injuries using X-rays images.
You will be provided with a chest X-ray report, which consists of several findings (sentences).
Reorganize the findings from the provided chest X-ray report into anatomical regions: lung, heart, mediastinal, bone, and others.
For each region, assign the corresponding findings that relate specifically to it.
If any findings cannot be assigned to these anatomical regions, place them under "others".

lung: any mention of lung conditions, e.g. pneumothorax, consolidation, nodules.
heart: any mention of the heart, e.g. cardiomegaly, heart size, heart failure.
mediastinal: any mention of structures in the mediastinum, e.g. aorta, vena cava, trachea, windpipe, lymph nodes, esophagus.
bone: any mention of bone-related findings, e.g. fractures, abnormalities in ribs, spine.
others: any findings that do not clearly belong to the above categories, e.g. soft tissue, skin, or general findings.

Review the report carefully before assigning findings to the regions. Do not rush to answer.
Consider each finding (sentence) in each anatomical region before making the final decision.

Respond with a single valid JSON object and nothing else (no markdown fences, no commentary).

Expected Output:
{
    "lung": "",
    "heart": "",
    "mediastinal": "",
    "bone": "",
    "others": ""
}
"""


def classify_report(report, client=None, model="gpt-4o-mini", temperature=0):
    """Ask the chat model to regroup a chest X-ray report by anatomical region.

    Parameters
    ----------
    report : str
        Free-text chest X-ray report (one or more finding sentences).
    client : optional
        An already-constructed OpenAI client to reuse across calls. When omitted,
        a new client is created from the module-level ``openai_key``.
    model : str, optional
        Chat model name; defaults to ``"gpt-4o-mini"``.
    temperature : float, optional
        Sampling temperature. Defaults to 0 so that repeated classification of
        the same report is as reproducible as the API allows.

    Returns
    -------
    str
        The raw message content of the first choice. JSON mode is requested, so
        this is expected to be a JSON object string with the keys ``lung``,
        ``heart``, ``mediastinal``, ``bone`` and ``others``; callers parse it
        with ``json.loads``.

    Notes
    -----
    Errors raised by the client (authentication, network, rate limits) are not
    caught here and propagate to the caller.
    """
    if client is None:
        client = OpenAI(api_key=openai_key)

    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": f"Input chest X-ray report: {report}"},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        # JSON mode: the reply is a bare JSON object instead of free text that
        # may be wrapped in markdown fences and break json.loads downstream.
        response_format={"type": "json_object"},
        temperature=temperature,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    return response.choices[0].message.content


# Sample report used to try the classifier on a single input. It is built from
# explicit per-line literals so the trailing spaces and newlines are visible.
report = (
    "\n"
    "The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size. \n"
    "The lungs are mildly hypoinflated but grossly clear of focal airspace disease, pneumothorax, or pleural effusion. \n"
    "There are mild degenerative endplate changes in the thoracic spine. There are no acute bony findings.\n"
    "Mediastinal contours within normal limits in size.\n"
)

# Classify the sample and parse the model's reply into a dict for display.
output = json.loads(classify_report(report))
output

{'lung': 'The lungs are mildly hypoinflated but grossly clear of focal airspace disease, pneumothorax, or pleural effusion.',
 'heart': 'The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size.',
 'mediastinal': 'Mediastinal contours within normal limits in size.',
 'bone': 'There are mild degenerative endplate changes in the thoracic spine. There are no acute bony findings.',
 'others': ''}

In [3]:
file_path = 'data/annotation_quiz_all.json'

# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# default encoding, which can mis-decode non-ASCII characters in the reports.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Only the validation split is re-annotated in this notebook.
data_val = data["val"]

print(len(data_val))
data_val[0]

296


{'id': 'CXR2279_IM-0865',
 'original_report': 'Heart size is enlarged. The aorta is unfolded. Otherwise the mediastinal contour is normal. There are streaky bibasilar opacities. There are no nodules or masses. No visible pneumothorax. No visible pleural fluid. The XXXX are grossly normal. There is no visible free intraperitoneal air under the diaphragm.',
 'split': 'val'}

In [4]:
import time
from tqdm.notebook import tqdm

def _classify_with_retry(report_text, max_attempts=3, retry_wait_seconds=20):
    """Classify one report and parse the reply, retrying on malformed JSON.

    Calls ``classify_report`` and ``json.loads`` up to ``max_attempts`` times,
    sleeping ``retry_wait_seconds`` between attempts. If the last attempt still
    yields unparseable output, the ``json.JSONDecodeError`` is re-raised.
    Other exceptions (e.g. from the API client) are not caught.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(1, max_attempts + 1):
        try:
            return json.loads(classify_report(report_text))
        except json.JSONDecodeError:
            if attempt == max_attempts:
                raise
            time.sleep(retry_wait_seconds)


val_classified = []
for index, record in enumerate(tqdm(data_val), start=1):
    # `report_id` rather than `id`: a global named `id` would shadow the
    # builtin for the rest of the kernel session.
    report_id = record['id']
    original_report = record['original_report']

    output = _classify_with_retry(original_report)
    val_classified.append({"id": report_id, "report": output, "split": "val"})

    # Brief pause after every 5th request to stay gentle on the API.
    if index % 5 == 0:
        time.sleep(2)


  0%|          | 0/296 [00:00<?, ?it/s]

In [7]:
# Refuse to overwrite the split with a partial result (e.g. if the
# classification loop was interrupted or this cell was run out of order).
assert len(val_classified) == len(data_val), (
    f"expected {len(data_val)} classified reports, got {len(val_classified)}"
)
data['val'] = val_classified

In [8]:
new_file_path = 'data/annotation_quiz_all_v2.json'

# Serialize before opening the file: if `data` contains something that cannot
# be encoded as JSON, the error is raised here and the output file is neither
# truncated nor left half-written.
serialized = json.dumps(data, indent=4)

with open(new_file_path, 'w', encoding='utf-8') as f:
    f.write(serialized)