In [None]:
import pandas as pd

In [None]:
# Load the reports dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/ReportsDATASET.csv')

In [None]:
# Display the loaded DataFrame for a quick visual check.
df

In [None]:
# Show the 'Text' value of the row labelled 0 to see what one raw report looks like.
df['Text'][0]

## Run Llama 3 locally

- Install `ollama`.
- Run `ollama pull llama3` to download the Llama 3 8B model.
- Start the model with `ollama run llama3`; the notebook talks to the Ollama server at `http://localhost:11434`.

In [None]:
import requests
import json

In [None]:
# Chat endpoint of the locally running Ollama server; used by `llama3` below.
url = "http://localhost:11434/api/chat"

In [None]:
def llama3(prompt: str, model: str = "llama3", timeout: float = 300.0) -> str:
    """Send a single-turn chat prompt to the local Ollama server and return the reply.

    Args:
        prompt: Text of the user message sent to the model.
        model: Name of the Ollama model to query. Defaults to ``"llama3"``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            server cannot hang the notebook indefinitely.

    Returns:
        The ``message.content`` string of the non-streamed chat response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
        KeyError: If the response JSON has no ``message``/``content`` entry.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        # Request one complete JSON document rather than a stream of chunks.
        "stream": False,
    }

    # `json=` serialises the payload and sets the JSON Content-Type header,
    # so no explicit headers dict is needed.
    response = requests.post(url, json=payload, timeout=timeout)

    # Surface HTTP errors directly instead of failing later with a confusing
    # KeyError when the error body has no 'message' key.
    response.raise_for_status()

    return response.json()["message"]["content"]

In [None]:
# Smoke test: send a simple question to confirm the local model is reachable.
response = llama3("who wrote the book the godfather?")

In [None]:
# Show the text returned by the model for the smoke-test question.
response

### Extract labels using Llama 3

In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Abnormalities to look for in each report. These names are inserted verbatim
# into the LLM prompt built by `classify_abnormalities`.
abnormalities = ["pulmonary edema", "consolidation", "pleural effusion", "pneumothorax", "cardiomegaly"]

In [None]:
def classify_abnormalities(report):
    """Ask the local Llama 3 model which abnormalities a radiology report mentions.

    Builds a prompt that lists every entry of the module-level ``abnormalities``
    list, embeds the report text, and requests an XML-only answer with 0 for
    absence and 1 for presence.

    Args:
        report: Text of a single radiology report.

    Returns:
        The raw string returned by ``llama3``. It is expected to be XML but is
        not validated here.
    """
    # `abnormalities` is only read, so no `global` declaration is required.
    prompt = (
        "Read the following radiology report and identify the presence or absence "
        f"of the following abnormalities: {', '.join(abnormalities)}.\n\n"
        f"Report:\n{report}\n\n"
        "Output the results, formatted in xml, with each of the abnormalities "
        "with 0 for absence and 1 for presence. The output should be xml with no other text."
    )

    # Get the classification results from llama 3
    return llama3(prompt)

In [None]:
def clean_xml_string(xml_string):
    """
    Normalise raw LLM output so it is more likely to parse as XML.

    The string is stripped and lowercased, anything before the first '<' or
    after the last '>' is discarded (this removes markdown code fences and
    explanatory sentences around the XML), hyphens inside element tags are
    turned into underscores, and known spellings of the two-word abnormality
    names are mapped to their underscore form.

    Args:
        xml_string: Raw text returned by the model.

    Returns:
        The cleaned, lowercased string. If the input contains no '<...>' span
        it is returned stripped and lowercased, and parsing is left to fail in
        the caller.
    """
    import re  # local import so this cell does not depend on another import cell

    # Lowercase once up front; every later step works on lowercase text.
    cleaned = xml_string.strip().lower()

    # Keep only the outermost '<' ... '>' span to drop code fences / chatter.
    start = cleaned.find('<')
    end = cleaned.rfind('>')
    if start != -1 and end > start:
        cleaned = cleaned[start:end + 1]

    # Replace '-' with '_' only inside element tags. Tags must start with a
    # letter or underscore, so declarations ('<?xml ... encoding="utf-8"?>')
    # and comments ('<!-- -->') are left untouched.
    cleaned = re.sub(
        r'</?[a-z_][^<>]*>',
        lambda tag: tag.group(0).replace('-', '_'),
        cleaned,
    )

    # Map spaced / run-together spellings to the underscore tag names.
    for variant, canonical in (
        ('pulmonary edema', 'pulmonary_edema'),
        ('pulmonaryedema', 'pulmonary_edema'),
        ('pleural effusion', 'pleural_effusion'),
        ('pleuraleffusion', 'pleural_effusion'),
    ):
        cleaned = cleaned.replace(variant, canonical)

    return cleaned

def extract_abnormalities_from_xml(xml_string):
    """
    Extract the 0/1 flag of each abnormality from the model's XML answer.

    The input is first passed through ``clean_xml_string``. Each abnormality is
    looked up as a descendant element of the root.

    Args:
        xml_string: Raw XML text returned by the model.

    Returns:
        A dict mapping each underscore-style abnormality name to an int.
        An abnormality keeps the default 0 when its element is missing or
        empty, and every value is 0 when the XML cannot be parsed (the parse
        error and the offending string are printed).

    Raises:
        ValueError: If an element contains text that is not an integer.
    """
    # Clean the XML string
    xml_string = clean_xml_string(xml_string)

    # Underscore spellings of the abnormalities, matching what
    # clean_xml_string produces for the two-word names.
    tag_names = ["pulmonary_edema", "consolidation", "pleural_effusion", "pneumothorax", "cardiomegaly"]

    # Default every abnormality to 0 (absent).
    results = {name: 0 for name in tag_names}

    try:
        root = ET.fromstring(xml_string)
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        print(f"XML string: {xml_string}")
        return results

    for name in tag_names:
        element = root.find(f".//{name}")
        if element is None:
            continue

        # `element.text` is None for an empty element such as <consolidation/>;
        # treat that like a missing element instead of crashing.
        value = (element.text or "").strip()
        if not value:
            continue

        try:
            results[name] = int(value)
        except ValueError as exc:
            raise ValueError(f"Non-integer value {value!r} for <{name}>") from exc

    return results

In [None]:
def process_radiology_reports(df):
    """
    Label every radiology report in ``df`` with the abnormalities found by the model.

    For each row, the 'Text' value is sent to ``classify_abnormalities`` and the
    returned XML is decoded with ``extract_abnormalities_from_xml``.

    Args:
        df: DataFrame with a 'Text' column holding the report text.

    Returns:
        A new DataFrame with one row per successfully processed report: the
        original 'Text' plus one column per abnormality. Rows that raise an
        error are skipped after printing a warning, so the result may have
        fewer rows than ``df`` and uses a fresh 0..n-1 index.
    """
    records = []

    for index, row in df.iterrows():
        try:
            report_text = row['Text']

            # Ask the model for an XML classification of this report.
            xml_output = classify_abnormalities(report_text)

            # Decode the XML into a {name: flag} mapping.
            labels = extract_abnormalities_from_xml(xml_output)

            # Combine the original text with the extracted abnormalities
            records.append({'Text': report_text, **labels})
        except Exception as exc:
            # Catch `Exception`, not everything: Ctrl-C (KeyboardInterrupt)
            # must still be able to stop a long run. Report what went wrong
            # so the failing row can be diagnosed.
            print(f'WARNING! Issue with index: {index} ({type(exc).__name__}: {exc})')

    # Build the frame once from the collected records.
    return pd.DataFrame(records)

In [None]:
# Work on an independent copy of the first 40 rows so the LLM run stays short.
df_sample = df[:40].copy()

In [None]:
# Label every report in the sample; each row triggers one call to the local model.
df_rad = process_radiology_reports(df_sample)

In [None]:
# Display the labelled reports.
df_rad

In [None]:
# Show the 'Text' value of the row labelled 12 in the sample for manual inspection.
df_sample['Text'][12]