In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/ReportsDATASET.csv')

In [4]:
df

Unnamed: 0,Text
0,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...
1,"EXAM(S): Chest, 2 views, frontal and lateral\n..."
2,\nExam\nXray Chest PA and Lateral\n\nDate\nXXX...
3,\nRADIOLOGY REPORT\n\nExamination\nPA and late...
4,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...
...,...
1979,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...
1980,"\nExam\nPA and lateral views of the chest, XXX..."
1981,\nRADIOLOGY REPORT\n\nExam\nPA and lateral che...
1982,\nSIGNATURE\nXXXX\n\nRADIOLOGY REPORT\n\nHisto...


In [5]:
df['Text'][0]

'\nChest PA-Lat XR\n\nImaging Study\nXray Chest PA and Lateral\nExam: 2 views of the chest XXXX/XXXX.\n \nComparison: None.\n \nIndication: Positive TB test\n \nFindings:\nThe cardiac silhouette and mediastinum size are within normal limits.\nThere is no pulmonary edema. There is no focal consolidation. There\nare no XXXX of a pleural effusion. There is no evidence of\npneumothorax.\n \nImpression:\nNormal chest x-XXXX. \nThis examination and reported findings have been reviewed and\nconfirmed by the undersigned.\n\n'

## Run llama 3 locally

- install `ollama`
- run `ollama pull llama3` to pull down the llama 3 8B model 
- start the model running using `ollama run llama3`

In [6]:
import requests
import json

In [7]:
url = "http://localhost:11434/api/chat"

In [8]:
def llama3(prompt: str) -> str:
    data = {
        "model": "llama3",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "stream": False
    }
    
    headers = {
        'Content-Type': 'application/json'
    }
    
    response = requests.post(url, headers=headers, json=data)
    
    return response.json()['message']['content']

In [9]:
response = llama3("who wrote the book the godfather?")

In [10]:
response

'The book "The Godfather" was written by Mario Puzo, an American author and screenwriter. The novel was published in 1969 and tells the story of the Corleone crime family, which is based on the Italian-American Mafia.\n\nPuzo\'s novel was a huge success, and it went on to be adapted into a film of the same name by Francis Ford Coppola in 1972. The movie starred Marlon Brando as Don Vito Corleone and became one of the most iconic and influential films in cinema history.\n\nMario Puzo wrote several other novels and screenplays throughout his career, but "The Godfather" remains his most famous work.'

### Extract labels using llama 3

In [11]:
import xml.etree.ElementTree as ET

In [12]:
# Define the list of abnormalities
abnormalities = ["pulmonary edema", "consolidation", "pleural effusion", "pneumothorax", "cardiomegaly"]

In [13]:
def classify_abnormalities(report):
    global abnormalities
    
    # Initialize results
    results = {abnormality: 0 for abnormality in abnormalities}
    
    # Prepare the prompt for the GPT-4 model
    prompt = f"Read the following radiology report and identify the presence or absence of the following abnormalities: {', '.join(abnormalities)}.\n\nReport:\n{report}\n\nOutput the results, formatted in xml, with each of the abnormalities with 0 for absence and 1 for presence. The output should be xml with no other text."
    
    # Get the classification results from llama 3
    response = llama3(prompt)
    
    return response

In [14]:
def clean_xml_string(xml_string):
    """
    Clean the XML string to ensure it is well-formed.
    """
    # Remove leading/trailing whitespace
    xml_string = xml_string.strip()
    
    # Normalize the XML string
    xml_string = xml_string.replace('-', '_').lower()
    xml_string = xml_string.replace('pulmonary edema', 'pulmonary_edema').lower()
    xml_string = xml_string.replace('pulmonaryedema', 'pulmonary_edema').lower()
    xml_string = xml_string.replace('pleural effusion', 'pleural_effusion').lower()
    xml_string = xml_string.replace('pleuraleffusion', 'pleural_effusion').lower()
    
    # Additional cleaning steps can be added here if needed
    
    return xml_string

def extract_abnormalities_from_xml(xml_string):
    """
    This function extracts abnormalities and their values from the given XML string.
    """
    # Clean the XML string
    xml_string = clean_xml_string(xml_string)
    
    # Define the list of abnormalities we're interested in
    abnormalities = ["pulmonary_edema", "consolidation", "pleural_effusion", "pneumothorax", "cardiomegaly"]
    
    # Initialize the results dictionary
    results = {abnormality: 0 for abnormality in abnormalities}
    
    try:
        # Parse the XML
        root = ET.fromstring(xml_string)
        
        # Extract values
        for abnormality in abnormalities:
            element = root.find(f".//{abnormality}")
            if element is not None:
                results[abnormality] = int(element.text.strip())
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        print(f"XML string: {xml_string}")
    
    return results

In [15]:
def process_radiology_reports(df):
    """
    This function processes the radiology reports in the dataframe and extracts the abnormalities.
    """
    # Initialize a list to store the results
    data = []

    for index, row in df.iterrows():
        try:
            report_text = row['Text']

            # Here we assume `run_llama3` is your function that processes the report text and returns the XML
            xml_output = classify_abnormalities(report_text)

            # Extract abnormalities from the XML
            abnormalities = extract_abnormalities_from_xml(xml_output)

            # Combine the original text with the extracted abnormalities
            data.append({**{'Text': report_text}, **abnormalities})
        except:
            print(f'WARNING! Issue with index: {index}')
    
    # Create a new dataframe from the results
    new_df = pd.DataFrame(data)
    
    return new_df

In [16]:
df_sample = df[:50].copy()

In [17]:
df_rad = process_radiology_reports(df_sample)

Error parsing XML: syntax error: line 1, column 0
XML string: .xml
<?xml version="1.0"?>
<radiology_report>
  <pulmonary_edema>0</pulmonary_edema>
  <consolidation>0</consolidation>
  <pleural_effusion>0</pleural_effusion>
  <pneumothorax>0</pneumothorax>
  <cardiomegaly>0</cardiomegaly>
</radiology_report>


In [18]:
df_rad

Unnamed: 0,Text,pulmonary_edema,consolidation,pleural_effusion,pneumothorax,cardiomegaly
0,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...,0,0,0,0,0
1,"EXAM(S): Chest, 2 views, frontal and lateral\n...",0,0,0,0,1
2,\nExam\nXray Chest PA and Lateral\n\nDate\nXXX...,0,0,0,0,0
3,\nRADIOLOGY REPORT\n\nExamination\nPA and late...,0,0,0,0,0
4,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...,0,0,0,0,0
5,\nRADIOLOGY REPORT\n\nPA and Lateral Chest\nXX...,0,0,0,0,0
6,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...,0,0,0,0,0
7,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...,0,0,0,0,0
8,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...,0,0,0,0,0
9,\nRADIOLOGY REPORT\n\nEXAM\nPA and lateral che...,0,0,0,0,0


### Save the labels to disk

In [19]:
df_rad.to_csv('../data/report_pseudo_labels_llama3.csv', index=False)

In [20]:
df_sample['Text'][12]

'\nRADIOLOGY REPORT\n\nExam\nPA and lateral chest radiograph (2 views) (2 images) Date: XXXX, XXXX at XXXX hours Indication: Chest pain. Comparison: Chest radiograph from XXXX, XXXX. Findings: The cardiac silhouette is borderline enlarged. Otherwise, there is no focal opacity. Mediastinal contours are within normal limits. There is no large pleural effusion. No pneumothorax. Transcribed by - PSCB Transcription Date - XXXX\n\nIMPRESSION\nBorderline enlargement of the cardiac silhouette without acute pulmonary disease. DICTATED BY : Dr. XXXX XXXX XXXX XXXX XXXX ELECTRONICALLY SIGNED XXXX. XXXX XXXX XXXX XXXX XXXX TRANSCRIBED XXXX 11 XXXX XXXX  RADRES XXXX\n\nSIGNATURE\nXXXX\n\n'

# Train a model on our pseudo labels

## Option 1: Fine Tune

### Step 1: Data Preparation

In [21]:
!pip install scikit-learn torch transformers datasets

Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting torch
  Downloading torch-2.3.1-cp311-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Collecting filelock (f

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load your dataframe
df = pd.read_csv('path_to_your_dataframe.csv')

# Ensure the columns are in the correct format
df['Text'] = df['Text'].astype(str)
df['pulmonary_edema'] = df['pulmonary_edema'].astype(int)
df['consolidation'] = df['consolidation'].astype(int)
df['pleural_effusion'] = df['pleural_effusion'].astype(int)
df['pneumothorax'] = df['pneumothorax'].astype(int)
df['cardiomegaly'] = df['cardiomegaly'].astype(int)

# Split the dataframe into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert pandas dataframe to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})


### Step 2: Tokenization

In [None]:
# Define the tokenizer
tokenizer = AutoTokenizer.from_pretrained("RLHFlow/ArmoRM-Llama3-8B-v0.1", use_fast=True)

def tokenize_function(example):
    return tokenizer(example['Text'], padding="max_length", truncation=True, max_length=512)

# Apply tokenization to the dataset
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)


### Step 3: Model Setup

In [None]:
# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(
    "RLHFlow/ArmoRM-Llama3-8B-v0.1",
    num_labels=5,  # Number of labels (one for each abnormality)
    problem_type="multi_label_classification"
)

# Define the columns to keep and set the format for PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(["Text"])
tokenized_datasets.set_format("torch")


### Step 4: Define Training Arguments

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)


### Step 5: Define the Trainer

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)


### Step 6: Train the Model

In [None]:
trainer.train()

## Option 2: Pre-trained model feature extractor

This portion will largely follow chapter 2 of Natural Language Processing with Transformers by Tunstall, Werra, and Wolf

To get code working and as a guide we can use the emotions dataset, which looks at classifying the emotion assocated with Twitter messages and is available from Hugging Face Hub. 

In [None]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

### Model setup

In [None]:
model_id = 'intfloat/e5-base-v2'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model = AutoModel.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Extracting the last hidden state

In [None]:
text = "some sample text"
inputs = tokenizer(text, return_tensors="pt")

Note that the hidden state or embedding vector on the class token is being used here. This class token is the one typically used for classification tasks. We will start by using that here as well

In [None]:
def extract_hidden_states(batch):
    inputs = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

### Create feature matrix

### Train model on the extracted features

We could use a simple fully connected model where the final output has `sigmoid` activation function. Or we could use an ensemble model (e.g. xgboost).