In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('../data/ReportsDATASET.csv')

In [None]:
df

In [None]:
df['Text'][0]

## Run llama 3 locally

- install `ollama`
- run `ollama pull llama3` to pull down the llama 3 8B model 
- start the model running using `ollama run llama3`

In [None]:
import requests
import json

In [None]:
url = "http://localhost:11434/api/chat"

In [None]:
def llama3(prompt: str, model: str = "llama3", timeout: float = 600.0) -> str:
    """Send a single-turn chat prompt to the local Ollama server and return the reply.

    Args:
        prompt: Text sent as the only user message of the conversation.
        model: Name of the Ollama model to query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text content of the model's reply message.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If no response arrives within `timeout` seconds.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        # Ask for one complete JSON document instead of a chunked stream
        "stream": False,
    }

    # `json=` serialises the payload and sets the JSON Content-Type header itself.
    # `url` is the module-level chat endpoint defined in an earlier cell.
    response = requests.post(url, json=payload, timeout=timeout)

    # Fail loudly on an error status rather than with an opaque KeyError on the
    # missing 'message' key below.
    response.raise_for_status()

    return response.json()['message']['content']

In [None]:
response = llama3("who wrote the book the godfather?")

In [None]:
response

### Extract labels using llama 3

In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Define the list of abnormalities
abnormalities = ["pulmonary edema", "consolidation", "pleural effusion", "pneumothorax", "cardiomegaly"]

In [None]:
def classify_abnormalities(report):
    """Ask llama 3 which of the tracked abnormalities a radiology report mentions.

    Args:
        report: Text of a single radiology report.

    Returns:
        The raw model reply as a string. The prompt requests XML with a 0/1
        entry per abnormality, but the reply is returned unparsed.
    """
    # Reads the module-level `abnormalities` list; no `global` statement is
    # needed because the name is only read, never rebound, in this function.
    prompt = f"Read the following radiology report and identify the presence or absence of the following abnormalities: {', '.join(abnormalities)}.\n\nReport:\n{report}\n\nOutput the results, formatted in xml, with each of the abnormalities with 0 for absence and 1 for presence. The output should be xml with no other text."

    # Get the classification results from llama 3
    return llama3(prompt)

In [None]:
def clean_xml_string(xml_string):
    """
    Clean a model reply so that it is more likely to parse as XML.

    Steps, in order:
      1. Strip surrounding whitespace.
      2. Keep only the span from the first '<' to the last '>', which drops
         markdown code fences or prose the model wrapped around the XML.
      3. Replace '-' with '_' and lowercase everything.
      4. Rewrite the two-word abnormality names (spaced or run together) to
         their underscore form so they are valid, predictable tag names.

    Returns the cleaned string; text without any '<...>' span is only
    normalised, not trimmed.
    """
    cleaned = xml_string.strip()

    # Trim anything outside the outermost angle brackets (e.g. "```xml" fences)
    first_open = cleaned.find('<')
    last_close = cleaned.rfind('>')
    if first_open != -1 and last_close > first_open:
        cleaned = cleaned[first_open:last_close + 1]

    # Lowercasing once is enough: every later replacement is lowercase already
    cleaned = cleaned.replace('-', '_').lower()

    tag_rewrites = (
        ('pulmonary edema', 'pulmonary_edema'),
        ('pulmonaryedema', 'pulmonary_edema'),
        ('pleural effusion', 'pleural_effusion'),
        ('pleuraleffusion', 'pleural_effusion'),
    )
    for variant, canonical in tag_rewrites:
        cleaned = cleaned.replace(variant, canonical)

    return cleaned

def extract_abnormalities_from_xml(xml_string):
    """
    Extract the 0/1 value of each tracked abnormality from an XML string.

    The string is first passed through `clean_xml_string`. Every abnormality
    starts at 0 and is overwritten only when a matching element with an
    integer value is found below the root.

    Returns a dict mapping abnormality tag name to int. If the XML cannot be
    parsed, or a value is empty or not an integer, the problem is printed and
    the affected entries keep their default of 0.
    """
    # Clean the XML string
    xml_string = clean_xml_string(xml_string)

    # Tag names we look for (underscore form, as produced by the cleaner)
    tag_names = ["pulmonary_edema", "consolidation", "pleural_effusion", "pneumothorax", "cardiomegaly"]

    # Initialize the results dictionary
    results = {tag: 0 for tag in tag_names}

    try:
        root = ET.fromstring(xml_string)
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        print(f"XML string: {xml_string}")
        return results

    for tag in tag_names:
        element = root.find(f".//{tag}")
        if element is None:
            continue

        # `element.text` is None for an empty element such as <tag/>
        raw_value = (element.text or "").strip()
        try:
            results[tag] = int(raw_value)
        except ValueError:
            # Same best-effort policy as for unparsable XML: report, keep 0
            print(f"Unexpected value for {tag}: {raw_value!r}")

    return results

In [None]:
def process_radiology_reports(df):
    """
    Label every radiology report in `df` with llama 3 and collect the results.

    For each row, the 'Text' column is sent to `classify_abnormalities` and the
    reply is parsed with `extract_abnormalities_from_xml`.

    Returns a new dataframe with the report text in 'Text' plus one column per
    abnormality. Rows whose processing raises an exception are reported with a
    warning and left out of the result.
    """
    records = []

    for index, row in df.iterrows():
        try:
            report_text = row['Text']

            # Raw XML reply from the model for this report
            xml_output = classify_abnormalities(report_text)

            # Parsed 0/1 value per abnormality
            labels = extract_abnormalities_from_xml(xml_output)

            # Combine the original text with the extracted abnormalities
            records.append({'Text': report_text, **labels})
        except Exception as exc:
            # `Exception` (not a bare except) so that interrupting the kernel
            # still stops this long-running loop; include the cause so a
            # skipped row can be diagnosed.
            print(f'WARNING! Issue with index: {index} ({type(exc).__name__}: {exc})')

    # Create a new dataframe from the results
    return pd.DataFrame(records)

In [None]:
df_sample = df[:50].copy()

In [None]:
df_rad = process_radiology_reports(df_sample)

In [None]:
df_rad

### Save the labels to disk

In [None]:
df_rad.to_csv('../data/report_pseudo_labels_llama3.csv', index=False)

In [None]:
df_sample['Text'][12]

## Load data

In [None]:
df_rad = pd.read_csv('../data/report_pseudo_labels_llama3.csv')

In [None]:
df_rad

# Train a model on our pseudo labels

## Option 1: Fine Tune

### Step 1: Data Preparation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load the pseudo-labelled reports saved earlier in this notebook
df = pd.read_csv('../data/report_pseudo_labels_llama3.csv')

# Label columns, in the order used for the multi-hot target vector
LABEL_COLUMNS = ['pulmonary_edema', 'consolidation', 'pleural_effusion', 'pneumothorax', 'cardiomegaly']

# Ensure the columns are in the correct format
df['Text'] = df['Text'].astype(str)
df[LABEL_COLUMNS] = df[LABEL_COLUMNS].astype(int)

# Multi-label training reads its targets from a single `labels` field holding
# one float per class, so pack the five 0/1 columns into one vector per row.
df['labels'] = df[LABEL_COLUMNS].astype(float).values.tolist()

# Split the dataframe into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert pandas dataframe to Hugging Face Dataset; the index is reset so the
# shuffled pandas index is not carried along as an extra column.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})


### Step 2: Tokenization

In [None]:
# Tokenizer matching the checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained(
    "RLHFlow/ArmoRM-Llama3-8B-v0.1",
    use_fast=True,
)

def tokenize_function(example):
    """Tokenize the 'Text' field, padding and truncating to 512 tokens."""
    report_texts = example['Text']
    encoded = tokenizer(
        report_texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    return encoded

# Tokenize both splits; `batched=True` hands the function batches of rows
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)


### Step 3: Model Setup

In [None]:
# Pre-trained checkpoint with a 5-output head, one output per abnormality.
# NOTE(review): this checkpoint is published as a reward model; confirm it
# loads under AutoModelForSequenceClassification with a fresh 5-way head.
model = AutoModelForSequenceClassification.from_pretrained(
    "RLHFlow/ArmoRM-Llama3-8B-v0.1",
    problem_type="multi_label_classification",
    num_labels=5,
)

# The raw text is no longer needed once tokenized; return PyTorch tensors
tokenized_datasets = tokenized_datasets.remove_columns(["Text"])
tokenized_datasets.set_format("torch")


### Step 4: Define Training Arguments

In [None]:
# Hyperparameters for the fine-tuning run.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Schedule: evaluate once per epoch, three epochs in total
    evaluation_strategy="epoch",
    num_train_epochs=3,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
)


### Step 5: Define the Trainer

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute multi-label accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        pred: Prediction object exposing `label_ids` and `predictions`.
            Assumes both are arrays of shape (n_samples, n_labels), with
            `predictions` holding raw logits; TODO confirm against the model output.

    Returns:
        Dict with 'accuracy', 'f1', 'precision' and 'recall'. Precision, recall
        and F1 are micro-averaged over all labels. With multi-label input,
        `accuracy_score` counts a sample as correct only when every label matches.
    """
    labels = pred.label_ids.astype(int)

    # Each label is an independent binary decision, so threshold every logit
    # rather than taking an argmax over labels: sigmoid(x) > 0.5 <=> x > 0.
    preds = (pred.predictions > 0).astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Bundle the model, hyperparameters, both data splits and the metric function
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)


### Step 6: Train the Model

In [None]:
trainer.train()

## Option 2: Pre-trained model feature extractor

This portion largely follows Chapter 2 of *Natural Language Processing with Transformers* by Tunstall, von Werra, and Wolf.

To get the code working, and as a guide, we can use the emotions dataset, which is about classifying the emotion associated with Twitter messages and is available from the Hugging Face Hub.

### Look at class distribution

In [None]:
import matplotlib.pyplot as plt

In [None]:
abnormalities = ['consolidation', 'pneumothorax', 'cardiomegaly']

In [None]:
# Count the occurrences of 1s and 0s for each column.
# Reindexing to [0, 1] guarantees both rows exist even when a value never
# occurs in the sample, so the two-column rename below cannot fail with a
# length mismatch; any value other than 0/1 is left out of the table.
counts = (
    df_rad[abnormalities]
    .apply(pd.Series.value_counts)
    .reindex([0, 1])
    .T
)
counts.columns = ['0', '1']

In [None]:
counts.fillna(0, inplace=True)

In [None]:
counts

In [None]:
# Stacked bar chart of absent (0) vs present (1) counts per condition
ax = counts.plot(kind='bar', stacked=True)
ax.set_title('Distribution of Conditions')
ax.set_xlabel('Condition')
ax.set_ylabel('Count')
ax.legend(title='Value')
plt.show()

### Look at length of radiology reports

In [None]:
df_rad['words per report'] = df_rad['Text'].str.split().apply(len)

In [None]:
# Histogram of report lengths measured in whitespace-separated words
ax = df_rad['words per report'].hist(bins=10, edgecolor='black')

# Title and axis labels
ax.set_title('Distribution of Words per Report')
ax.set_xlabel('Words per Report')
ax.set_ylabel('Frequency')

# Render the figure
plt.show()

In [None]:
df_rad

In [None]:
import torch.nn.functional as F
import torch

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

### Model setup

In [None]:
model_id = 'intfloat/e5-small-v2'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model = AutoModel.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Tokenization

In [None]:
df_rad['Text'][0]

Get the encoded token ids for the text above

In [None]:
encoded_text = tokenizer(df_rad['Text'][0])
print(encoded_text)

Let's see how the original text was tokenized into words and subwords

In [None]:
tokens = tokenizer.convert_ids_to_tokens(tokenizer(df_rad['Text'][0]).input_ids)
print(tokens)

In [None]:
print(tokenizer.convert_tokens_to_string(tokens))

The tokenizer also has a few useful attributes to understand its properties

In [None]:
tokenizer.vocab_size

In [None]:
tokenizer.model_max_length

In [None]:
tokenizer.model_input_names

### Get stats around tokenization length of the reports

In [None]:
def get_token_length(text):
    """Return how many token ids the module-level tokenizer produces for `text`."""
    return len(tokenizer(text)['input_ids'])

In [None]:
df_rad['token_length'] = df_rad['Text'].apply(get_token_length)

In [None]:
df_rad['token_length'].max()

In [None]:
# Histogram of report lengths measured in tokenizer tokens
ax = df_rad['token_length'].hist(bins=10, edgecolor='black')

# Title and axis labels
ax.set_title('Distribution of Tokens per Report')
ax.set_xlabel('Tokens per Report')
ax.set_ylabel('Frequency')

# Render the figure
plt.show()

### Extracting the last hidden state

In [None]:
text = "some sample text"
inputs = tokenizer(text, return_tensors="pt")

Note that the hidden state (embedding vector) of the class token is used here. This class token is the one typically used for classification tasks, so we will start by using it here as well.

In [None]:
def extract_hidden_states(batch):
    """Run the model on a batch and return the hidden state at position 0.

    Only the entries of `batch` whose key is one of the tokenizer's model input
    names are used; each is moved to `device` first. The forward pass runs
    without gradient tracking.

    Returns a dict with key "hidden_state" holding a NumPy array made from the
    first-position slice of the model's last hidden state.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        outputs = model(**model_inputs)
        last_hidden_state = outputs.last_hidden_state

    # Index 0 along the second axis is the first token of every sequence
    first_token_state = last_hidden_state[:, 0]
    return {"hidden_state": first_token_state.cpu().numpy()}

### Create feature matrix

### Train model on the extracted features

We could use a simple fully connected model where the final output has `sigmoid` activation function. Or we could use an ensemble model (e.g. xgboost).