In [None]:
import json
import os
import pandas as pd
from pathlib import Path
import re
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List
from utils import json_to_dataframe, json_to_string_list

In [None]:
# All checkpoints are cached beneath one shared folder in the user's home directory
CENTRAL_MODEL_DIR = os.path.expanduser('~/huggingface_models')
os.makedirs(CENTRAL_MODEL_DIR, exist_ok=True)  # no-op when the folder already exists

# Hugging Face model id to run; leave exactly one assignment uncommented
# model_name = 'microsoft/phi-2'
# model_name = 'microsoft/phi-1_5'
# model_name = 'microsoft/Phi-3.5-mini-instruct'
# model_name = 'google/gemma-2-9b'
# model_name = 'meta-llama/Meta-Llama-3.1-8B'
# model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct' # Downloaded locally (slow)
model_name = 'google/gemma-2-2b-it' # Downloaded locally
# model_name = 'google/gemma-2-9b-it' # Downloaded locally (slow)
# model_name = 'Qwen/Qwen2.5-7B-Instruct' # Downloaded locally (slow but faster than gemma 2 9b and llama 3.1 8b)
# model_name = 'meta-llama/Llama-3.2-3B-Instruct'
# model_name = 'meta-llama/Llama-3.2-1B-Instruct'

# Per-model cache folder: the hub id with every '/' flattened to '-' so that it
# becomes a single path component under CENTRAL_MODEL_DIR
local_model_path = os.path.join(CENTRAL_MODEL_DIR, '-'.join(model_name.split('/')))

In [None]:
# Pick the compute device once: CUDA when torch can see a GPU, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# device_map is handed to from_pretrained further down:
#   - "auto" lets the library spread the weights over the available GPUs
#   - {"": device} pins the whole model to the detected device (the CPU here)
device_map = "auto" if torch.cuda.is_available() else {"": device}

## Optional: load a quantized model

Quantization is worth trying when the full-precision model does not fit into GPU memory.

In [None]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from bitsandbytes.nn import Linear8bitLt

# def load_quantized_model(model_name, quantization_bit=8):
#     # Ensure bitsandbytes is installed
#     try:
#         import bitsandbytes as bnb
#     except ImportError:
#         raise ImportError("Please install bitsandbytes: pip install bitsandbytes")

#     # Set up quantization configuration
#     if quantization_bit == 8:
#         bnb_config = {'load_in_8bit': True}
#     elif quantization_bit == 4:
#         bnb_config = {'load_in_4bit': True}
#     else:
#         raise ValueError("Quantization bit must be 4 or 8")

#     # Load the tokenizer
#     tokenizer = AutoTokenizer.from_pretrained(model_name)

#     # Load the quantized model
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         device_map="auto",
#         quantization_config=bnb_config,
#         trust_remote_code=True
#     )

#     return model, tokenizer

# # Usage
# model_name = 'Qwen/Qwen2.5-7B-Instruct'
# quantized_model, tokenizer = load_quantized_model(model_name, quantization_bit=8)

# # You can now use quantized_model instead of original_model in your code

### Optional: Hugging Face login

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# Load the model from the local cache when a usable copy is there, otherwise
# download it from the hub and cache it.
#
# The check looks for config.json rather than just the directory: an empty or
# half-written folder (for example after an interrupted save) would otherwise be
# picked up as "local" and make from_pretrained fail instead of re-downloading.
#
# NOTE(review): trust_remote_code=True lets the checkpoint's repository execute
# its own Python code at load time; keep it only for models that need it.
if os.path.isfile(os.path.join(local_model_path, 'config.json')):
    print(f"Loading model from local path: {local_model_path}")
    original_model = AutoModelForCausalLM.from_pretrained(
        local_model_path,
        device_map=device_map,
        # quantization_config=bnb_config,
        trust_remote_code=True
    )
else:
    print(f"Downloading model from {model_name}")
    original_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        # quantization_config=bnb_config,
        trust_remote_code=True
    )
    # Save the model locally so the next run takes the branch above
    original_model.save_pretrained(local_model_path)
    print(f"Model saved to {local_model_path}")

NOTE: If the cell above prints a warning about offloading weights to the CPU, the model did not fit on the GPU and inference will run very slowly.

In [None]:
def create_llm_function(model, tokenizer, max_new_tokens=512, temperature=0.7):
    """
    Builds a single-prompt text-generation callable around a causal language model.

    Args:
    model: Object exposing `.device` and `.generate(...)`; `generate` is expected to
        return sequences that begin with the prompt tokens followed by the new tokens.
    tokenizer: Tokenizer that is callable on a string (with `return_tensors="pt"`),
        exposes `eos_token_id` and provides `decode(...)`.
    max_new_tokens (int): Upper bound on the number of generated tokens.
    temperature (float): Sampling temperature. A value of 0 (or None) switches to
        greedy decoding instead of sampling.

    Returns:
    Callable[[str], str]: Function mapping a prompt to the generated continuation only.
    """
    # Sampling needs a strictly positive temperature; for anything else decode
    # greedily and do not forward a temperature at all.
    use_sampling = temperature is not None and temperature > 0

    def llm_function(prompt: str) -> str:
        # Tokenize the prompt and move the tensors to wherever the model lives
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = inputs["input_ids"].shape[1]

        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": use_sampling,
            "pad_token_id": tokenizer.eos_token_id,
        }
        if use_sampling:
            generation_kwargs["temperature"] = temperature

        # Inference only, so skip gradient tracking
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Keep only the newly generated tokens. Slicing by token count is exact,
        # whereas slicing the decoded text by len(prompt) is wrong whenever
        # decoding does not reproduce the prompt character for character.
        completion_ids = outputs[0][prompt_token_count:]
        return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

    return llm_function

## Load the data

In [None]:
# load public radiology reports dataset
# df = pd.read_csv('../data/ReportsDATASET.csv')

In [None]:
# Path to the JSON file with the imaging reports, relative to the notebook's working directory
filepath = '../data/vector_veterinary_imaging_2.json'

# Both loaders are imported from the project's local `utils` module and are given the same path.
# NOTE(review): presumably `df` holds one report per row, including the
# 'conclusions_and_recommendations' column read by process_radiology_reports — confirm against utils.
df = json_to_dataframe(filepath) 
# NOTE(review): `rad_strings` is not referenced again in the visible part of this notebook.
rad_strings = json_to_string_list(filepath)

In [None]:
df

## Define pre and post processing

In [None]:
def preprocess_radiology_report(report: str) -> str:
    """
    Normalizes the raw text of a radiology report before it is placed in a prompt.

    The steps run in this order: strip anything that looks like an HTML tag,
    collapse every run of whitespace (newlines included) into one space, trim
    both ends, and swap each 'XXXX' marker for '[REDACTED]'.

    Args:
    report (str): The original radiology report text

    Returns:
    str: The cleaned, single-line report text
    """
    # Anything of the form <...> is treated as markup and dropped
    without_tags = re.sub(r'<[^>]+>', '', report)

    # Fold newlines, tabs and repeated blanks into single spaces, then trim
    single_line = re.sub(r'\s+', ' ', without_tags).strip()

    # 'XXXX' is swapped for an explicit placeholder
    return single_line.replace('XXXX', '[REDACTED]')

In [None]:
# example_report = df['Text'][0]

In [None]:
# preprocessed_report = preprocess_radiology_report(example_report)
# print(preprocessed_report)

In [None]:
def post_process_abnormalities(classification_result: str) -> List[str]:
    """
    Lists the abnormalities that a classification result marks as present.

    Args:
    classification_result (str): JSON object string mapping abnormality name to 0 or 1,
        as returned by classify_abnormalities

    Returns:
    List[str]: Names whose value equals 1, in the order they appear in the JSON

    Raises:
    ValueError: If the string is not valid JSON, or if the parsed value cannot be
        walked as a mapping
    """
    try:
        parsed = json.loads(classification_result)

        # Collect only the names flagged with 1
        flagged = []
        for name, flag in parsed.items():
            if flag == 1:
                flagged.append(name)
        return flagged
    except json.JSONDecodeError:
        raise ValueError("Invalid JSON string provided")
    except Exception as e:
        # e.g. the JSON parsed to a list, which has no .items()
        raise ValueError(f"Error processing classification result: {str(e)}")

In [None]:
def post_process_llm_output(output: str) -> str:
    """
    Strips Markdown code-fence markers, stray backticks and newlines from raw LLM text.

    Args:
    output (str): Raw text produced by the LLM

    Returns:
    str: The text on a single line with surrounding whitespace removed
    """
    # Drop ``` fences (optionally tagged "json") together with the whitespace after them
    fence_pattern = re.compile(r'```(?:json)?\s*')
    without_fences = fence_pattern.sub('', output)

    # Remove any remaining backticks, flatten newlines to spaces, then trim
    flattened = without_fences.replace('`', '').replace('\n', ' ')
    return flattened.strip()

## Generate prompt to identify abnormalities

In [None]:
def classify_abnormalities(abnormalities: List[str], report: str, llm_function) -> str:
    """
    Asks an LLM which of the given abnormalities a radiology report describes.

    The report is cleaned with preprocess_radiology_report, embedded in a prompt
    that requests a JSON object of 0/1 flags, and the reply is cleaned with
    post_process_llm_output before being validated.

    Args:
    abnormalities (List[str]): Names of the abnormalities to classify
    report (str): The raw radiology report text
    llm_function: Callable taking the prompt string and returning the LLM's text reply

    Returns:
    str: JSON string of the parsed reply; every requested abnormality is a key in it

    Raises:
    ValueError: If the reply is not valid JSON, if a requested abnormality is
        missing from it, or if any other error occurs while validating it
    """
    # Preprocess the report
    preprocessed_report = preprocess_radiology_report(report)

    # Create a dynamic prompt for the LLM
    prompt = f"""
Given the following radiology report, classify the presence (1) or absence (0) of the specified abnormalities.
Output the result as a JSON string without any additional explanation.

Abnormalities to classify: {', '.join(abnormalities)}

Radiology report:
{preprocessed_report}

Output format:
{{
    "abnormality1": 0 or 1,
    "abnormality2": 0 or 1,
    ...
}}
Return a JSON string without any explanation.
"""

    # Call the LLM function with the prompt
    llm_output = llm_function(prompt)

    # Post-process the LLM output
    llm_output = post_process_llm_output(llm_output)

    # Ensure the output is valid JSON
    try:
        result = json.loads(llm_output)
        # Verify that all abnormalities are present in the output
        for abnormality in abnormalities:
            if abnormality not in result:
                print(result)
                raise ValueError(f"Missing abnormality in LLM output: {abnormality}")
        return json.dumps(result)
    except json.JSONDecodeError as e:
        # Parsing failed, so `result` was never bound: show the raw text instead.
        # Printing `result` here would raise UnboundLocalError and hide the real problem.
        print(llm_output)
        raise ValueError("LLM output is not valid JSON") from e
    except Exception as e:
        # Also wraps the "Missing abnormality" error raised above, so callers
        # only ever have to handle ValueError
        raise ValueError(f"Error processing LLM output: {str(e)}") from e

# Example usage 
def mock_llm_function(prompt: str) -> str:
    """
    Stand-in for a real LLM: ignores the prompt and returns one fixed JSON reply.

    Args:
    prompt (str): Unused; accepted only to match the LLM function signature

    Returns:
    str: JSON string with canned 0/1 flags for five abnormalities
    """
    canned_flags = {
        "pulmonary edema": 1,
        "consolidation": 0,
        "pleural effusion": 1,
        "pneumothorax": 0,
        "cardiomegaly": 1,
    }
    # The default json.dumps separators give the ', ' / ': ' spacing of the canned reply
    return json.dumps(canned_flags)

In [None]:
# Findings to classify; each one becomes a 0/1 column in the labeled DataFrame
abnormalities = [
    "pulmonary edema",
    "consolidation",
    "pleural effusion",
    "pneumothorax",
    "cardiomegaly",
]

In [None]:
# Example usage
# result = classify_abnormalities(abnormalities, example_report, mock_llm_function)
# print(result)

In [None]:
# Using the post-processing function
# present_abnormalities = post_process_abnormalities(result)
# print("Present abnormalities:", present_abnormalities)

## Create the LLM inference function

In [None]:
# Load the tokenizer, using the model's local directory as its cache as well.
# The model is cached under local_model_path, so a model that is already on disk
# should not need the hub again just for its tokenizer.
if os.path.isfile(os.path.join(local_model_path, 'tokenizer_config.json')):
    tokenizer = AutoTokenizer.from_pretrained(local_model_path)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Store the tokenizer files next to the model weights for the next run
    tokenizer.save_pretrained(local_model_path)

# Create the LLM function
llm_function = create_llm_function(original_model, tokenizer)

## Run a single example

In [None]:
# example_2 = df['Text'][3]

In [None]:
# example_report

In [None]:
# result = classify_abnormalities(abnormalities, example_report, llm_function)
# print(result)

In [None]:
# print(example_2)

In [None]:
# result = classify_abnormalities(abnormalities, example_2, llm_function)
# print(result)

## Run the model on a full dataset

In [None]:
def process_radiology_reports(df, abnormalities, llm_function,
                              report_column='conclusions_and_recommendations'):
    """
    Labels every report in a DataFrame with 0/1 flags for the given abnormalities.

    The DataFrame is modified in place: one column per abnormality is added (or
    reset) to 0, then set to 1 for each abnormality the LLM reports as present.
    A report whose classification fails is logged and keeps 0 in every
    abnormality column, so check the printed summary before trusting the zeros.

    Args:
    df: pandas DataFrame holding the reports
    abnormalities (List[str]): Names of the abnormalities to classify
    llm_function: Callable taking a prompt string and returning the LLM's text reply
    report_column (str): Name of the column holding the report text

    Returns:
    The same DataFrame object, with the abnormality columns filled in

    Raises:
    KeyError: If report_column is not a column of df
    """
    # Fail before touching the frame instead of on the first row of the loop
    if report_column not in df.columns:
        raise KeyError(report_column)

    # Create new columns for each abnormality, initialized with 0
    for abnormality in abnormalities:
        df[abnormality] = 0

    failed_indices = []

    # Create a tqdm progress bar
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing reports"):
        report = row[report_column]

        try:
            # Classify abnormalities
            classification_result = classify_abnormalities(abnormalities, report, llm_function)

            # Post-process the classification result
            present_abnormalities = post_process_abnormalities(classification_result)

            # Update the DataFrame
            for abnormality in present_abnormalities:
                df.at[index, abnormality] = 1

        except Exception as e:
            # Best effort: one bad report must not abort the whole run
            failed_indices.append(index)
            print(f"Error processing report at index {index}: {str(e)}")

    # Failed rows look exactly like "nothing found", so make them visible
    if failed_indices:
        print(
            f"{len(failed_indices)} of {df.shape[0]} reports could not be classified "
            f"and were left at 0 (indices: {failed_indices})"
        )

    return df

In [None]:
df.head()

In [None]:
# Label every report; the frame is updated in place and handed back
df = process_radiology_reports(
    df=df,
    abnormalities=abnormalities,
    llm_function=llm_function,
)

## Optional: run batched inference

In [None]:
# def create_batch_llm_function(model, tokenizer, max_new_tokens=512, temperature=0.7, batch_size=8):
#    def batch_llm_function(prompts: List[str]) -> List[str]:
#        # Tokenize all prompts
#        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
       
#        # Generate outputs for the entire batch
#        with torch.no_grad():
#            outputs = model.generate(
#                **inputs,
#                max_new_tokens=max_new_tokens,
#                temperature=temperature,
#                do_sample=True,
#                pad_token_id=tokenizer.eos_token_id
#            )
       
#        # Decode all outputs
#        generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
       
#        # Extract responses (everything after the respective prompts)
#        responses = [text[len(prompt):].strip() for text, prompt in zip(generated_texts, prompts)]
       
#        return responses
   
#    def process_in_batches(all_prompts: List[str]) -> List[str]:
#        all_responses = []
#        for i in range(0, len(all_prompts), batch_size):
#            batch = all_prompts[i:i+batch_size]
#            responses = batch_llm_function(batch)
#            all_responses.extend(responses)
#        return all_responses
   
#    return process_in_batches

# def process_radiology_reports_batch(df, abnormalities, batch_llm_function):
#    for abnormality in abnormalities:
#        df[abnormality] = 0
   
#    all_reports = df['conclusions_and_recommendations'].tolist()
#    all_prompts = [f"Given the following radiology report, classify the presence (1) or absence (0) of the specified abnormalities. Output the result as a JSON string without any additional explanation.\n\nAbnormalities to classify: {', '.join(abnormalities)}\n\nRadiology report:\n{preprocess_radiology_report(report)}\n\nOutput format:\n{{\n    \"abnormality1\": 0 or 1,\n    \"abnormality2\": 0 or 1,\n    ...\n}}\nReturn a JSON string without any explanation." for report in all_reports]
   
#    all_results = batch_llm_function(all_prompts)
   
#    for index, result in enumerate(all_results):
#        try:
#            classification_result = post_process_llm_output(result)
#            present_abnormalities = post_process_abnormalities(classification_result)
#            for abnormality in present_abnormalities:
#                df.at[index, abnormality] = 1
#        except Exception as e:
#            print(f"Error processing report at index {index}: {str(e)}")
   
#    return df

In [None]:
# batch_llm_function = create_batch_llm_function(original_model, tokenizer)

# df = process_radiology_reports_batch(df, abnormalities, batch_llm_function)

## Save the results locally

In [None]:
from datetime import datetime

def save_labeled_dataframe(df, model_name, base_path='./labeled_data'):
    """
    Writes the labeled DataFrame to a CSV file named after the model and today's date.

    Args:
    df: pandas DataFrame to save (written without its index)
    model_name (str): Model identifier; '/' and '\\' are replaced by '_' in the filename
    base_path (str): Output directory, created if it does not exist

    Returns:
    str: Path of the written file, '<base_path>/<model>_model_labeled_<YYYYMMDD>.csv'
    """
    # Path separators in the model id would otherwise create sub-directories
    sanitized_name = model_name
    for separator in ('/', '\\'):
        sanitized_name = sanitized_name.replace(separator, '_')

    # Date stamp in YYYYMMDD form
    date_stamp = datetime.now().strftime('%Y%m%d')

    os.makedirs(base_path, exist_ok=True)
    full_path = os.path.join(base_path, f"{sanitized_name}_model_labeled_{date_stamp}.csv")

    df.to_csv(full_path, index=False)
    print(f"DataFrame saved to {full_path}")

    return full_path

In [None]:
# Persist the labeled reports; the CSV filename carries the model id and today's date
saved_path = save_labeled_dataframe(df=df, model_name=model_name)