In [1]:
import torch
import numpy as np
import torch
from PIL import Image
import random

# Load Checkpoint for Inference

In [13]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def inference(model, processor, n, image_paths, seed=42):

    image_tokens = "<image>" * n
    prompt = f"<|im_start|>user {image_tokens}\nProvide a description of the findings and impressions in the radiology images given the following images of the study.|im_end|><|im_start|>assistant"

    # chat template in interleaved format work same as in sampling videos. Just pass in as many images you want for a prompt
    conversation = [
        {

        "role": "user",
        "content": [
            {"type": "text", "text": "Provide a description of the findings and impressions in the radiology images given the following images of the study."},
            {"type": "image"},
            {"type": "image"},
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)


    images = []   
    for image_path in image_paths:
        images.append(
            Image.open(image_path).convert("RGB")
        )

    inputs_images = processor(text=prompt, images=images, padding=True, return_tensors="pt").to(model.device)

    # Ensure correct dtypes for different inputs
    for key, value in inputs_images.items():
        if torch.is_tensor(value):
            if key in ['input_ids', 'attention_mask']:
                inputs_images[key] = value.long()
            elif key == 'pixel_values':
                inputs_images[key] = value.half()

    # Use deterministic generation settings
    output = model.generate(
        **inputs_images,
        max_new_tokens=512,
        do_sample=False,  # Use greedy decoding
        temperature=1.0,  # Fixed temperature
        num_beams=1,  # No beam search
        top_k=1,  # Only consider the most likely token
        top_p=1.0,  # No nucleus sampling
    )
    
    # Format and print the output
    prediction = processor.decode(output[0], skip_special_tokens=True)
    
    print("\n" + "="*80)
    print("RADIOLOGY REPORT FINDINGS")
    print("="*80)
    
    # Split the prediction into sections if it contains multiple parts
    sections = prediction.split('|im_end|')
    for section in sections:
        if section.strip():
            # Remove any remaining template markers
            cleaned_section = section.replace('|im_start|', '').replace('assistant', '').strip()
            if cleaned_section:
                print(f"\n{cleaned_section}")
    
    print("\n" + "="*80 + "\n")
    
    return prediction


In [3]:
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig

# Set a global seed for reproducibility
GLOBAL_SEED = 42
set_seed(GLOBAL_SEED)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4", 
)

original_model_id = "llava-hf/llava-interleave-qwen-7b-hf" 

original_model = LlavaForConditionalGeneration.from_pretrained(
    original_model_id, 
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True, 
)

from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# Enabling gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()

peft_model = get_peft_model(original_model, config)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [03:43<00:00, 55.87s/it]


In [5]:
# Print parameters to be trained under LoRA
peft_model.print_trainable_parameters()

trainable params: 15,458,304 || all params: 8,156,558,880 || trainable%: 0.1895


In [14]:
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig

# Set a global seed for reproducibility
GLOBAL_SEED = 42
set_seed(GLOBAL_SEED)

# Initialise model configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4", 
)

# Image paths
image_paths = [
      "/working/datasets/mimic-cxr/mimic-cxr-images-512/img/p10/p10046166/s50051329/427446c1-881f5cce-85191ce1-91a58ba9-0a57d3f5.jpg",
      "/working/datasets/mimic-cxr/mimic-cxr-images-512/img/p10/p10046166/s50051329/abea5eb9-b7c32823-3a14c5ca-77868030-69c83139.jpg"
    ]
n = len(image_paths)

# Load processor, same for both models
processor = AutoProcessor.from_pretrained(original_model_id)

# Original model ID
ORIGINAL_MODEL_ID = "llava-hf/llava-interleave-qwen-7b-hf" 

 # Process with original model
original_model = LlavaForConditionalGeneration.from_pretrained(
    ORIGINAL_MODEL_ID, 
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True, 
)
print("\n=== Original Model Processing ===")
inference(original_model, processor, n, image_paths, seed=GLOBAL_SEED)
del original_model

# Fine-tuned model ID
FINETUNED_MODEL_ID = "/working/rajan/multiview-llm/Models/Finetune/lmms-finetune/checkpoints/llava-interleave-qwen-7b_lora-True_qlora-True"

 # Process with original model
finetuned_model = LlavaForConditionalGeneration.from_pretrained(
    FINETUNED_MODEL_ID, 
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True, 
)
print("\n=== Fine-tuned Model Processing ===")
inference(finetuned_model, processor, n, image_paths, seed=GLOBAL_SEED)
del finetuned_model



Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.72s/it]
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.



=== Original Model Processing ===

RADIOLOGY REPORT FINDINGS

user

Provide a description of the findings and impressions in the radiology images given the following images of the study.

The first image shows a lateral view of the chest, focusing on the right side. The ribcage is visible, and there is a distinct area of concern, which appears to be a mass or a lesion. The second image is a frontal view of the chest, showing the entire ribcage and the upper part of the spine. The area of concern is still present, and the ribcage is intact. The third image is a lateral view similar to the first, but this time, the area of concern is on the left side. The ribcage is intact, and there are no visible signs of disease. The fourth image is a frontal view similar to the second, but this time, the area of concern is on the right side. The ribcage is intact, and there are no visible signs of disease.




Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.91s/it]
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.



=== Fine-tuned Model Processing ===

RADIOLOGY REPORT FINDINGS

user

Provide a description of the findings and impressions in the radiology images given the following images of the study.

The patient is status post median sternotomy and CABG. Heart size is normal. Mediastinal and hilar contours are unremarkable. Pulmonary vasculature is not engorged. Patchy opacities are noted in the lung bases, more pronounced on the left, which may reflect areas of atelectasis. No focal consolidation, pleural effusion or pneumothorax is present. No acute osseous abnormality is visualized.




# Generate

In [50]:
import json
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig
from PIL import Image
import os

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def load_model_and_processor(model_path, original_model_id):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
    )
    
    processor = AutoProcessor.from_pretrained(original_model_id)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        low_cpu_mem_usage=True,
    )
    
    return model, processor

def generate_description(model, processor, image_paths, system_prompt):
    images = [Image.open(img_path).convert("RGB") for img_path in image_paths]
    image_tokens = "<image>" * len(images)
    user_prompt = f"<|im_start|>user {image_tokens}\nProvide a description of the findings and impressions in the radiology images given the following images of the study.|im_end|><|im_start|>assistant"

    # chat template in interleaved format work same as in sampling videos. Just pass in as many images you want for a prompt
    conversation = [
        {

        "role": "user",
        "content": [
            {"type": "text", "text": "Provide a description of the findings and impressions in the radiology images given the following images of the study."},
            ]+ [{"type": "image"} for _ in image_paths]
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, padding=True, return_tensors="pt").to(model.device)
    
    for key, value in inputs.items():
        if torch.is_tensor(value):
            if key in ['input_ids', 'attention_mask']:
                inputs[key] = value.long()
            elif key == 'pixel_values':
                inputs[key] = value.half()
    
    output = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_k=1,
        top_p=0.9,
    )

    # temperature=1.0,  # Fixed temperature
    # num_beams=1,  # No beam search
    # top_k=1,  # Only consider the most likely token
    # top_p=1.0,  # No nucleus sampling
    
    return processor.decode(output[0], skip_special_tokens=True)

def extract_study_id(image_path):
    # Split the path and find the component starting with 's'
    path_components = image_path.split('/')
    for component in path_components:
        if component.startswith('s') and component[1:].isdigit():
            return component
    return None  # Return None if no valid study_id is found

def evaluate_test_set(test_set_path, model_path, original_model_id, output_path):
    set_seed(42)  # Set a fixed seed for reproducibility
    model, processor = load_model_and_processor(model_path, original_model_id)
    
    with open(test_set_path, 'r') as f:
        test_set = json.load(f)
    
    results = []
    
    for item in test_set:
        study_id = extract_study_id(item['image'][0])
        if study_id is None:
            print(f"Warning: Could not extract study_id from {item['image'][0]}")
            continue
        
        ground_truth = item['conversations'][1]['value']
        system_prompt = item['system_prompt']
        
        generated_output = generate_description(model, processor, item['image'], system_prompt)
        
        results.append({
            'study_id': study_id,
            'ground_truth': ground_truth,
            'generated_output': generated_output
        })
    
    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"Evaluation complete. Results saved to {output_path}")


In [51]:
# Evaluate
test_set_path = '/working/rajan/multiview-llm/Models/Finetune/dataset/data/mimic_cxr_multi_test_findings.json'
model_path = '/working/rajan/multiview-llm/Models/Finetune/lmms-finetune/checkpoints/llava-interleave-qwen-7b_lora-True_qlora-True'
original_model_id = 'llava-hf/llava-interleave-qwen-7b-hf'
output_path = 'outputs/results_multi.json'

evaluate_test_set(test_set_path, model_path, original_model_id, output_path)

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.69s/it]
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for

Evaluation complete. Results saved to outputs/results_multi.json


In [66]:
def extract_assistant_response(generated_output):
    parts = generated_output.split('assistant')
    if len(parts) > 1:
        return parts[-1].strip()
    else:
        return generated_output.strip()

def post_process_results(input_path, output_path):
    # Read the existing results
    with open(input_path, 'r') as f:
        results = json.load(f)
    
    # Process each result
    for result in results:
        result['generated_output'] = extract_assistant_response(result['generated_output'])
    
    # Save the processed results
    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"Post-processing complete. Updated results saved to {output_path}")

In [67]:
# Usage
input_path = 'outputs/results_multi.json'
output_path = 'outputs/processed_results.json'

post_process_results(input_path, output_path)

Post-processing complete. Updated results saved to outputs/processed_results.json


In [2]:
import json
# Load results from a JSON file
with open('/working/rajan/multiview-llm/Models/Finetune/usage/outputs/processed_results.json', 'r') as f:
    results = json.load(f)

In [3]:
results[0]

{'study_id': 's50051329',
 'ground_truth': 'Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume. There is no focal airspace consolidation to suggest pneumonia. A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study. No pleural effusions or pulmonary edema. There is no pneumothorax. The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery.',
 'generated_output': 'The patient is status post median sternotomy and CABG. Heart size is normal. The mediastinal and hilar contours are unremarkable. The pulmonary vasculature is normal. No focal consolidation, pleural effusion or pneumothorax is present. There are no acute osseous abnormalities.'}

# Evaluate

## Lexical Metrics

In [4]:
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from rouge import Rouge
import numpy as np
from tqdm import tqdm

nltk.download('punkt', quiet=True)

def calculate_metrics(results):
    references = []
    hypotheses = []
    cider_gts = {}
    cider_res = {}
    meteor_gts = {}
    meteor_res = {}
    
    print("Preprocessing data...")
    for i, result in enumerate(tqdm(results)):
        reference = nltk.word_tokenize(result['ground_truth'])
        hypothesis = nltk.word_tokenize(result['generated_output'])
        references.append([reference])
        hypotheses.append(hypothesis)
        
        cider_gts[i] = [result['ground_truth']]
        cider_res[i] = [result['generated_output']]
        meteor_gts[i] = [result['ground_truth']]
        meteor_res[i] = [result['generated_output']]
    
    print("Calculating BLEU scores...")
    smoothie = SmoothingFunction().method1
    bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu_2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
    bleu_3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
    bleu_4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
    
    print("Calculating CIDEr score...")
    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(cider_gts, cider_res)
    
    print("Calculating ROUGE scores...")
    rouge_scorer = Rouge()
    rouge_scores = rouge_scorer.get_scores([r['generated_output'] for r in results], 
                                           [r['ground_truth'] for r in results], avg=True)
    
    return {
        'BLEU-1': bleu_1,
        'BLEU-2': bleu_2,
        'BLEU-3': bleu_3,
        'BLEU-4': bleu_4,
        'CIDEr': cider_score,
        'ROUGE-1': rouge_scores['rouge-1']['f'],
        'ROUGE-2': rouge_scores['rouge-2']['f'],
        'ROUGE-L': rouge_scores['rouge-l']['f']
    }

In [5]:
# Calculate metrics - generated from `write_radiologic_report()` method
metrics = calculate_metrics(results)

# Print results
for metric, score in metrics.items():
    print(f"{metric}: {score:.4f}")

Preprocessing data...


100%|██████████| 1600/1600 [00:01<00:00, 868.94it/s]


Calculating BLEU scores...
Calculating CIDEr score...
Calculating ROUGE scores...
BLEU-1: 0.2660
BLEU-2: 0.1679
BLEU-3: 0.1159
BLEU-4: 0.0809
CIDEr: 0.0777
ROUGE-1: 0.2956
ROUGE-2: 0.1082
ROUGE-L: 0.2828


## CheXbert

In [6]:
results[12]

{'study_id': 's53356050',
 'ground_truth': 'Chest PA and lateral radiograph demonstrates unchanged cardiomediastinal and hilar contours. No overt pulmonary edema is evident though chronic mild interstitial abnormalities are stable. Faint opacification projecting over the left mid lung may represent developing infectious process. There is no definitive correlate on the lateral radiograph. No pleural effusion or pneumothorax present. Mild separation of superior aspect of sternotomy line with intact sternotomy sutures.',
 'generated_output': 'The patient is status post median sternotomy and CABG. The heart size is normal. The mediastinal and hilar contours are unremarkable. The pulmonary vascularity is normal. No focal consolidation, pleural effusion or pneumothorax is present. There are no acute osseous abnormalities.'}

In [7]:
from chexbert import CheXbertMetrics
import torch
torch.cuda.set_device(0)

# Setup CheXbertMetrics
test_chexbert_metrics = CheXbertMetrics(
    bert_path='bert-base-uncased',
    checkpoint_path='/working/rajan/multiview-llm/Models/Multi/cxrmate/checkpoints/stanford/chexbert/chexbert.pth',
    ckpt_dir='ckpt',
    mbatch_size=1,
    exp_dir='metrics',
)

# Prepare the data
res = [result['generated_output'] for result in results]
gt = [result['ground_truth'] for result in results]
ids = list(range(len(res)))

# Update CheXbert metrics
test_chexbert_metrics.update(res, gt, ids)

# compute CheXbert metrics
chexbert_scores = test_chexbert_metrics.compute()

# Print CheXbert metrics
print("CheXbert Metrics:")
for key, value in chexbert_scores.items():
    if isinstance(value, torch.Tensor):
        print(f'{key}: {value.item():.3f}')
    else:
        print(f'{key}: {value:.3f}')

  from .autonotebook import tqdm as notebook_tqdm


CheXbert Metrics:
ce_precision_macro: 0.292
ce_recall_macro: 0.226
ce_f1_macro: 0.218
ce_precision_micro: 0.472
ce_recall_micro: 0.322
ce_f1_micro: 0.383
ce_precision_example: 0.405
ce_recall_example: 0.294
ce_f1_example: 0.317
ce_num_examples: 1600.000


In [8]:

# Combine with other metrics
all_metrics = {
    **metrics,  # Your previously calculated metrics
    **chexbert_scores  # CheXbert metrics
}

# Print all metrics
print("\nAll Metrics:")
for key, value in all_metrics.items():
    print(f'{key}: {value:.3f}')


All Metrics:
BLEU-1: 0.266
BLEU-2: 0.168
BLEU-3: 0.116
BLEU-4: 0.081
CIDEr: 0.078
ROUGE-1: 0.296
ROUGE-2: 0.108
ROUGE-L: 0.283
ce_precision_macro: 0.292
ce_recall_macro: 0.226
ce_f1_macro: 0.218
ce_precision_micro: 0.472
ce_recall_micro: 0.322
ce_f1_micro: 0.383
ce_precision_example: 0.405
ce_recall_example: 0.294
ce_f1_example: 0.317
ce_num_examples: 1600.000


## CXRBERT

In [9]:
from cxr_bert import CXRBERT

# CXR-BERT:
cxr_bert_metric = CXRBERT(
    ckpt_dir='ckpt', 
    mbatch_size=10, 
    exp_dir='metrics',
    split='test',
    accumulate_over_dicoms = False)

# calculate CXR-BERT metric
study_ids = [result['study_id'] for result in results]
# dicom_ids = [result['dicom_id'] for result in results]
cxr_bert_metric.update(res, [[label] for label in gt], study_ids)
cxr_bert_scores = cxr_bert_metric.compute(epoch=1)

# Add CXR-BERT metric to all_metrics
for key, value in cxr_bert_scores.items():
    if isinstance(value, torch.Tensor):
        all_metrics[f"CXR-BERT_{key}"] = value.item()
    else:
        all_metrics[f"CXR-BERT_{key}"] = value

# Print CXR-BERT metrics
print("CXR-BERT Metrics:")
for key, value in cxr_bert_scores.items():
    if isinstance(value, torch.Tensor):
        print(f'{key}: {value.item():.3f}')
    else:
        print(f'{key}: {value:.3f}')



CXR-BERT Metrics:
cxr_bert_metric: 0.413


In [10]:
# Print summary of all metrics
print("\n===== Test Epoch End Metrics Summary =====")
for key, value in all_metrics.items():
    print(f"{key}: {value:.3f}")
print("==========================================\n")


===== Test Epoch End Metrics Summary =====
BLEU-1: 0.266
BLEU-2: 0.168
BLEU-3: 0.116
BLEU-4: 0.081
CIDEr: 0.078
ROUGE-1: 0.296
ROUGE-2: 0.108
ROUGE-L: 0.283
ce_precision_macro: 0.292
ce_recall_macro: 0.226
ce_f1_macro: 0.218
ce_precision_micro: 0.472
ce_recall_micro: 0.322
ce_f1_micro: 0.383
ce_precision_example: 0.405
ce_recall_example: 0.294
ce_f1_example: 0.317
ce_num_examples: 1600.000
CXR-BERT_cxr_bert_metric: 0.413



In [13]:
output_path = 'outputs/metrics_multiview'
with open(output_path, 'w') as f:
    json.dump(all_metrics, f, indent=2)