# Finetuning

In [1]:
!pip install unsloth datasets trl
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@32f86a0

Collecting unsloth
  Downloading unsloth-2025.2.15-py3-none-any.whl.metadata (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m51.2/57.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting unsloth_zoo>=2025.2.7 (from unsloth)
  Downloading unsloth_zoo-2025.2.7-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 

Found existing installation: unsloth 2025.2.15
Uninstalling unsloth-2025.2.15:
  Successfully uninstalled unsloth-2025.2.15
Collecting git+https://github.com/unslothai/unsloth.git@32f86a0
  Cloning https://github.com/unslothai/unsloth.git (to revision 32f86a0) to /tmp/pip-req-build-v2ms54co
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-v2ms54co
[0m  Running command git checkout -q 32f86a0
  Resolved https://github.com/unslothai/unsloth.git to commit 32f86a0
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.2.15-py3-none-any.whl size=189134 sha256=53536191762ffd16b36ed56a892b2a6bb73344d3ccc039f1b6a33262616cdf8b
  Stored in directory: /t

In [3]:
import json
import os
import glob
from PIL import Image
import torch
from unsloth import FastVisionModel
from datasets import Dataset
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from huggingface_hub import login
login(
  token= # replace with your own token
)

In [5]:
model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit"
data_dir = "/content/drive/MyDrive/wanglab_quiz/data"
model_dir = "/content/drive/MyDrive/wanglab_quiz/Llama-3.2-11B-Vision"

prompt = """You are an advanced AI model specialized in analyzing chest X-ray images.
Your response must be in the following JSON format:

{
    "lung": "...",
    "heart": "...",
    "mediastinal": "...",
    "bone": "..."
}

Analyze the given chest X-ray and generate a structured report.
Describe abnormalities (if any) for:
- Lung
- Heart
- Mediastinum
- Bone

Return findings strictly in JSON format."""

In [None]:
def load_data(json_path, images_dir, split="train"):
    with open(json_path, "r") as f:
        data = json.load(f)[split]

    for item in data:
        image_folder = os.path.join(images_dir, item["id"])
        image_paths = glob.glob(os.path.join(image_folder, "*.png"))
        filtered_report = {k: v for k, v in item["report"].items() if k != "others"}
        yield {"images": image_paths, "report": filtered_report}

def pad_images(image_paths, max_images=4):
    blank_image = Image.new("RGB", (224, 224), (0, 0, 0))
    if len(image_paths) >= max_images:
        return image_paths[:max_images]
    else:
        return image_paths + [blank_image] * (max_images - len(image_paths))

def format_train_data(sample):
    padded_images = pad_images(sample["images"], max_images=4)

    return {"messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
            ] + [{"type": "image", "image": img} for img in padded_images],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["report"]}],
        },
    ],
}

train_dataset = Dataset.from_generator(lambda: load_data(
    json_path=os.path.join(data_dir, 'annotation_quiz_all.json'),
    images_dir=os.path.join(data_dir, 'images')
))
train_dataset = [format_train_data(sample) for sample in train_dataset]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
train_dataset[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': 'You are an advanced AI model specialized in analyzing chest X-ray images. \nYour response must be in the following JSON format:\n\n{\n    "lung": "...",\n    "heart": "...",\n    "mediastinal": "...",\n    "bone": "..."\n}\n\nAnalyze the given chest X-ray and generate a structured report.\nDescribe abnormalities (if any) for:\n- Lung\n- Heart\n- Mediastinum\n- Bone\n\nReturn findings strictly in JSON format.'},
    {'type': 'image',
     'image': '/content/drive/MyDrive/wanglab_quiz/data/images/CXR2384_IM-0942/1.png'},
    {'type': 'image',
     'image': '/content/drive/MyDrive/wanglab_quiz/data/images/CXR2384_IM-0942/0.png'},
    {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=224x224>},
    {'type': 'image',
     'image': <PIL.Image.Image image mode=RGB size=224x224>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': {'bone': 'Degenerative changes are present in the spin

In [None]:
model, tokenizer = FastVisionModel.from_pretrained(
    model_name,
    load_in_4bit = False,
    use_gradient_checkpointing = "unsloth",
)
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2025.2.15: Fast Mllama vision patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

Unsloth: Making `model.base_model.model.language_model` require gradients


In [None]:
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),
    train_dataset = train_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = model_dir,
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)

In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,069 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 52,428,800
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mksp7292[0m ([33myws0322[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.7899
2,2.8185
3,2.7296
4,2.4939
5,1.9723
6,1.5754
7,1.2311
8,1.007
9,0.762
10,0.6992


TrainOutput(global_step=30, training_loss=0.9127454032500585, metrics={'train_runtime': 542.7249, 'train_samples_per_second': 0.442, 'train_steps_per_second': 0.055, 'total_flos': 2723846278218264.0, 'train_loss': 0.9127454032500585})

In [None]:
def save_model(model, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved at {save_dir}")

save_model(model, os.path.join(model_dir, "final"))

Model saved at /content/drive/MyDrive/wanglab_quiz/Llama-3.2-11B-Vision/final


# Evaluation

In [None]:
!pip install -U datasets accelerate bitsandbytes

In [None]:
import os
import ast
import glob
import json
import torch
from PIL import Image
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
)
from tqdm import tqdm
from tabulate import tabulate
from collections import defaultdict
from green_score import GREEN

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
data_dir = "/content/drive/MyDrive/wanglab_quiz/data" # replace with your own data dir
model_dir = "/content/drive/MyDrive/wanglab_quiz/NoUnsloth_Qwen2-VL-7B-Instruct" # replace with your own model dir

system_message = """You are an advanced AI model specialized in analyzing chest X-ray images.
Your response **must** be in the following JSON format without any extra text:

{
    "lung": "...",
    "heart": "...",
    "mediastinal": "...",
    "bone": "..."
}
"""

prompt = """Analyze the given chest X-ray and generate a structured report.
Describe abnormalities (if any) for:
- Lung
- Heart
- Mediastinal
- Bone

Return findings strictly in JSON format as previously instructed."""

In [None]:
model, tokenizer = FastVisionModel.from_pretrained(
    os.path.join(model_dir, "final"),
    load_in_4bit = True,
    use_gradient_checkpointing = "unsloth",
)

==((====))==  Unsloth 2025.2.15: Fast Mllama vision patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def load_data(json_path, images_dir, split="train"):
    with open(json_path, "r") as f:
        data = json.load(f)[split]

    for item in data:
        image_folder = os.path.join(images_dir, item["id"])
        image_paths = glob.glob(os.path.join(image_folder, "*.png"))
        filtered_report = {k: v for k, v in item["report"].items() if k != "others"}
        yield {"images": image_paths, "report": filtered_report}

def pad_images(image_paths, max_images=4):
    blank_image = Image.new("RGB", (224, 224), (0, 0, 0))
    if len(image_paths) >= max_images:
        return image_paths[:max_images]
    else:
        return image_paths + [blank_image] * (max_images - len(image_paths))

def format_test_data(sample):
    padded_images = pad_images(sample["images"], max_images=4)

    return  {"messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
            ] + [{"type": "image", "image": img} for img in padded_images],
        },
    ],
}

## Testing Set

In [None]:
test_dataset = Dataset.from_generator(lambda: load_data(
    json_path=os.path.join(data_dir, 'annotation_quiz_all.json'),
    images_dir=os.path.join(data_dir, 'images'),
    split="test"
))

test_dataset_message = [format_test_data(sample) for sample in test_dataset]

In [None]:
FastVisionModel.for_inference(model)

generated_reports = []
ground_truths = []
processed_count = 0

with tqdm(total=len(test_dataset_message), desc="Processing Samples", unit="sample") as pbar:
    for idx, sample in enumerate(test_dataset_message):
        try:
            text = tokenizer.apply_chat_template(sample["messages"], tokenize=False, add_generation_prompt=True)

            image_inputs = []
            for content_item in sample["messages"][0]["content"]:
                if content_item.get("type") == "image":
                    image_path = content_item.get("image")
                    if isinstance(image_path, str):
                        image = Image.open(image_path).convert("RGB")
                    else:
                        image = image_path
                    image_inputs.append(image)

            inputs = tokenizer(text=[text], images=[image_inputs], padding=True, return_tensors="pt").to(model.device)

            generated_ids = model.generate(**inputs, max_new_tokens=256, top_p=1.0, do_sample=True, temperature=0.8)
            generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

            decoded_texts = tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=True)
            generated_text = decoded_texts[0] if decoded_texts else ""

            if isinstance(generated_text, dict):
                generated_report = generated_text
            else:
                try:
                    generated_report = ast.literal_eval(generated_text)
                except (SyntaxError, ValueError):
                    print(f"\nSample {idx} Warning: Failed to parse generated text as dict. Using fallback method.")

                    generated_report = {}
                    for line in generated_text.split("\n"):
                        if ": " in line:
                            key, value = line.split(": ", 1)
                        else:
                            key, value = line, ""
                        generated_report[key] = value

            ground_truth = test_dataset[idx]["report"]

            for region in ["lung", "heart", "mediastinal", "bone"]:
                generated_report.setdefault(region, "")
                ground_truth.setdefault(region, "")

            ground_truths.append(ground_truth)
            generated_reports.append(generated_report)

            processed_count += 1
            pbar.update(1)

        except (IndexError, ValueError, KeyError, FileNotFoundError, SyntaxError) as e:
            print(f"\nError processing sample {idx}: {e}")
            print(f"Processed {processed_count}/{len(test_dataset_message)} samples so far.")
            pbar.update(1)

Processing Samples:   9%|▉         | 55/590 [14:24<2:16:36, 15.32s/sample]




Processing Samples:  42%|████▏     | 249/590 [1:04:47<1:32:00, 16.19s/sample]




Processing Samples:  53%|█████▎    | 315/590 [1:22:02<1:34:49, 20.69s/sample]




Processing Samples:  57%|█████▋    | 335/590 [1:27:35<1:21:11, 19.10s/sample]




Processing Samples:  68%|██████▊   | 403/590 [1:45:37<45:18, 14.54s/sample]




Processing Samples:  75%|███████▌  | 445/590 [1:56:34<43:40, 18.07s/sample]




Processing Samples:  94%|█████████▍| 557/590 [2:25:20<08:19, 15.15s/sample]




Processing Samples:  95%|█████████▌| 563/590 [2:26:55<07:36, 16.89s/sample]




Processing Samples: 100%|██████████| 590/590 [2:34:03<00:00, 15.67s/sample]


In [None]:
generated_reports_file = os.path.join(data_dir, "generated_reports.json")
ground_truths_file = os.path.join(data_dir, "ground_truths.json")

with open(generated_reports_file, "w", encoding="utf-8") as f:
    json.dump(generated_reports, f, ensure_ascii=False, indent=4)

with open(ground_truths_file, "w", encoding="utf-8") as f:
    json.dump(ground_truths, f, ensure_ascii=False, indent=4)

In [None]:
generated_reports_file = os.path.join(data_dir, "generated_reports.json")
ground_truths_file = os.path.join(data_dir, "ground_truths.json")

with open(generated_reports_file, "r", encoding="utf-8") as f:
    generated_reports = json.load(f)

with open(ground_truths_file, "r", encoding="utf-8") as f:
    ground_truths = json.load(f)

print(generated_reports)
print(ground_truths)

[{'bone': 'Degenerative changes of the thoracic spine.', 'heart': 'Normal cardiac silhouette.', 'lung': 'No focal areas of consolidation, effusion, or pneumothorax. No evidence of active pulmonary disease. No pleural plaques or pleural calcifications.', 'mediastinal': 'Normal thymic contour.'}, {'bone': 'Bony structures are intact.', 'heart': 'Heart is normal in size.', 'lung': 'There is no focal consolidation, effusion, or pleural effusion. No pneumothorax. No definite granulomatosis in the lungs. Normal thoracic aorta.', 'mediastinal': 'Medistinal contours are normal.'}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax. No pleural effusion. No focal airspace consolidations. No large mediastinal adenopathy, although there is a small right paratracheal lymph node.', 'mediastinal': 'Aortogram is within normal limits.'}, {'bone': '', 'heart': 'Heart is normal.', 'lung': 'Bilateral nodular densities in the lung bases. No evidence of pneumothorax or 

In [None]:
green_scorer = GREEN("StanfordAIMI/GREEN-RadPhi2", output_dir=".")

config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenization_chexagent.py:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

In [None]:
region_results = {}
all_empty = True
for region in ["lung", "heart", "mediastinal", "bone"]:
    gt_region = [gt[region] for gt in ground_truths]
    gen_region = [gen[region] for gen in generated_reports]

    if gt_region and gen_region:
        all_empty = False
        mean, std, green_score_list, summary, result_df = green_scorer(gt_region, gen_region)
        region_results[region] = {
            "mean": mean,
            "std": std,
            "scores": green_score_list,
            "summary": summary,
            "result_df": result_df
        }

        print(f"\n### GREEN Score Summary for {region.capitalize()} ###")
        print(summary)
        print(f"\n### Detailed GREEN Scores for {region.capitalize()} ###")
        print(tabulate(result_df.head(10), headers="keys", tablefmt="grid"))

if all_empty:
    print("Skipping GREEN scoring as no valid regions were found.")

Processing data...making prompts


Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


74it [24:44, 20.07s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.5833457724522737

### GREEN Score Summary for Lung ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.5068563357546408 and standard deviation 0.35999898385988155 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.5610169491525424. 
  No pneumothorax 

(b) Missing a finding present in the reference: 0.48983050847457626. 
  Absence of pleural effusion 

(c) Misidentification of a finding's anatomic location/position: 0.9864406779661017. 
  Disease at the right upper lobe and the left upper lobe. 

(d) Misassessment of the severity of a finding: 0.988135593220339. 
  The candidate report states that the lungs are hyperexpanded and hyperinflated, which is a more severe assessment than the "relatively clear" statement in the reference report. 

(e) Mentioning a comparison that isn't in the reference: 0.9847457627118644. 
  Mention of "Lung volumes are normal" 

(f)

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


74it [20:54, 16.95s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.1483604318004543

### GREEN Score Summary for Heart ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.6943502824858756 and standard deviation 0.4218168729306563 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9169491525423729. 
  The candidate report falsely states that the heart size is mildly enlarged. 

(b) Missing a finding present in the reference: 0.7915254237288135. 
  The candidate report does not mention the cardiac contours. 

(c) Misidentification of a finding's anatomic location/position: 0.9966101694915255. 
  The candidate report specifies the heart size, while the reference report refers to the overall size of an object. 

(d) Misassessment of the severity of a finding: 0.9389830508474576. 
  The candidate report incorrectly states that the heart size is normal, while the reference report indicates that the heart size is at the upper limits 

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


74it [21:20, 17.30s/it]


==== End Inference ====
Computing summary ...
Seconds per example:  2.1971334020970232

### GREEN Score Summary for Mediastinal ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.44662631154156573 and standard deviation 0.47872950451392166 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.8305084745762712. 
  Mediastinal contours normal 

(b) Missing a finding present in the reference: 0.7593220338983051. 
  The candidate report does not mention the mediastinal contours. 

(c) Misidentification of a finding's anatomic location/position: 0.9966101694915255. 
  The candidate report mentions a "Mediastinal silhouette" instead of a "Cardiopulmonary silhouette". 

(d) Misassessment of the severity of a finding: 0.9932203389830508. 
  The candidate report misassesses the mediastinal contours as normal instead of stable. 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 Non

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


  0%|          | 0/73 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 88.00 MiB. GPU 0 has a total capacity of 22.16 GiB of which 27.38 MiB is free. Process 10775 has 22.13 GiB memory in use. Of the allocated memory 21.74 GiB is allocated by PyTorch, and 135.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# region_results = {}
# all_empty = True
# for region in ["bone"]:
#     gt_region = [gt[region] for gt in ground_truths]
#     gen_region = [gen[region] for gen in generated_reports]

#     if gt_region and gen_region:
#         all_empty = False
#         mean, std, green_score_list, summary, result_df = green_scorer(gt_region, gen_region)
#         region_results[region] = {
#             "mean": mean,
#             "std": std,
#             "scores": green_score_list,
#             "summary": summary,
#             "result_df": result_df
#         }

#         print(f"\n### GREEN Score Summary for {region.capitalize()} ###")
#         print(summary)
#         print(f"\n### Detailed GREEN Scores for {region.capitalize()} ###")
#         print(tabulate(result_df.head(10), headers="keys", tablefmt="grid"))

# if all_empty:
#     print("Skipping GREEN scoring as no valid regions were found.")

Processing data...making prompts


Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


74it [20:40, 16.76s/it]


==== End Inference ====
Computing summary ...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.1405938892041223

### GREEN Score Summary for Bone ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.06714689265536723 and standard deviation 0.22595529590684124 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.8491525423728814. 
  Degenerative changes of the thoracic spine 

(b) Missing a finding present in the reference: 0.823728813559322. 
  The candidate report is missing. 

(c) Misidentification of a finding's anatomic location/position: 0.9915254237288136. 
  Degenerative changes were reported in the thoracic spine instead of the spine in general. 

(d) Misassessment of the severity of a finding: 0.9864406779661017. 
  The candidate report describes the degenerative changes in the thoracic spine as "mild", which is not specified in the reference report. 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 None 

(f) Omitting a comparison 

## Validation Set

In [None]:
val_dataset = []

with open("validation_dataset.json", "r") as f:
    val_data = json.load(f)

for item in val_data:
    image_folder = os.path.join(data_dir, "images", item["id"])
    image_paths = glob.glob(os.path.join(image_folder, "*.png"))
    filtered_report = {k: v for k, v in item["classification"].items() if k != "others"}

    if image_paths:
        val_dataset.append({"images": image_paths, "report": filtered_report})

val_dataset_message = [format_test_data(sample) for sample in val_dataset]

In [None]:
FastVisionModel.for_inference(model)

generated_reports = []
ground_truths = []
processed_count = 0

with tqdm(total=len(val_dataset_message), desc="Processing Samples", unit="sample") as pbar:
    for idx, sample in enumerate(val_dataset_message):
        try:
            text = tokenizer.apply_chat_template(sample["messages"], tokenize=False, add_generation_prompt=True)

            image_inputs = []
            for content_item in sample["messages"][0]["content"]:
                if content_item.get("type") == "image":
                    image_path = content_item.get("image")
                    if isinstance(image_path, str):
                        image = Image.open(image_path).convert("RGB")
                    else:
                        image = image_path
                    image_inputs.append(image)

            inputs = tokenizer(text=[text], images=[image_inputs], padding=True, return_tensors="pt").to(model.device)

            generated_ids = model.generate(**inputs, max_new_tokens=256, top_p=1.0, do_sample=True, temperature=0.8)
            generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

            decoded_texts = tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=True)
            generated_text = decoded_texts[0] if decoded_texts else ""

            if isinstance(generated_text, dict):
                generated_report = generated_text
            else:
                try:
                    generated_report = ast.literal_eval(generated_text)
                except (SyntaxError, ValueError):
                    print(f"\nSample {idx} Warning: Failed to parse generated text as dict. Using fallback method.")
                    generated_report = {}
                    for line in generated_text.split("\n"):
                        if ": " in line:
                            key, value = line.split(": ", 1)
                        else:
                            key, value = line, ""
                        generated_report[key] = value

            ground_truth = val_dataset[idx]["report"]

            for region in ["lung", "heart", "mediastinal", "bone"]:
                generated_report.setdefault(region, "")
                ground_truth.setdefault(region, "")

            ground_truths.append(ground_truth)
            generated_reports.append(generated_report)

            processed_count += 1
            pbar.update(1)

        except (IndexError, ValueError, KeyError, FileNotFoundError, SyntaxError) as e:
            print(f"\nError processing sample {idx}: {e}")
            print(f"Processed {processed_count}/{len(val_dataset_message)} samples so far.")
            pbar.update(1)

Processing Samples:   2%|▏         | 7/296 [02:22<1:23:54, 17.42s/sample]




Processing Samples:  88%|████████▊ | 259/296 [1:08:53<10:05, 16.38s/sample]




Processing Samples: 100%|██████████| 296/296 [1:18:36<00:00, 15.93s/sample]


In [None]:
generated_reports_file = os.path.join(data_dir, "llama_val_generated_reports.json")
ground_truths_file = os.path.join(data_dir, "llama_val_ground_truths.json")

with open(generated_reports_file, "w", encoding="utf-8") as f:
    json.dump(generated_reports, f, ensure_ascii=False, indent=4)

with open(ground_truths_file, "w", encoding="utf-8") as f:
    json.dump(ground_truths, f, ensure_ascii=False, indent=4)

In [6]:
generated_reports_file = os.path.join(data_dir, "llama_val_generated_reports.json")
ground_truths_file = os.path.join(data_dir, "llama_val_ground_truths.json")

with open(generated_reports_file, "r", encoding="utf-8") as f:
    generated_reports = json.load(f)

with open(ground_truths_file, "r", encoding="utf-8") as f:
    ground_truths = json.load(f)

print(generated_reports)
print(ground_truths)

[{'bone': '', 'heart': 'Heart size is apical. No evidence of cardiomegaly.', 'lung': 'No focal areas of pneumothorax, no pleural effusion, and no large pleural masses. No significant lung volumes. No pneumothorax or large air collections within the lungs.', 'mediastinal': ''}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are expanded. No focal airspace disease. No pulmonary edema.', 'mediastinal': 'Mediastinum is normal.'}, {'bone': 'Negative for acute bony abnormality.', 'heart': 'Heart size within normal limits.', 'lung': 'Lungs are clear. No pneumothorax, effusion, or pneumoperitoneum.', 'mediastinal': 'Mediastinal contours are normal.'}, {'bone': '', 'heart': 'Mild cardiomegaly.', 'lung': 'Right upper lobe hyperinflation. Clear lungs. No focal areas of consolidation.', 'mediastinal': ''}, {'bone': 'Posterior rib notches, degenerative. No acute bony abnormalities.', 'heart': 'Normal sized heart.', 'lung': 'No focal consolidation. No pneumothorax or pleural effusion.

In [8]:
green_scorer = GREEN("StanfordAIMI/GREEN-RadPhi2", output_dir=".")

config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenization_chexagent.py:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

In [9]:
region_results = {}
all_empty = True
for region in ["lung", "heart", "mediastinal", "bone"]:
    gt_region = [gt[region] for gt in ground_truths]
    gen_region = [gen[region] for gen in generated_reports]

    if gt_region and gen_region:
        all_empty = False
        mean, std, green_score_list, summary, result_df = green_scorer(gt_region, gen_region)
        region_results[region] = {
            "mean": mean,
            "std": std,
            "scores": green_score_list,
            "summary": summary,
            "result_df": result_df
        }

        print(f"\n### GREEN Score Summary for {region.capitalize()} ###")
        print(summary)
        print(f"\n### Detailed GREEN Scores for {region.capitalize()} ###")
        print(tabulate(result_df.head(10), headers="keys", tablefmt="grid"))

if all_empty:
    print("Skipping GREEN scoring as no valid regions were found.")

Processing data...making prompts


Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [16:21<00:00, 26.53s/it]


==== End Inference ====
Computing summary ...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Seconds per example:  3.368575808164236

### GREEN Score Summary for Lung ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.49378753753753757 and standard deviation 0.35649000614323345 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.5945945945945946. 
  Hyperexpanded lungs 

(b) Missing a finding present in the reference: 0.38175675675675674. 
  Absence of pneumothorax 

(c) Misidentification of a finding's anatomic location/position: 0.9966216216216216. 
  Right middle lobe is mildly hyperexpanded. 

(d) Misassessment of the severity of a finding: 0.9797297297297297. 
  Candidate report states lungs are clear, contradicting the reference report's finding of a calcified granuloma. 

(e) Mentioning a comparison that isn't in the reference: 0.9797297297297297. 
  No large pleural effusion. 

(f) Omitting a comparison detailing a change from a prior study: 1.0. 
 None 

------------

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [13:11<00:00, 21.40s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.694492771818831

### GREEN Score Summary for Heart ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.5101351351351351 and standard deviation 0.4266710441741899 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.8817567567567568. 
  The candidate report falsely reports that the heart size is mildly enlarged. 

(b) Missing a finding present in the reference: 0.6587837837837838. 
  The candidate report does not mention the cardiomediastinal silhouette. 

(c) Misidentification of a finding's anatomic location/position: 0.9898648648648649. 
  The candidate report states "Heart size is normal in size" instead of "Cardiac silhouette is normal." 

(d) Misassessment of the severity of a finding: 0.9054054054054054. 
  The candidate report incorrectly states that the heart size is normal, while the reference report indicates that the heart is at the upper limits of nor

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [13:20<00:00, 21.63s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.751781072165515

### GREEN Score Summary for Mediastinal ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.3037725225225225 and standard deviation 0.4069172845704953 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.8412162162162162. 
  Normal Mediastinal contour 

(b) Missing a finding present in the reference: 0.5878378378378378. 
  The candidate report does not mention the cardiomediastinal silhouette. 

(c) Misidentification of a finding's anatomic location/position: 0.9932432432432432. 
  Mediastinal contour is abnormal. 

(d) Misassessment of the severity of a finding: 0.9864864864864865. 
  The candidate report misassesses the severity of the mediastinal contours by stating they are at the upper limits of normal, while the reference report states they are stable. 

(e) Mentioning a comparison that isn't in the reference: 0.9966216216216216. 
  The can

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [13:46<00:00, 22.35s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.846199076723408

### GREEN Score Summary for Bone ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.07911036036036036 and standard deviation 0.2467050064205492 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.8547297297297297. 
  Degenerative changes of the thoracic spine 

(b) Missing a finding present in the reference: 0.777027027027027. 
  The candidate report is missing. 

(c) Misidentification of a finding's anatomic location/position: 0.9797297297297297. 
  Mild degenerative changes of the thoracic spine instead of the thoracic spine. 

(d) Misassessment of the severity of a finding: 0.9797297297297297. 
  The candidate report describes the degenerative changes as "mild", which is not mentioned in the reference report. 

(e) Mentioning a comparison that isn't in the reference: 0.9966216216216216. 
  The candidate report mentions "No acute bony abnorma