# Finetuning

In [2]:
!pip install unsloth datasets trl
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Collecting unsloth
  Downloading unsloth-2025.2.15-py3-none-any.whl.metadata (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting unsloth_zoo>=2025.2.7 (from unsloth)
  Downloading unsloth_zoo-2025.2.7-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.16-py3-none-any.whl.metadata (9.4 kB)
Collecting protobuf<4.0.0 (from unsloth)
 

Found existing installation: unsloth 2025.2.15
Uninstalling unsloth-2025.2.15:
  Successfully uninstalled unsloth-2025.2.15
Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-6ib03y2b
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-6ib03y2b
  Resolved https://github.com/unslothai/unsloth.git to commit 3a0d3d58d2c31d9d04bab2d712b59bb75a1e2e3b
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.3.1-py3-none-any.whl size=189226 sha256=4104391bea18afb92811b044ae58357a57a155a2ece8d0614c0aa874bde4e40f
  Stored in directory: /tmp/pip-ephem-wheel-cache-xs1phoiu/wheels/d1/

In [3]:
import json
import os
import glob
from PIL import Image
import torch
from unsloth import FastVisionModel
from datasets import Dataset
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from huggingface_hub import login
login(
  token= # replace with your own token
)

In [5]:
model_name = "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit"
data_dir = "/content/drive/MyDrive/wanglab_quiz/data"
model_dir = "/content/drive/MyDrive/wanglab_quiz/Qwen2-VL-7B-Instruct"

prompt = """You are an advanced AI model specialized in analyzing chest X-ray images.
Your response must be in the following JSON format:

{
    "lung": "...",
    "heart": "...",
    "mediastinal": "...",
    "bone": "..."
}

Analyze the given chest X-ray and generate a structured report.
Describe abnormalities (if any) for:
- Lung
- Heart
- Mediastinum
- Bone

Return findings strictly in JSON format."""

In [None]:
def load_data(json_path, images_dir, split="train"):
    with open(json_path, "r") as f:
        data = json.load(f)[split]

    for item in data:
        image_folder = os.path.join(images_dir, item["id"])
        image_paths = glob.glob(os.path.join(image_folder, "*.png"))
        filtered_report = {k: v for k, v in item["report"].items() if k != "others"}
        yield {"images": image_paths, "report": filtered_report}

def pad_images(image_paths, max_images=4):
    blank_image = Image.new("RGB", (224, 224), (0, 0, 0))
    if len(image_paths) >= max_images:
        return image_paths[:max_images]
    else:
        return image_paths + [blank_image] * (max_images - len(image_paths))

def format_train_data(sample):
    padded_images = pad_images(sample["images"], max_images=4)

    return {"messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
            ] + [{"type": "image", "image": img} for img in padded_images],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["report"]}],
        },
    ],
}

train_dataset = Dataset.from_generator(lambda: load_data(
    json_path=os.path.join(data_dir, 'annotation_quiz_all.json'),
    images_dir=os.path.join(data_dir, 'images')
))
train_dataset = [format_train_data(sample) for sample in train_dataset]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
train_dataset[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': 'You are an advanced AI model specialized in analyzing chest X-ray images.\nYour response must be in the following JSON format:\n\n{\n    "lung": "...",\n    "heart": "...",\n    "mediastinal": "...",\n    "bone": "..."\n}\n\nAnalyze the given chest X-ray and generate a structured report.\nDescribe abnormalities (if any) for:\n- Lung\n- Heart\n- Mediastinum\n- Bone\n\nReturn findings strictly in JSON format.'},
    {'type': 'image',
     'image': '/content/drive/MyDrive/wanglab_quiz/data/images/CXR2384_IM-0942/1.png'},
    {'type': 'image',
     'image': '/content/drive/MyDrive/wanglab_quiz/data/images/CXR2384_IM-0942/0.png'},
    {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=224x224>},
    {'type': 'image',
     'image': <PIL.Image.Image image mode=RGB size=224x224>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': {'bone': 'Degenerative changes are present in the spine

In [None]:
model, tokenizer = FastVisionModel.from_pretrained(
    model_name,
    load_in_4bit = False,
    use_gradient_checkpointing = "unsloth",
)
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2025.3.1: Fast Qwen2_Vl vision patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Unsloth: Making `model.base_model.model.model` require gradients


In [None]:
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),
    train_dataset = train_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = model_dir,
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)

In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,069 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 40,370,176
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
1,0.3024
2,0.2793
3,0.3942
4,0.2659
5,0.3446
6,0.2688
7,0.2428
8,0.3419
9,0.3008
10,0.3684


TrainOutput(global_step=30, training_loss=0.327088426053524, metrics={'train_runtime': 236.9847, 'train_samples_per_second': 1.013, 'train_steps_per_second': 0.127, 'total_flos': 1.1766181767493632e+16, 'train_loss': 0.327088426053524})

In [None]:
def save_model(model, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved at {save_dir}")

save_model(model, os.path.join(model_dir, "final"))

Model saved at /content/drive/MyDrive/wanglab_quiz/Qwen2-VL-7B-Instruct/final


# Evaluation

In [None]:
!pip install -U datasets accelerate bitsandbytes

In [None]:
import os
import ast
import glob
import json
import torch
from PIL import Image
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
)
from tqdm import tqdm
from tabulate import tabulate
from collections import defaultdict
from green_score import GREEN

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
data_dir = "/content/drive/MyDrive/wanglab_quiz/data" # replace with your own data dir
model_dir = "/content/drive/MyDrive/wanglab_quiz/NoUnsloth_Qwen2-VL-7B-Instruct" # replace with your own model dir

system_message = """You are an advanced AI model specialized in analyzing chest X-ray images.
Your response **must** be in the following JSON format without any extra text:

{
    "lung": "...",
    "heart": "...",
    "mediastinal": "...",
    "bone": "..."
}
"""

prompt = """Analyze the given chest X-ray and generate a structured report.
Describe abnormalities (if any) for:
- Lung
- Heart
- Mediastinal
- Bone

Return findings strictly in JSON format as previously instructed."""

In [8]:
model, tokenizer = FastVisionModel.from_pretrained(
    os.path.join(model_dir, "final"),
    load_in_4bit = True,
    use_gradient_checkpointing = "unsloth",
)

==((====))==  Unsloth 2025.3.1: Fast Qwen2_Vl vision patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

In [10]:
def load_data(json_path, images_dir, split="train"):
    with open(json_path, "r") as f:
        data = json.load(f)[split]

    for item in data:
        image_folder = os.path.join(images_dir, item["id"])
        image_paths = glob.glob(os.path.join(image_folder, "*.png"))
        filtered_report = {k: v for k, v in item["report"].items() if k != "others"}
        yield {"images": image_paths, "report": filtered_report}

def pad_images(image_paths, max_images=4):
    blank_image = Image.new("RGB", (224, 224), (0, 0, 0))
    if len(image_paths) >= max_images:
        return image_paths[:max_images]
    else:
        return image_paths + [blank_image] * (max_images - len(image_paths))

def format_test_data(sample):
    padded_images = pad_images(sample["images"], max_images=4)

    return  {"messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
            ] + [{"type": "image", "image": img} for img in padded_images],
        },
    ],
}

## Testing Set

In [None]:
test_dataset = Dataset.from_generator(lambda: load_data(
    json_path=os.path.join(data_dir, 'annotation_quiz_all.json'),
    images_dir=os.path.join(data_dir, 'images'),
    split="test"
))

test_dataset_message = [format_test_data(sample) for sample in test_dataset]

In [None]:
FastVisionModel.for_inference(model)

generated_reports = []
ground_truths = []
processed_count = 0

with tqdm(total=len(test_dataset_message), desc="Processing Samples", unit="sample") as pbar:
    for idx, sample in enumerate(test_dataset_message):
        try:
            text = tokenizer.apply_chat_template(sample["messages"], tokenize=False, add_generation_prompt=True)

            image_inputs = []
            for content_item in sample["messages"][0]["content"]:
                if content_item.get("type") == "image":
                    image_path = content_item.get("image")
                    if isinstance(image_path, str):
                        image = Image.open(image_path).convert("RGB")
                    else:
                        image = image_path
                    image_inputs.append(image)

            inputs = tokenizer(text=[text], images=[image_inputs], padding=True, return_tensors="pt").to(model.device)

            generated_ids = model.generate(**inputs, max_new_tokens=256, top_p=1.0, do_sample=True, temperature=0.8)
            generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

            decoded_texts = tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=True)
            generated_text = decoded_texts[0] if decoded_texts else ""

            if isinstance(generated_text, dict):
                generated_report = generated_text
            else:
                try:
                    generated_report = ast.literal_eval(generated_text)
                except (SyntaxError, ValueError):
                    print(f"\nSample {idx} Warning: Failed to parse generated text as dict. Using fallback method.")
                    generated_report = {}
                    for line in generated_text.split("\n"):
                        if ": " in line:
                            key, value = line.split(": ", 1)
                        else:
                            key, value = line, ""
                        generated_report[key] = value

            ground_truth = test_dataset[idx]["report"]

            for region in ["lung", "heart", "mediastinal", "bone"]:
                generated_report.setdefault(region, "")
                ground_truth.setdefault(region, "")

            ground_truths.append(ground_truth)
            generated_reports.append(generated_report)

            processed_count += 1
            pbar.update(1)

        except (IndexError, ValueError, KeyError, FileNotFoundError, SyntaxError) as e:
            print(f"\nError processing sample {idx}: {e}")
            print(f"Processed {processed_count}/{len(test_dataset_message)} samples so far.")
            pbar.update(1)

Processing Samples: 100%|██████████| 590/590 [1:50:15<00:00, 11.21s/sample]


In [None]:
generated_reports_file = os.path.join(data_dir, "generated_reports.json")
ground_truths_file = os.path.join(data_dir, "ground_truths.json")

with open(generated_reports_file, "w", encoding="utf-8") as f:
    json.dump(generated_reports, f, ensure_ascii=False, indent=4)

with open(ground_truths_file, "w", encoding="utf-8") as f:
    json.dump(ground_truths, f, ensure_ascii=False, indent=4)

In [None]:
generated_reports_file = os.path.join(data_dir, "generated_reports.json")
ground_truths_file = os.path.join(data_dir, "ground_truths.json")

with open(generated_reports_file, "r", encoding="utf-8") as f:
    generated_reports = json.load(f)

with open(ground_truths_file, "r", encoding="utf-8") as f:
    ground_truths = json.load(f)

print(generated_reports)
print(ground_truths)

[{'bone': '', 'heart': 'Heart size is within normal limits.', 'lung': 'Lungs are clear. No pneumothorax or pleural effusion.', 'mediastinal': 'Mediastinum is within normal limits.'}, {'bone': 'No acute bony abnormality.', 'heart': 'Heart size is within normal limits.', 'lung': 'No focal airspace consolidation or pneumothorax. No pleural effusion.', 'mediastinal': 'Mediastinal contours are within normal limits.'}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax or pleural effusion.', 'mediastinal': 'Mediastinum is normal.'}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax or pleural effusion.', 'mediastinal': 'Mediastinum is normal.'}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax or pleural effusion.', 'mediastinal': 'Mediastinum is normal.'}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax or pleural effusion.', 'mediasti

In [None]:
green_scorer = GREEN("StanfordAIMI/GREEN-RadPhi2", output_dir=".")

config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenization_chexagent.py:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

In [None]:
region_results = {}
all_empty = True
for region in ["lung", "heart", "mediastinal", "bone"]:
    gt_region = [gt[region] for gt in ground_truths]
    gen_region = [gen[region] for gen in generated_reports]

    if gt_region and gen_region:
        all_empty = False
        mean, std, green_score_list, summary, result_df = green_scorer(gt_region, gen_region)
        region_results[region] = {
            "mean": mean,
            "std": std,
            "scores": green_score_list,
            "summary": summary,
            "result_df": result_df
        }

        print(f"\n### GREEN Score Summary for {region.capitalize()} ###")
        print(summary)
        print(f"\n### Detailed GREEN Scores for {region.capitalize()} ###")
        print(tabulate(result_df.head(10), headers="keys", tablefmt="grid"))

if all_empty:
    print("Skipping GREEN scoring as no valid regions were found.")

Processing data...making prompts


Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


74it [28:41, 23.27s/it]


==== End Inference ====
Computing summary ...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.960715040109925

### GREEN Score Summary for Lung ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.6327441485068604 and standard deviation 0.26678881871641186 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.8271186440677966. 
  No pleural effusion. 

(b) Missing a finding present in the reference: 0.35423728813559324. 
  Absence of focal airspace disease. 

(c) Misidentification of a finding's anatomic location/position: 0.9983050847457627. 
  The candidate report omits the "bilaterally" when describing the clear lungs. 

(d) Misassessment of the severity of a finding: 0.9677966101694915. 
  Absence of effusions. 

(e) Mentioning a comparison that isn't in the reference: 0.9949152542372881. 
  The candidate report mentions the absence of pneumothorax or pleural effusion, which is not mentioned in the reference report. 

(f) Omitting a comparison detailing

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


74it [27:05, 21.96s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.770130224955284

### GREEN Score Summary for Heart ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.8161016949152542 and standard deviation 0.35206501525837336 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.976271186440678. 
  The candidate report falsely reports that the heart size is normal. 

(b) Missing a finding present in the reference: 0.8372881355932204. 
  The candidate report does not mention the contour of the heart. 

(c) Misidentification of a finding's anatomic location/position: 0.9983050847457627. 
  The candidate report mentions "heart size" instead of "size". 

(d) Misassessment of the severity of a finding: 0.9372881355932203. 
  The candidate report incorrectly states that the heart size is normal, while the reference report indicates that the heart size is at the upper limits of normal. 

(e) Mentioning a comparison that isn't in the

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


74it [27:08, 22.01s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.774895415063632

### GREEN Score Summary for Mediastinal ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.5785310734463277 and standard deviation 0.4714549628449428 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9169491525423729. 
  Mediastinum is normal 

(b) Missing a finding present in the reference: 0.7932203389830509. 
  Pulmonary vascularity appears within normal limits. 

(c) Misidentification of a finding's anatomic location/position: 0.9949152542372881. 
  The candidate report specifies the mediastinum, which is not mentioned in the reference report. 

(d) Misassessment of the severity of a finding: 0.9966101694915255. 
  The candidate report misassessed the mediastinum as normal instead of unchanged. 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 None 

(f) Omitting a comparison detailing a change from a prior study: 0.996610

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


 40%|███▉      | 29/73 [09:09<13:41, 18.67s/it]

In [None]:
# region_results = {}
# all_empty = True
# for region in ["bone"]:
#     gt_region = [gt[region] for gt in ground_truths]
#     gen_region = [gen[region] for gen in generated_reports]

#     if gt_region and gen_region:
#         all_empty = False
#         mean, std, green_score_list, summary, result_df = green_scorer(gt_region, gen_region)
#         region_results[region] = {
#             "mean": mean,
#             "std": std,
#             "scores": green_score_list,
#             "summary": summary,
#             "result_df": result_df
#         }

#         print(f"\n### GREEN Score Summary for {region.capitalize()} ###")
#         print(summary)
#         print(f"\n### Detailed GREEN Scores for {region.capitalize()} ###")
#         print(tabulate(result_df.head(10), headers="keys", tablefmt="grid"))

# if all_empty:
#     print("Skipping GREEN scoring as no valid regions were found.")

Processing data...making prompts


Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


74it [23:10, 18.79s/it]


==== End Inference ====
Computing summary ...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.395488776190806

### GREEN Score Summary for Bone ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.10028248587570622 and standard deviation 0.29555704891940654 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9932203389830508. 
  No acute bony abnormality. 

(b) Missing a finding present in the reference: 0.8135593220338984. 
  The candidate report is missing. 

(c) Misidentification of a finding's anatomic location/position: 0.9949152542372881. 
  The candidate report incorrectly identifies the location of the metallic implant as the thoracic spine instead of the right humerus. 

(d) Misassessment of the severity of a finding: 0.9966101694915255. 
  The candidate report does not mention the absence of fractures. 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 None 

(f) Omitting a comparison detailing a change from a prior study: 0.99830

## Validation Set

In [11]:
val_dataset = []

with open("validation_dataset.json", "r") as f:
    val_data = json.load(f)

for item in val_data:
    image_folder = os.path.join(data_dir, "images", item["id"])
    image_paths = glob.glob(os.path.join(image_folder, "*.png"))
    filtered_report = {k: v for k, v in item["classification"].items() if k != "others"}

    if image_paths:
        val_dataset.append({"images": image_paths, "report": filtered_report})

val_dataset_message = [format_test_data(sample) for sample in val_dataset]

In [18]:
FastVisionModel.for_inference(model)

generated_reports = []
ground_truths = []
processed_count = 0

with tqdm(total=len(val_dataset_message), desc="Processing Samples", unit="sample") as pbar:
    for idx, sample in enumerate(val_dataset_message):
        try:
            text = tokenizer.apply_chat_template(sample["messages"], tokenize=False, add_generation_prompt=True)

            image_inputs = []
            for content_item in sample["messages"][0]["content"]:
                if content_item.get("type") == "image":
                    image_path = content_item.get("image")
                    if isinstance(image_path, str):
                        image = Image.open(image_path).convert("RGB")
                    else:
                        image = image_path
                    image_inputs.append(image)

            inputs = tokenizer(text=[text], images=[image_inputs], padding=True, return_tensors="pt").to(model.device)

            generated_ids = model.generate(**inputs, max_new_tokens=256, top_p=1.0, do_sample=True, temperature=0.8)
            generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

            decoded_texts = tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=True)
            generated_text = decoded_texts[0] if decoded_texts else ""

            if isinstance(generated_text, dict):
                generated_report = generated_text
            else:
                try:
                    generated_report = ast.literal_eval(generated_text)
                except (SyntaxError, ValueError):
                    print(f"\nSample {idx} Warning: Failed to parse generated text as dict. Using fallback method.")
                    generated_report = {}
                    for line in generated_text.split("\n"):
                        if ": " in line:
                            key, value = line.split(": ", 1)
                        else:
                            key, value = line, ""
                        generated_report[key] = value

            ground_truth = val_dataset[idx]["report"]

            for region in ["lung", "heart", "mediastinal", "bone"]:
                generated_report.setdefault(region, "")
                ground_truth.setdefault(region, "")

            ground_truths.append(ground_truth)
            generated_reports.append(generated_report)

            processed_count += 1
            pbar.update(1)

        except (IndexError, ValueError, KeyError, FileNotFoundError, SyntaxError) as e:
            print(f"\nError processing sample {idx}: {e}")
            print(f"Processed {processed_count}/{len(val_dataset_message)} samples so far.")
            pbar.update(1)

Processing Samples: 100%|██████████| 296/296 [53:57<00:00, 10.94s/sample]


In [19]:
generated_reports_file = os.path.join(data_dir, "val_generated_reports.json")
ground_truths_file = os.path.join(data_dir, "val_ground_truths.json")

with open(generated_reports_file, "w", encoding="utf-8") as f:
    json.dump(generated_reports, f, ensure_ascii=False, indent=4)

with open(ground_truths_file, "w", encoding="utf-8") as f:
    json.dump(ground_truths, f, ensure_ascii=False, indent=4)

In [6]:
generated_reports_file = os.path.join(data_dir, "val_generated_reports.json")
ground_truths_file = os.path.join(data_dir, "val_ground_truths.json")

with open(generated_reports_file, "r", encoding="utf-8") as f:
    generated_reports = json.load(f)

with open(ground_truths_file, "r", encoding="utf-8") as f:
    ground_truths = json.load(f)

print(generated_reports)
print(ground_truths)

[{'bone': 'No acute bony abnormalities.', 'heart': 'Heart size is within normal limits.', 'lung': 'No focal airspace consolidation or pneumothorax. No pleural effusion.', 'mediastinal': 'Mediastinal contours are within normal limits.'}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax or pleural effusion.', 'mediastinal': 'Mediastinum is normal.'}, {'bone': 'No acute bony abnormality.', 'heart': 'Heart size is within normal limits.', 'lung': 'No focal airspace consolidation or pneumothorax. No pleural effusion.', 'mediastinal': 'Mediastinal contours are within normal limits.'}, {'bone': 'No acute bony abnormality.', 'heart': 'Heart size is within normal limits.', 'lung': 'No focal airspace consolidation or pneumothorax. No pleural effusion.', 'mediastinal': 'Mediastinal contours are within normal limits.'}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No pneumothorax or pleural effusion.', 'mediastinal': 'Mediastinum i

In [7]:
green_scorer = GREEN("StanfordAIMI/GREEN-RadPhi2", output_dir=".")

config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenization_chexagent.py:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

In [8]:
region_results = {}
all_empty = True
for region in ["lung", "heart", "mediastinal", "bone"]:
    gt_region = [gt[region] for gt in ground_truths]
    gen_region = [gen[region] for gen in generated_reports]

    if gt_region and gen_region:
        all_empty = False
        mean, std, green_score_list, summary, result_df = green_scorer(gt_region, gen_region)
        region_results[region] = {
            "mean": mean,
            "std": std,
            "scores": green_score_list,
            "summary": summary,
            "result_df": result_df
        }

        print(f"\n### GREEN Score Summary for {region.capitalize()} ###")
        print(summary)
        print(f"\n### Detailed GREEN Scores for {region.capitalize()} ###")
        print(tabulate(result_df.head(10), headers="keys", tablefmt="grid"))

if all_empty:
    print("Skipping GREEN scoring as no valid regions were found.")

Processing data...making prompts


Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [14:58<00:00, 24.29s/it]


==== End Inference ====
Computing summary ...




config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  3.189009539984368

### GREEN Score Summary for Lung ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.591618404118404 and standard deviation 0.27709875526377714 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.8445945945945946. 
  The candidate report falsely states that the lungs are clear. 

(b) Missing a finding present in the reference: 0.23648648648648649. 
  Low lung volumes. 

(c) Misidentification of a finding's anatomic location/position: 1.0. 
 None 

(d) Misassessment of the severity of a finding: 0.9628378378378378. 
  The candidate report does not mention "focal" in the description of the lungs. 

(e) Mentioning a comparison that isn't in the reference: 0.9898648648648649. 
  No pneumothorax or pleural effusion 

(f) Omitting a comparison detailing a change from a prior study: 0.9966216216216216. 
  The candidate report omitted the comparison tha

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [13:13<00:00, 21.46s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.735852731240762

### GREEN Score Summary for Heart ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.5605292792792792 and standard deviation 0.4157319035860013 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9797297297297297. 
  The candidate report falsely reports that the heart size is normal. 

(b) Missing a finding present in the reference: 0.597972972972973. 
  The candidate report does not mention the contour of the cardiomediastinal silhouette. 

(c) Misidentification of a finding's anatomic location/position: 0.9966216216216216. 
  The candidate report mentions "Heart size is normal" instead of "Heart XXXX normal". 

(d) Misassessment of the severity of a finding: 0.8783783783783784. 
  The candidate report incorrectly states that the heart size is normal, while the reference report indicates mild cardiomegaly. 

(e) Mentioning a comparison that is

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [13:21<00:00, 21.65s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.787163310759776

### GREEN Score Summary for Mediastinal ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.3973536036036036 and standard deviation 0.420669441329407 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9054054054054054. 
  Mediastinal contours are within normal limits. 

(b) Missing a finding present in the reference: 0.5675675675675675. 
  The candidate report does not mention the cardiomediastinal silhouette. 

(c) Misidentification of a finding's anatomic location/position: 0.9932432432432432. 
  The candidate report mentions "Mediastinal contours" instead of "Aortic contours". 

(d) Misassessment of the severity of a finding: 0.9797297297297297. 
  The candidate report misassessed the mediastinum as normal instead of stable. 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 None 

(f) Omitting a comparison detailing a change 

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [12:05<00:00, 19.61s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.506884795588416

### GREEN Score Summary for Bone ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.12274774774774773 and standard deviation 0.31915892569734705 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9932432432432432. 
  No acute bony abnormalities. 

(b) Missing a finding present in the reference: 0.7297297297297297. 
  The candidate report is missing. 

(c) Misidentification of a finding's anatomic location/position: 1.0. 
 None 

(d) Misassessment of the severity of a finding: 1.0. 
 None 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 None 

(f) Omitting a comparison detailing a change from a prior study: 1.0. 
 None 

----------------------------------


### Detailed GREEN Scores for Bone ###
+----+--------------------------------------------------------------------------------------------------------------------------------