# Finetuning

In [1]:
!pip install datasets qwen_vl_utils
!pip install -U accelerate bitsandbytes

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting qwen_vl_utils
  Downloading qwen_vl_utils-0.0.10-py3-none-any.whl.metadata (6.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting av (from qwen_vl_utils)
  Downloading av-14.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading qwen_vl_utils-0.0.10-py3-none-any.whl (6.7 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import os
import glob
import json
import torch
from PIL import Image
from transformers import (
    Qwen2VLProcessor,
    AutoModelForVision2Seq,
    AutoProcessor,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from datasets import Dataset
from peft import get_peft_model, LoraConfig
from qwen_vl_utils import process_vision_info

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [5]:
from huggingface_hub import login
login(
  token= # replace with your own token
)

In [6]:
model_name = "Qwen/Qwen2-VL-7B-Instruct"
data_dir = "/content/drive/MyDrive/wanglab_quiz/data" # replace with your own data dir
model_dir = "/content/drive/MyDrive/wanglab_quiz/NoUnsloth_Qwen2-VL-7B-Instruct" # replace with your own model dir

system_message = """You are an advanced AI model specialized in analyzing chest X-ray images.
Your response **must** be in the following JSON format without any extra text:

{
    "lung": "...",
    "heart": "...",
    "mediastinal": "...",
    "bone": "..."
}
"""

prompt = """Analyze the given chest X-ray and generate a structured report.
Describe abnormalities (if any) for:
- Lung
- Heart
- Mediastinum
- Bone

Return findings strictly in JSON format as previously instructed."""

In [6]:
def load_data(json_path, images_dir):
    with open(json_path, 'r') as f:
        data = json.load(f)['train']

    samples = []
    for item in data:
        filtered_report = {k: v for k, v in item["report"].items() if k != "others"}
        image_id = item['id']
        image_folder = os.path.join(images_dir, image_id)
        image_paths = glob.glob(os.path.join(image_folder, "*.png"))
        yield {"images": image_paths, "report": filtered_report}

def pad_images(image_paths, max_images=4):
    blank_image = Image.new("RGB", (224, 224), (0, 0, 0))

    if len(image_paths) >= max_images:
        return image_paths[:max_images]
    else:
        return image_paths + [blank_image] * (max_images - len(image_paths))

def format_train_data(sample):
    padded_images = pad_images(sample["images"], max_images=4)

    return {"messages": [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
            ] + [{"type": "image", "image": img} for img in padded_images],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["report"]}],
        },
    ],
}

train_dataset = Dataset.from_generator(lambda: load_data(
    json_path=os.path.join(data_dir, 'annotation_quiz_all.json'),
    images_dir=os.path.join(data_dir, 'images')
))
train_dataset = [format_train_data(sample) for sample in train_dataset]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

model = get_peft_model(model, lora_config)

processor = AutoProcessor.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [8]:
def collate_fn(examples):
    texts = [processor.apply_chat_template(example["messages"], tokenize=False) for example in examples]
    image_inputs = [process_vision_info(example["messages"])[0] for example in examples]

    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652,151653,151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch

args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 50,
    learning_rate = 2e-4,
    bf16 = True,
    logging_steps = 5,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = model_dir,
    remove_unused_columns = False,
)

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    data_collator=collate_fn,
    args=args
)

In [9]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mksp7292[0m ([33myws0322[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,2.3044
10,1.2321
15,0.516
20,0.4098
25,0.4043
30,0.3441
35,0.322
40,0.3338
45,0.2909
50,0.2781


TrainOutput(global_step=50, training_loss=0.6435517454147339, metrics={'train_runtime': 1142.6588, 'train_samples_per_second': 0.35, 'train_steps_per_second': 0.044, 'total_flos': 1.968698684422349e+16, 'train_loss': 0.6435517454147339, 'epoch': 0.1932367149758454})

In [10]:
def save_model(model, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)
    processor.save_pretrained(save_dir)
    print(f"Model saved at {save_dir}")

save_model(model, os.path.join(model_dir, "final"))

Model saved at /content/drive/MyDrive/wanglab_quiz/NotUnsloth_Qwen2-VL-7B-Instruct/final


# Evaluation

In [12]:
!pip install -U datasets accelerate bitsandbytes

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [14]:
import os
import ast
import glob
import json
import torch
from PIL import Image
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
)
from tqdm import tqdm
from tabulate import tabulate
from collections import defaultdict
from green_score import GREEN

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
data_dir = "/content/drive/MyDrive/wanglab_quiz/data" # replace with your own data dir
model_dir = "/content/drive/MyDrive/wanglab_quiz/NoUnsloth_Qwen2-VL-7B-Instruct" # replace with your own model dir

system_message = """You are an advanced AI model specialized in analyzing chest X-ray images.
Your response **must** be in the following JSON format without any extra text:

{
    "lung": "...",
    "heart": "...",
    "mediastinal": "...",
    "bone": "..."
}
"""

prompt = """Analyze the given chest X-ray and generate a structured report.
Describe abnormalities (if any) for:
- Lung
- Heart
- Mediastinal
- Bone

Return findings strictly in JSON format as previously instructed."""

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForVision2Seq.from_pretrained(
    os.path.join(model_dir, "final"),
    torch_dtype=torch.float16,
    quantization_config=bnb_config
).to("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(os.path.join(model_dir, "final"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

In [9]:
def pad_images(image_paths, max_images=4):
    blank_image = Image.new("RGB", (224, 224), (0, 0, 0))
    if len(image_paths) >= max_images:
        return image_paths[:max_images]
    else:
        return image_paths + [blank_image] * (max_images - len(image_paths))

def format_test_data(sample):
    padded_images = pad_images(sample["images"], max_images=4)

    return  {"messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
            ] + [{"type": "image", "image": img} for img in padded_images],
        },
    ],
}

def generate_report(dataset_messages, dataset, processor, model):
    model.eval()
    generated_reports = []
    ground_truths = []
    processed_count = 0

    with tqdm(total=len(dataset_messages), desc="Processing Samples", unit="sample") as pbar:
        for idx, sample in enumerate(dataset_messages):
            try:
                text = processor.apply_chat_template(sample["messages"], tokenize=False, add_generation_prompt=True)

                image_inputs = []
                for content_item in sample["messages"][0]["content"]:
                    if content_item.get("type") == "image":
                        image_path = content_item.get("image")
                        if isinstance(image_path, str):
                            image = Image.open(image_path).convert("RGB")
                        else:
                            image = image_path
                        image_inputs.append(image)

                inputs = processor(text=[text], images=[image_inputs], padding=True, return_tensors="pt").to(model.device)

                generated_ids = model.generate(**inputs, max_new_tokens=256, top_p=1.0, do_sample=True, temperature=0.8)
                generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

                decoded_texts = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)
                generated_text = decoded_texts[0] if decoded_texts else ""

                try:
                    generated_report = ast.literal_eval(generated_text) if isinstance(generated_text, str) else generated_text
                except (SyntaxError, ValueError):
                    print(f"\nSample {idx} Warning: Failed to parse generated text as dict. Using fallback method.")
                    generated_report = {line.split(": ", 1)[0]: line.split(": ", 1)[1] if ": " in line else "" for line in generated_text.split("\n")}

                ground_truth = dataset[idx]["report"]
                for region in ["lung", "heart", "mediastinal", "bone"]:
                    generated_report.setdefault(region, "")
                    ground_truth.setdefault(region, "")

                ground_truths.append(ground_truth)
                generated_reports.append(generated_report)
                processed_count += 1
                pbar.update(1)

            except (IndexError, ValueError, KeyError, FileNotFoundError, SyntaxError) as e:
                print(f"\nError processing sample {idx}: {e}")
                print(f"Processed {processed_count}/{len(dataset_messages)} samples so far.")
                pbar.update(1)

    return generated_reports, ground_truths

def evaluate_green_scores(ground_truths, generated_reports, green_scorer):
    region_results = {}
    all_empty = True
    if "mediastinum" in generated_reports[0].keys():
        med = "mediastinum"
    else:
        med = "mediastinal"
    for region in ["lung", "heart", med, "bone"]:
        gen_region = [gen[region] for gen in generated_reports]
        if region == "mediastinum":
            gt_region = [gt["mediastinal"] for gt in ground_truths]
        else:
            gt_region = [gt[region] for gt in ground_truths]

        if gt_region and gen_region:
            all_empty = False
            mean, std, green_score_list, summary, result_df = green_scorer(gt_region, gen_region)
            region_results[region] = {
                "mean": mean,
                "std": std,
                "scores": green_score_list,
                "summary": summary,
                "result_df": result_df
            }

            print(f"\n### GREEN Score Summary for {region.capitalize()} ###")
            print(summary)
            print(f"\n### Detailed GREEN Scores for {region.capitalize()} ###")
            print(tabulate(result_df.head(10), headers="keys", tablefmt="grid"))

    if all_empty:
        print("Skipping GREEN scoring as no valid regions were found.")

    return region_results

## Testing Set

In [10]:
with open(os.path.join(data_dir, "annotation_quiz_all.json"), "r") as f:
    test_data = json.load(f)["test"]

test_dataset = []
for item in test_data:
    image_folder = os.path.join(os.path.join(data_dir, "images", item["id"]))
    image_paths = glob.glob(os.path.join(image_folder, "*.png"))
    filtered_report = {k: v for k, v in item["report"].items() if k != "others"}
    if image_paths:
        test_dataset.append({"images": image_paths, "report": filtered_report})

test_dataset_message = [format_test_data(sample) for sample in test_dataset]

In [11]:
test_generated_reports, test_ground_truths = generate_report(test_dataset_message, test_dataset, processor, model)

Processing Samples: 100%|██████████| 590/590 [2:46:30<00:00, 16.93s/sample]


In [12]:
generated_reports_file = os.path.join(data_dir, "qwen2_test_generated_reports.json")
ground_truths_file = os.path.join(data_dir, "test_ground_truths.json")

with open(generated_reports_file, "w", encoding="utf-8") as f:
    json.dump(test_generated_reports, f, ensure_ascii=False, indent=4)

with open(ground_truths_file, "w", encoding="utf-8") as f:
    json.dump(test_ground_truths, f, ensure_ascii=False, indent=4)

In [17]:
generated_reports_file = os.path.join(data_dir, "qwen2_test_generated_reports.json")
ground_truths_file = os.path.join(data_dir, "test_ground_truths.json")

with open(generated_reports_file, "r", encoding="utf-8") as f:
    generated_reports = json.load(f)

with open(ground_truths_file, "r", encoding="utf-8") as f:
    ground_truths = json.load(f)

print(generated_reports)
print(ground_truths)

[{'bone': '', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'No focal airspace disease. No pleural effusion or pneumothorax.', 'mediastinum': 'Mediastinal contours are within normal limits.', 'mediastinal': ''}, {'bone': '', 'heart': 'Heart size is within normal limits.', 'lung': 'No focal airspace disease. No pleural effusion or pneumothorax.', 'mediastinum': 'Mediastinal contours are within normal limits.', 'mediastinal': ''}, {'bone': '', 'heart': 'Heart size is normal.', 'lung': 'Lungs are clear. No focal consolidation, pleural effusion, or pneumothorax.', 'mediastinum': 'Mediastinal contours are within normal limits.', 'mediastinal': ''}, {'bone': 'No acute bony abnormality.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'No focal airspace disease. No pleural effusion or pneumothorax.', 'mediastinum': 'Mediastinal contours are within normal limits.', 'mediastinal': ''}, {'bone': '', 'heart': 'Heart size is within normal limits.', 'lung': 'No foca

In [18]:
green_scorer = GREEN("StanfordAIMI/GREEN-RadPhi2", output_dir=".")
results = evaluate_green_scores(ground_truths, generated_reports, green_scorer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

KeyboardInterrupt: 

## Validation Set

In [13]:
val_dataset = []

with open("validation_dataset.json", "r") as f:
    val_data = json.load(f)

for item in val_data:
    image_folder = os.path.join(data_dir, "images", item["id"])
    image_paths = glob.glob(os.path.join(image_folder, "*.png"))
    filtered_report = {k: v for k, v in item["classification"].items() if k != "others"}

    if image_paths:
        val_dataset.append({"images": image_paths, "report": filtered_report})

val_dataset_message = [format_test_data(sample) for sample in val_dataset]

In [14]:
val_generated_reports, val_ground_truths = generate_report(val_dataset_message, val_dataset, processor, model)

Processing Samples: 100%|██████████| 296/296 [1:24:15<00:00, 17.08s/sample]


In [15]:
generated_reports_file = os.path.join(data_dir, "qwen2_val_generated_reports.json")
ground_truths_file = os.path.join(data_dir, "val_ground_truths.json")

with open(generated_reports_file, "w", encoding="utf-8") as f:
    json.dump(val_generated_reports, f, ensure_ascii=False, indent=4)

with open(ground_truths_file, "w", encoding="utf-8") as f:
    json.dump(val_ground_truths, f, ensure_ascii=False, indent=4)

In [19]:
generated_reports_file = os.path.join(data_dir, "qwen2_val_generated_reports.json")
ground_truths_file = os.path.join(data_dir, "val_ground_truths.json")

with open(generated_reports_file, "r", encoding="utf-8") as f:
    generated_reports = json.load(f)

with open(ground_truths_file, "r", encoding="utf-8") as f:
    ground_truths = json.load(f)

print(generated_reports)
print(ground_truths)

[{'bone': '', 'heart': 'Heart size is within normal limits.', 'lung': 'Lungs are clear. No focal consolidation, pleural effusion, or pneumothorax.', 'mediastinum': 'Mediastinal contours are within normal limits.', 'mediastinal': ''}, {'bone': 'No acute bony abnormality.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'No focal airspace disease. No pleural effusion or pneumothorax.', 'mediastinum': 'Mediastinal contours are within normal limits.', 'mediastinal': ''}, {'bone': '', 'heart': 'Heart size is within normal limits.', 'lung': 'Lungs are clear. No focal consolidation, pleural effusion, or pneumothorax.', 'mediastinum': 'Mediastinal contours are within normal limits.', 'mediastinal': ''}, {'bone': '', 'heart': 'Heart size is within normal limits.', 'lung': 'Lungs are clear. No focal consolidation, pleural effusion, or pneumothorax.', 'mediastinum': 'Mediastinal contours are within normal limits.', 'mediastinal': ''}, {'bone': '', 'heart': 'Heart size is within no

In [20]:
green_scorer = GREEN("StanfordAIMI/GREEN-RadPhi2", output_dir=".")
evaluate_green_scores(ground_truths, generated_reports, green_scorer)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   1%|          | 31.5M/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenization_chexagent.py:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

Processing data...making prompts


Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [15:08<00:00, 24.55s/it]


==== End Inference ====
Computing summary ...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  3.1266053923078485

### GREEN Score Summary for Lung ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.6525498712998713 and standard deviation 0.30328769862517585 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9054054054054054. 
  No focal airspace disease 

(b) Missing a finding present in the reference: 0.33783783783783783. 
  Low lung volumes. 

(c) Misidentification of a finding's anatomic location/position: 0.9966216216216216. 
  The findings were not specified to be greater on the right side than the left. 

(d) Misassessment of the severity of a finding: 0.9966216216216216. 
  The candidate report uses "focal consolidation" instead of "pneumonia". 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 None 

(f) Omitting a comparison detailing a change from a prior study: 0.9966216216216216. 
  The candidate report omits the comparison det

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [13:20<00:00, 21.62s/it]


==== End Inference ====
Computing summary ...
Seconds per example:  2.728641361803622

### GREEN Score Summary for Heart ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.6342905405405406 and standard deviation 0.4299766296819872 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9966216216216216. 
  Heart size is within normal limits. 

(b) Missing a finding present in the reference: 0.6993243243243243. 
  The candidate report does not mention the heart size. 

(c) Misidentification of a finding's anatomic location/position: 0.9966216216216216. 
  The candidate report misidentified the cardiac contour as heart size. 

(d) Misassessment of the severity of a finding: 0.9256756756756757. 
  The candidate report incorrectly states that the heart size is within normal limits, while the reference report indicates that the heart size is slightly enlarged. 

(e) Mentioning a comparison tha

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [13:21<00:00, 21.66s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.734330862760544

### GREEN Score Summary for Mediastinum ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.42961711711711714 and standard deviation 0.43708021879852016 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9155405405405406. 
  Mediastinal contours are within normal limits. 

(b) Missing a finding present in the reference: 0.6148648648648649. 
  The candidate report does not mention the tortuous and calcified thoracic aorta. 

(c) Misidentification of a finding's anatomic location/position: 0.9966216216216216. 
  The candidate report mentions "Mediastinal contours" instead of "Aortic contours". 

(d) Misassessment of the severity of a finding: 0.9864864864864865. 
  The candidate report misassesses the mediastinal contours as normal instead of stable. 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 None 

(f) Omitting a compariso

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 37/37 [12:09<00:00, 19.71s/it]


==== End Inference ====
Computing summary ...


  return fit_method(estimator, *args, **kwargs)


Seconds per example:  2.47973091537888

### GREEN Score Summary for Bone ###

-------------GREEN-RadPhi2----------------
 [Summary]: Green average 0.1413288288288288 and standard deviation 0.3460630659653744 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

(a) False report of a finding in the candidate: 0.9898648648648649. 
  No acute bony abnormality. 

(b) Missing a finding present in the reference: 0.7533783783783784. 
  The candidate report is missing. 

(c) Misidentification of a finding's anatomic location/position: 0.9966216216216216. 
  The candidate report includes parts of the skeleton that are not mentioned in the reference report. 

(d) Misassessment of the severity of a finding: 1.0. 
 None 

(e) Mentioning a comparison that isn't in the reference: 1.0. 
 None 

(f) Omitting a comparison detailing a change from a prior study: 1.0. 
 None 

----------------------------------


### Detailed GREEN Scores for Bone ###
+----+----------------------

{'lung': {'mean': 0.6525498712998713,
  'std': 0.30328769862517585,
  'scores': [0.6666666666666666,
   1.0,
   0.6,
   1.0,
   0.6666666666666666,
   1.0,
   1.0,
   0.6666666666666666,
   0.75,
   1.0,
   0.6666666666666666,
   0.25,
   0.3333333333333333,
   1.0,
   0.4,
   0.75,
   1.0,
   0.5,
   1.0,
   0,
   0.75,
   1.0,
   0.25,
   0.6666666666666666,
   1.0,
   1.0,
   0.5,
   1.0,
   1.0,
   1.0,
   0.6666666666666666,
   0,
   0.6,
   0,
   0.6,
   0.5,
   1.0,
   0.5,
   1.0,
   0,
   1.0,
   0.75,
   0.5,
   1.0,
   0.6666666666666666,
   0.5,
   1.0,
   0.6,
   0.5,
   0.75,
   0,
   0.75,
   1.0,
   0.75,
   0.75,
   0.6666666666666666,
   0.3333333333333333,
   0.75,
   1.0,
   1.0,
   1.0,
   0.6666666666666666,
   0.6,
   0.75,
   0.75,
   1.0,
   0.75,
   1.0,
   0.6666666666666666,
   0,
   0.3333333333333333,
   1.0,
   1.0,
   0,
   1.0,
   0.5,
   1.0,
   0.75,
   0.5,
   0.75,
   0.5,
   0.75,
   0.2857142857142857,
   1.0,
   0.5,
   1.0,
   0.5,
   1.0,
   0.