# 0. Downloading the drvd-bench dataset

In [1]:
import kagglehub
# It appears that in the Kaggle dataset the image paths include an extra subfolder with the same name. 
# Since we're not allowed to modify the data during the rebuttal phase, please download the dataset and then manually delete that extra folder layer.
# Download latest version
path = kagglehub.dataset_download("tianhongzhou/drvd-bench")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tianhongzhou/drvd-bench?dataset_version_number=4...


100%|██████████| 6.54G/6.54G [05:54<00:00, 19.8MB/s]  

Extracting files...





Path to dataset files: /home/tiger/.cache/kagglehub/datasets/tianhongzhou/drvd-bench/versions/4


In [3]:
import os
current_dir = os.getcwd()
os.system(f"mv /home/tiger/.cache/kagglehub/datasets/tianhongzhou/drvd-bench/versions/4 {current_dir}")

0

# 1. Import dependencies

In [4]:
import json
import os
from pathlib import Path

from openai import OpenAI
from tqdm import tqdm

# function of drvd-bench
from drvd_bench import (
    get_drvd_data,
    map_result,
    compute_choice_metric,
    compute_report_generation_metric
)

print("✅ Import successful.")

✅ 导入成功


# 2.Model inference and metric calculation.

## 2.1 visual_evidence_qa.jsonl

In [6]:
# Relevant Parameters
QWEN_API_KEY   = "YOUR_QWEN_API_KEY"  # API key for qwen2.5vl
MODEL     = "qwen2.5-vl-7b-instruct"
BASE_URL  = "https://dashscope.aliyuncs.com/compatible-mode/v1"

JSONL_PATH   = Path("./4/visual_evidence_qa.jsonl")     # Input data
IMAGE_ROOT   = Path("./4")       # Image root directory
DATA_TYPE    = "single"                      # "single" | "joint"
NUM_SAMPLES  = 100                            # Use the first n samples for testing

RAW_OUT_PATH    = Path("visual_evidence_qa_result.jsonl")  # Raw model output

print("✅ Configuration ready.")

✅ 配置就绪


In [7]:
# API call related logic
client = OpenAI(api_key=QWEN_API_KEY, base_url=BASE_URL)
print("✅ client initialization completed")

import base64
import mimetypes
import time
from io import BytesIO
from pathlib import Path
from typing import Optional

from PIL import Image

def compress_image(
    path: str | Path,
    max_dim: int = 1024,
    quality: int = 85,
    size_limit_mb: int = 10,
) -> bytes:
    """Recursively compress the image so that the Base64 size ≤ size_limit_mb MiB."""
    img = Image.open(path)
    if img.mode in ("RGBA", "P"):
        img = img.convert("RGB")

    w, h = img.size
    if max(w, h) > max_dim:
        ratio = max_dim / max(w, h)
        img = img.resize((int(w * ratio), int(h * ratio)), Image.LANCZOS)

    buf = BytesIO()
    img.save(buf, format="JPEG", quality=quality)
    data = buf.getvalue()

    # Base64 expands about 1.37 times
    if len(data) * 1.37 > size_limit_mb * 1024 * 1024:
        # Continue lowering resolution / quality
        return compress_image(
            path, int(max_dim * 0.9), int(quality * 0.9), size_limit_mb
        )
    return data


def encode_image(
    path: str | Path,
    max_dim: int = 1024,
    quality: int = 85,
    size_limit_mb: int = 10,
) -> str:
    """Return a data URL that can be directly used in OpenAI messages."""
    try:
        img_bytes = compress_image(path, max_dim, quality, size_limit_mb)
    except Exception:
        with open(path, "rb") as f:
            img_bytes = f.read()

    b64 = base64.b64encode(img_bytes).decode("utf-8")
    mime, _ = mimetypes.guess_type(str(path))
    if not mime:
        mime = "image/jpeg"
    return f"data:{mime};base64,{b64}"

from openai import OpenAI

MAX_RETRIES  = 5
RETRY_DELAY  = 1      # seconds
DEFAULT_SYS_PROMPT = "You are a helpful medical image analysis assistant."

def api_infer(
    prompt: str,
    image_path: str | Path,
    client: OpenAI,
    *,
    system_prompt: str = DEFAULT_SYS_PROMPT,
    max_dim: int = 1024,
    quality: int = 85,
    size_limit_mb: int = 10,
    max_tokens: Optional[int] = 300,
    temperature: Optional[float] = 0.0,
    model_name: str = "qwen2.5-vl-72b-instruct",
) -> str:
    """
    Full implementation, equivalent to api_infer in qwen2.5vl_example.py.
    - Automatically converts image_path to data URL
    - Retries up to 5 times
    """
    data_url = encode_image(image_path, max_dim, quality, size_limit_mb)
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_url}},
            ],
        },
    ]

    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            params = {"model": model_name, "messages": messages}
            if max_tokens is not None:
                params["max_tokens"] = max_tokens
            if temperature is not None:
                params["temperature"] = temperature
            resp = client.chat.completions.create(**params)
            return resp.choices[0].message.content.strip()
        except Exception as e:
            last_err = e
            print(f"[Warning] attempt {attempt}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY)

    # If still failed, raise the last exception
    raise last_err

✅ client 初始化完成


In [10]:
# Perform inference and save results
RAW_OUT_PATH.write_text("", encoding="utf-8")

for idx, (img_path, prompt, record) in enumerate(
    get_drvd_data(JSONL_PATH, IMAGE_ROOT, data_type=DATA_TYPE, verbose=True)
):
    if idx >= NUM_SAMPLES:
        break
    try:
        answer = api_infer(prompt, img_path, client)
    except Exception as e:
        answer = f"[ERROR] {e}"
    record["model_response"] = answer

    with RAW_OUT_PATH.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        f.flush()

print(f"✅ Completed {idx+1} entries: {RAW_OUT_PATH.resolve()}")

Loading DrVD items: 100it [01:29,  1.11it/s]

✅ 已完成101 条：/opt/tiger/try_new_codes/visual_evidence_qa_result.jsonl





In [11]:
# Metric calculation
compute_choice_metric(RAW_OUT_PATH, mode="single")

Modality: CT
  modality_recognition: 100.00% (100/100)



# 2.2 independent_qa.jsonl

In [12]:
# Relevant Parameters
JSONL_PATH   = Path("./4/independent_qa.jsonl")     # Input data
IMAGE_ROOT   = Path("./4")       # Image root directory
DATA_TYPE    = "single"                      # "single" | "joint"
NUM_SAMPLES  = 100                            # Use the first n samples for testing

RAW_OUT_PATH    = Path("independent_qa_result.jsonl")  # Raw model output

print("✅ Configuration ready.")

✅ 配置就绪


In [13]:
# Perform inference and save results
RAW_OUT_PATH.write_text("", encoding="utf-8")

for idx, (img_path, prompt, record) in enumerate(
    get_drvd_data(JSONL_PATH, IMAGE_ROOT, data_type=DATA_TYPE, verbose=True)
):
    if idx >= NUM_SAMPLES:
        break
    try:
        answer = api_infer(prompt, img_path, client)
    except Exception as e:
        answer = f"[ERROR] {e}"
    record["model_response"] = answer

    with RAW_OUT_PATH.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        f.flush()

print(f"✅ Completed {idx+1} entries: {RAW_OUT_PATH.resolve()}")

Loading DrVD items: 99it [01:10,  1.69it/s]



Loading DrVD items: 100it [01:22,  1.21it/s]

✅ 已完成101 条：/opt/tiger/try_new_codes/independent_qa_result.jsonl





In [14]:
# Metric calculation
compute_choice_metric(RAW_OUT_PATH, mode="single")

Modality: CT
  modality: 90.00% (18/20)
  bodypart: 85.00% (17/20)
  organ: 40.00% (8/20)
  lesion: 45.00% (9/20)
  diagnosis: 35.00% (7/20)



## 2.3 joint_qa.jsonl

In [15]:
# Relevant Parameters
JSONL_PATH   = Path("./4/joint_qa.jsonl")     # Input data
IMAGE_ROOT   = Path("./4")       # Image root directory
DATA_TYPE    = "joint"                      # "single" | "joint"
NUM_SAMPLES  = 20                            # Use the first n samples for testing

RAW_OUT_PATH    = Path("joint_qa_result.jsonl")  # Raw model output

print("✅ Configuration ready.")

✅ 配置就绪


In [16]:
# Perform inference and save results
RAW_OUT_PATH.write_text("", encoding="utf-8")

for idx, (img_path, prompt, record) in enumerate(
    get_drvd_data(JSONL_PATH, IMAGE_ROOT, data_type=DATA_TYPE, verbose=True)
):
    if idx >= NUM_SAMPLES:
        break
    try:
        answer = api_infer(prompt, img_path, client)
    except Exception as e:
        answer = f"[ERROR] {e}"
    record["model_response"] = answer

    with RAW_OUT_PATH.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        f.flush()

print(f"✅ Completed {idx+1} entries: {RAW_OUT_PATH.resolve()}")

Loading DrVD items: 20it [00:15,  1.27it/s]

✅ 已完成21 条：/opt/tiger/try_new_codes/joint_qa_result.jsonl





In [17]:
# Metric calculation
compute_choice_metric(RAW_OUT_PATH, mode="joint")


✅ Hierarchical Accuracy Report by Modality:

Modality: CT
      modality: 90.00% (18/20)
         organ: 65.00% (13/20)
        lesion: 45.00% (9/20)
     diagnosis: 35.00% (7/20)


## 3.4 report_generation.jsonl

In [20]:
# Relevant Parameters
JSONL_PATH   = Path("./4/report_generation.jsonl")     # Input data
IMAGE_ROOT   = Path("./4")       # Image root directory
DATA_TYPE    = "single"                      # "single" | "joint"
NUM_SAMPLES  = 10                            # Use the first n samples for testing

RAW_OUT_PATH    = Path("report_generation_result.jsonl")  # Raw model output

print("✅ Configuration ready.")

✅ 配置就绪


In [21]:
# Perform inference and save results
RAW_OUT_PATH.write_text("", encoding="utf-8")

for idx, (img_path, prompt, record) in enumerate(
    get_drvd_data(JSONL_PATH, IMAGE_ROOT, data_type=DATA_TYPE, verbose=True)
):
    if idx >= NUM_SAMPLES:
        break
    try:
        answer = api_infer(prompt, img_path, client)
    except Exception as e:
        answer = f"[ERROR] {e}"
    record["model_response"] = answer

    with RAW_OUT_PATH.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        f.flush()

print(f"✅ Completed {idx+1} entries: {RAW_OUT_PATH.resolve()}")

Loading DrVD items: 10it [01:32,  9.25s/it]

✅ 已完成11 条：/opt/tiger/try_new_codes/report_generation_result.jsonl





In [32]:
# Metric calculation
DEEPSEEK_API_KEY   = "YOUR_DEEPSEEK_API_KEY" #deepseek的api
# 计算report generation指标
compute_report_generation_metric(api_key=DEEPSEEK_API_KEY, json_path=RAW_OUT_PATH)
print("✅ 指标计算完成")

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:19<00:00,  1.05it/s]
Scoring:   0%|          | 0/1 [00:00<?, ?it/s]

HBox(children=(FloatProgress(value=0.0, description='tokenizer_config.json', max=28.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='config.json', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='vocab.txt', layout=Layout(width='20px')…




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


HBox(children=(FloatProgress(value=0.0, description='pytorch_model.bin', max=440474434.0, style=ProgressStyle(…




Scoring: 100%|██████████| 1/1 [01:36<00:00, 96.86s/it]

CT: BERTScore F1 = 0.8992, BLEU = 0.0271
✅ 指标计算完成





# 3. Model result mapping function

In [None]:
# Used to address the issue where models with poor instruction-following ability cannot compute metrics; not needed for models with good instruction-following
_ = map_result(
    api_key=YOUR_DEEPSEEK_API_KEY,
    input_path=RAW_OUT_PATH,
    output_path=MAPPED_OUT_PATH,
    base_url="https://api.deepseek.com",
    show_preview=3       # Only print the first 3 examples in the terminal
)
print(f"✅ Mapped results written to {MAPPED_OUT_PATH.resolve()}")