## **Step1. 先將PDF使用MinerU 2.5 辨識成 .json 格式**

需安裝 MinerU 2.5.3 以上版本（同時支援傳統 OCR 與 VLM 兩種辨識方式）

In [None]:
!pip install --upgrade pip
!pip install "mineru[all]>=2.5.3"

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting mineru>=2.5.3 (from mineru[all]>=2.5.3)
  Downloading mineru-2.6.5-py3-none-any.whl.metadata (72 kB)
Collecting boto3>=1.28.43 (from mineru>=2.5.3->mineru[all]>=2.5.3)
  Downloading boto3-1.41.5-py3-none-any.whl.metadata (6.8 kB)
Collecting loguru>=0.7.2 (from mineru>=2.5.3->mineru[all]>=2.5.3)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting pdfminer.six==20250506 (from mineru>=2.5.3->mineru[all]>=2.5.3)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.30.0 (fro

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import copy
import json
import os
from pathlib import Path
from loguru import logger

from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
def do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_list: list[bytes],
    p_lang_list: list[str],
    backend="pipeline",
    parse_method="auto",
    formula_enable=True,
    table_enable=True,
    server_url=None,
    f_draw_layout_bbox=False,  # off by default to save disk space
    f_draw_span_bbox=False,   # off by default to save disk space
    f_dump_md=True,
    f_dump_middle_json=False,  # off by default: only the content list is needed
    f_dump_model_output=False,
    f_dump_orig_pdf=False,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    start_page_id=0,
    end_page_id=None,
):
    """Parse in-memory documents with MinerU and write the selected outputs.

    Args:
        output_dir: Root directory handed to ``prepare_env`` for each document.
        pdf_file_names: Output base name for each document; indexed in
            parallel with ``pdf_bytes_list``.
        pdf_bytes_list: Raw bytes of each document. This list is read only;
            it is never modified.
        p_lang_list: Per-document language hints. Only passed to the
            pipeline backend.
        backend: ``"pipeline"`` selects the pipeline backend. Any other value
            selects the VLM backend; a leading ``"vlm-"`` is stripped before
            the value is forwarded to ``vlm_doc_analyze``.
        parse_method: Pipeline parse method. Replaced by ``"vlm"`` when the
            VLM backend is used.
        formula_enable: Forwarded to the pipeline backend only.
        table_enable: Forwarded to the pipeline backend only.
        server_url: Forwarded to ``vlm_doc_analyze`` only.
        f_draw_layout_bbox: Write ``<name>_layout.pdf``.
        f_draw_span_bbox: Write ``<name>_span.pdf``. Forced to ``False`` for
            the VLM backend.
        f_dump_md: Write ``<name>.md``.
        f_dump_middle_json: Write ``<name>_middle.json``.
        f_dump_model_output: Write ``<name>_model.json``.
        f_dump_orig_pdf: Write ``<name>_origin.pdf``.
        f_dump_content_list: Write ``<name>_content_list.json``.
        f_make_md_mode: Mode passed to the markdown builder.
        start_page_id: First page passed to the page-range conversion.
        end_page_id: Last page passed to the page-range conversion
            (``None`` is forwarded unchanged).
    """
    # ========== Pipeline Backend ==========
    if backend == "pipeline":
        # Collect the converted documents in a NEW list. Overwriting the
        # caller's list in place would make a second call on the same list
        # convert already-converted bytes again.
        converted_pdf_bytes = [
            convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            for pdf_bytes in pdf_bytes_list
        ]

        infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
            converted_pdf_bytes, p_lang_list,
            parse_method=parse_method,
            formula_enable=formula_enable,
            table_enable=table_enable
        )

        for idx, model_list in enumerate(infer_results):
            # Snapshot the raw model output before it is handed to the
            # middle-json conversion, so the dumped copy is the untouched one.
            model_json = copy.deepcopy(model_list)
            pdf_file_name = pdf_file_names[idx]
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

            middle_json = pipeline_result_to_middle_json(
                model_list,
                all_image_lists[idx],
                all_pdf_docs[idx],
                image_writer,
                lang_list[idx],
                ocr_enabled_list[idx],
                formula_enable,
            )

            _process_output(
                middle_json["pdf_info"], converted_pdf_bytes[idx], pdf_file_name,
                local_md_dir, local_image_dir,
                md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
                f_make_md_mode, middle_json, model_json, is_pipeline=True
            )

    # ========== VLM Backend ==========
    else:
        # "vlm-transformers" -> "transformers", etc.; other values pass through.
        backend = backend.removeprefix("vlm-")

        f_draw_span_bbox = False  # span bbox drawing is not supported for VLM output
        parse_method = "vlm"

        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            pdf_file_name = pdf_file_names[idx]
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

            # Run the VLM analysis for this document.
            middle_json, infer_result = vlm_doc_analyze(
                pdf_bytes,
                image_writer=image_writer,
                backend=backend,
                server_url=server_url
            )

            _process_output(
                middle_json["pdf_info"], pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
                md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
                f_make_md_mode, middle_json, infer_result, is_pipeline=False
            )


def _process_output(
    pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
    md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
    f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
    f_make_md_mode, middle_json, model_output=None, is_pipeline=True
):
    """Write the requested output files for one parsed document.

    Each ``f_*`` flag enables one artifact, all named after ``pdf_file_name``:
    layout/span bbox PDFs, the original PDF, the markdown file, the content
    list JSON, the middle JSON and the raw model output JSON. ``is_pipeline``
    chooses between the pipeline and the VLM content builders. The output
    directory is logged once at the end.
    """

    def _write_json(file_name, payload):
        # All JSON artifacts share the same serialisation settings.
        md_writer.write_string(
            file_name,
            json.dumps(payload, ensure_ascii=False, indent=4),
        )

    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
    if f_dump_orig_pdf:
        md_writer.write(f"{pdf_file_name}_origin.pdf", pdf_bytes)

    # Only the last path component of the image directory is passed on to
    # the content builders.
    image_dir = str(os.path.basename(local_image_dir))
    build_content = pipeline_union_make if is_pipeline else vlm_union_make

    if f_dump_md:
        markdown_text = build_content(pdf_info, f_make_md_mode, image_dir)
        md_writer.write_string(f"{pdf_file_name}.md", markdown_text)

    if f_dump_content_list:
        _write_json(
            f"{pdf_file_name}_content_list.json",
            build_content(pdf_info, MakeMode.CONTENT_LIST, image_dir),
        )

    if f_dump_middle_json:
        _write_json(f"{pdf_file_name}_middle.json", middle_json)

    if f_dump_model_output:
        _write_json(f"{pdf_file_name}_model.json", model_output)

    logger.info(f"✅ Output directory: {local_md_dir}")


def parse_doc(
    path_list: list[Path],
    output_dir,
    lang="ch",
    backend="pipeline",
    method="auto",
    server_url=None,
    start_page_id=0,
    end_page_id=None
):
    """Read the given files and hand them to ``do_parse``.

    Every path contributes its stem as the output name, its content as read
    by ``read_fn``, and the same ``lang`` hint. Any exception raised while
    reading or parsing is logged with its traceback and not re-raised.
    """
    try:
        # One pass over the paths: (output name, file content) per document.
        loaded = [(str(Path(doc).stem), read_fn(doc)) for doc in path_list]

        do_parse(
            output_dir=output_dir,
            pdf_file_names=[stem for stem, _ in loaded],
            pdf_bytes_list=[content for _, content in loaded],
            p_lang_list=[lang] * len(loaded),
            backend=backend,
            parse_method=method,
            server_url=server_url,
            start_page_id=start_page_id,
            end_page_id=end_page_id
        )
    except Exception as exc:
        logger.exception(exc)


if __name__ == '__main__':
    # Input and output folders on the mounted Google Drive.
    pdf_files_dir = "/content/drive/MyDrive/11401NTUAIClub/專案組/PDF/"
    output_dir = "/content/drive/MyDrive/11401NTUAIClub/專案組/Output/"
    os.makedirs(output_dir, exist_ok=True)

    # Accepted file types, matched against guess_suffix_by_path().
    pdf_suffixes = ["pdf"]
    image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
    supported_suffixes = pdf_suffixes + image_suffixes

    # Keep every direct child of the input folder with a supported type.
    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob('*'):
        if guess_suffix_by_path(doc_path) not in supported_suffixes:
            continue
        doc_path_list.append(doc_path)

    print(f"📄 原始找到 {len(doc_path_list)} 個檔案")

    # ============================================
    # ⚠️ Test mode: process only the first file (uncomment to enable)
    # ============================================
    # if doc_path_list:
    #     doc_path_list = doc_path_list[:1]
    #     print(f"⚠️ 測試模式開啟：僅處理第一個檔案 -> {doc_path_list[0].name}")
    # else:
    #     print("❌ 未找到任何支援的檔案")

    # for path in doc_path_list:
    #     print(f"  - {path.name}")

    # ============================================
    # 🔧 Choose ONE backend below (leave the others commented out)
    # ============================================

    if doc_path_list:
        # Option 1: Pipeline (CPU; fits the free Colab tier) ✅ recommended for beginners
        # print("\n🚀 使用 Pipeline Backend...")
        # parse_doc(
        #     doc_path_list,
        #     output_dir,
        #     backend="pipeline",
        #     lang="chinese_cht"
        # )

        # Option 2: VLM Transformers (needs a GPU; slower but more accurate)
        # print("\n🚀 使用 VLM Transformers Backend...")
        # parse_doc(
        #     doc_path_list,
        #     output_dir,
        #     backend="vlm-transformers",
        #     lang="chinese_cht"
        # )

        # Option 3: VLM vLLM engine (needs a powerful GPU; fastest) -- active
        print("\n🚀 使用 VLM vLLM Engine Backend...")
        parse_doc(
            doc_path_list,
            output_dir,
            backend="vlm-vllm-engine",
            lang="chinese_cht"
        )

        # Option 4: VLM HTTP client (a server must already be running)
        # print("\n🚀 使用 VLM HTTP Client Backend...")
        # parse_doc(
        #     doc_path_list,
        #     output_dir,
        #     backend="vlm-http-client",
        #     server_url="http://127.0.0.1:30000",
        #     lang="chinese_cht"
        # )

📄 原始找到 7 個檔案

🚀 使用 VLM vLLM Engine Backend...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/800 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

[32m2025-11-27 03:10:12.326[0m | [1mINFO    [0m | [36mmineru.backend.vlm.utils[0m:[36menable_custom_logits_processors[0m:[36m40[0m - [1mcompute_capability: 7.5 < 8.0, but vllm version: 0.11.2 >= 0.10.2, enable custom_logits_processors[0m


INFO 11-27 03:10:21 [utils.py:253] non-default args: {'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'logits_processors': [<class 'mineru_vl_utils.logits_processor.vllm_v1_no_repeat_ngram.VllmV1NoRepeatNGramLogitsProcessor'>], 'model': '/root/.cache/huggingface/hub/models--opendatalab--MinerU2.5-2509-1.2B/snapshots/879e58bdd9566632b27a8a81f0e2961873311f67'}
INFO 11-27 03:10:36 [model.py:631] Resolved architecture: Qwen2VLForConditionalGeneration
INFO 11-27 03:10:36 [model.py:1745] Using max model len 16384
INFO 11-27 03:10:38 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-27 03:12:51 [llm.py:352] Supported tasks: ['generate']


[32m2025-11-27 03:12:51.582[0m | [1mINFO    [0m | [36mmineru.backend.vlm.vlm_analyze[0m:[36mget_model[0m:[36m189[0m - [1mget vllm-engine predictor cost: 174.32s[0m


Adding requests:   0%|          | 0/57 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


Processed prompts:   0%|          | 0/57 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/791 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/791 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[32m2025-11-27 03:15:25.353[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_output[0m:[36m135[0m - [1m✅ Output directory: /content/drive/MyDrive/11401NTUAIClub/專案組/Output/星宇航空_25Q2/vlm[0m


Adding requests:   0%|          | 0/7 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/7 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/120 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/120 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[32m2025-11-27 03:15:52.484[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_output[0m:[36m135[0m - [1m✅ Output directory: /content/drive/MyDrive/11401NTUAIClub/專案組/Output/長榮航_華南_20250520/vlm[0m


Adding requests:   0%|          | 0/59 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/59 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/773 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/773 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[32m2025-11-27 03:18:48.306[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_output[0m:[36m135[0m - [1m✅ Output directory: /content/drive/MyDrive/11401NTUAIClub/專案組/Output/長榮航_25Q2/vlm[0m


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/168 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/168 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[32m2025-11-27 03:19:29.458[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_output[0m:[36m135[0m - [1m✅ Output directory: /content/drive/MyDrive/11401NTUAIClub/專案組/Output/長榮航_國票_20250321/vlm[0m


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/159 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/159 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[32m2025-11-27 03:20:08.753[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_output[0m:[36m135[0m - [1m✅ Output directory: /content/drive/MyDrive/11401NTUAIClub/專案組/Output/星宇航空_國票_20250324/vlm[0m


Adding requests:   0%|          | 0/54 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/54 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/770 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/770 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[32m2025-11-27 03:22:35.968[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_output[0m:[36m135[0m - [1m✅ Output directory: /content/drive/MyDrive/11401NTUAIClub/專案組/Output/星宇航空_25Q1/vlm[0m


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/272 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/272 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[32m2025-11-27 03:23:18.179[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_output[0m:[36m135[0m - [1m✅ Output directory: /content/drive/MyDrive/11401NTUAIClub/專案組/Output/星宇航空_國泰_20241004.pdf/vlm[0m


## **Step2. 輸出的 .json 檔, 再使用「Gemini-2.5-flash」增加每張圖片的描述(也包含表格)**

新增：透過 Gemini API 呼叫 gemini-2.5-flash，為每張圖片產生描述，也包含被 VLM 辨識為表格的截圖。

In [None]:
# Cell 1: 安裝 Google Generative AI SDK
!pip install -q google-generativeai

In [None]:
# Cell 2: 定義 ImageDescriptionEnhancer 類別

import json
import base64
from pathlib import Path
import google.generativeai as genai

class ImageDescriptionEnhancer:
    """Add Gemini-generated descriptions to the images of a MinerU content list.

    Entries of a ``*_content_list.json`` file whose ``type`` is ``'image'``
    (and optionally ``'table'``) and that carry an ``img_path`` receive an
    ``llm_description`` field holding the text returned by the model.
    """

    # MIME types that are forwarded when guessed from the file extension.
    # Anything else keeps the historical 'image/jpeg' label.
    _SUPPORTED_MIME_TYPES = frozenset({
        'image/jpeg', 'image/png', 'image/webp', 'image/heic', 'image/heif',
    })
    _DEFAULT_MIME_TYPE = 'image/jpeg'

    def __init__(self, api_key: str, max_words: int = 50, delay: float = 1.0):
        """
        Configure the Gemini client.

        Args:
            api_key: Google API key.
            max_words: Length limit interpolated into the prompts.
            delay: Seconds to wait between two consecutive API requests.
                Values <= 0 disable the pause.
        """
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.5-flash')
        self.max_words = max_words
        self.delay = delay

    def generate_description(self, image_path: str, custom_prompt: str = None) -> str:
        """
        Ask Gemini to describe one image.

        Args:
            image_path: Path of the image file.
            custom_prompt: Prompt to send instead of the built-in image
                prompt (optional).

        Returns:
            The stripped response text, or ``None`` when ``image_path`` does
            not exist.
        """
        # Function-scope import so the block does not depend on the import cell.
        import mimetypes

        img = Path(image_path)
        if not img.exists():
            return None

        # Built-in prompt for ordinary images.
        if custom_prompt is None:
            custom_prompt = f"""
請用繁體中文簡潔描述這張圖片的內容。

要求：
1. 字數限制在 {self.max_words} 字以內
2. 專注於圖片的核心訊息（數據、趨勢、重點）
3. 如果是圖表，請說明：
   - 圖表類型（折線圖、長條圖、圓餅圖等）
   - 主要趨勢或數據
   - 時間範圍（如果有）
4. 如果是表格截圖，請說明表格內容主題
5. 使用客觀、專業的語氣

只輸出描述文字，不要前綴或解釋。
"""

        image_data = img.read_bytes()

        # Label the payload by its extension instead of always claiming JPEG;
        # unknown or unlisted types fall back to the previous fixed label.
        mime_type, _ = mimetypes.guess_type(img.name)
        if mime_type not in self._SUPPORTED_MIME_TYPES:
            mime_type = self._DEFAULT_MIME_TYPE

        response = self.model.generate_content([
            custom_prompt,
            {'mime_type': mime_type, 'data': image_data}
        ])

        return response.text.strip()

    @staticmethod
    def _type_label(item, process_tables: bool):
        """Return the display label for an item to process, else ``None``."""
        if not item.get('img_path'):
            return None
        item_type = item.get('type')
        if item_type == 'image':
            return "圖片"
        if process_tables and item_type == 'table':
            return "表格"
        return None

    def enhance_content_list(
        self,
        json_path: str,
        output_path: str = None,
        base_dir: str = None,
        custom_prompt: str = None,
        skip_existing: bool = True,
        process_tables: bool = True
    ):
        """
        Add ``llm_description`` to the images and tables of a content list.

        Args:
            json_path: Content list JSON to read.
            output_path: Where to write the result. Defaults to ``json_path``,
                i.e. the input file is overwritten.
            base_dir: Directory that ``img_path`` values are relative to.
                Defaults to the directory containing ``json_path``.
            custom_prompt: Prompt used for every item. When ``None``, tables
                use the table prompt and images the built-in image prompt.
            skip_existing: Leave items alone that already have a non-empty
                description other than the string ``'None'``.
            process_tables: Also process items with ``type == 'table'``.

        Items whose image file is missing are reported and left unchanged.
        Items whose request fails or returns no text get
        ``llm_description = None``.
        """
        # Function-scope import: `time` is not imported at module level.
        import time

        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        base_dir = Path(json_path).parent if base_dir is None else Path(base_dir)

        total_items = 0
        processed_items = 0
        skipped_items = 0
        requests_sent = 0

        # Up-front statistics.
        print(f"\n🔍 掃描 JSON...")
        images = [item for item in data if item.get('type') == 'image' and item.get('img_path')]
        tables = [item for item in data if item.get('type') == 'table' and item.get('img_path')]

        print(f"   找到 {len(images)} 張圖片 (type='image')")
        print(f"   找到 {len(tables)} 個表格圖片 (type='table')")

        if process_tables:
            print(f"   ✅ 將處理圖片 + 表格，共 {len(images) + len(tables)} 個項目")
        else:
            print(f"   ⏭️  只處理圖片，跳過表格")

        print(f"\n{'='*50}")
        print(f"開始處理...")
        print(f"{'='*50}\n")

        for item in data:
            type_label = self._type_label(item, process_tables)
            if type_label is None:
                continue

            total_items += 1

            # Honour an existing, meaningful description.
            existing = item.get('llm_description')
            if skip_existing and existing:
                if str(existing).strip() not in ('', 'None'):
                    skipped_items += 1
                    print(f"⏭️  跳過 ({skipped_items}): [{type_label}] {item['img_path']}")
                    continue

            img_path = base_dir / item['img_path']

            if not img_path.exists():
                print(f"⚠️  檔案不存在: {img_path}")
                continue

            print(f"🖼️  處理 ({processed_items + 1}): [{type_label}] {item['img_path']}")

            # Pace the requests: wait before every request except the first.
            # This sits outside the try block so that a problem with the pause
            # can never discard a description that was already generated.
            if requests_sent and self.delay > 0:
                time.sleep(self.delay)
            requests_sent += 1

            try:
                # Tables get their own prompt unless the caller supplied one.
                if item.get('type') == 'table' and custom_prompt is None:
                    prompt = self._get_table_prompt()
                else:
                    prompt = custom_prompt
                description = self.generate_description(str(img_path), prompt)
            except Exception as e:
                print(f"   ❌ 錯誤: {e}")
                item['llm_description'] = None
                continue

            if description:
                item['llm_description'] = description
                processed_items += 1
                print(f"   ✅ {description[:60]}...")
            else:
                item['llm_description'] = None
                print(f"   ❌ 生成失敗")

        # Save the result (in place unless an output path was given).
        if output_path is None:
            output_path = json_path

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

        print(f"\n{'='*50}")
        print(f"📊 處理完成！")
        print(f"   總項目數: {total_items}")
        print(f"   已跳過: {skipped_items}")
        print(f"   成功處理: {processed_items}")
        print(f"   輸出檔案: {output_path}")
        print(f"{'='*50}")

    def _get_table_prompt(self) -> str:
        """Return the prompt used for table screenshots."""
        return f"""
這是一份財務報告中的「表格」截圖。

請用繁體中文、{self.max_words} 字內描述：
1. 表格的主題（例如：資產負債表、損益表、現金流量表）
2. 時間範圍（如果有）
3. 主要欄位或關鍵數據

只輸出描述文字，不要前綴。
"""

print("✅ ImageDescriptionEnhancer 類別已載入（支援 table 處理）")

✅ ImageDescriptionEnhancer 類別已載入（支援 table 處理）


In [None]:
# Cell 3: Set the Google API key

# Option 1: paste the key directly (insecure; for quick tests only)
GOOGLE_API_KEY = ""

# Option 2: read the key from Colab Secrets (recommended)
# from google.colab import userdata
# GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# NOTE(review): this confirmation is printed unconditionally, even while
# GOOGLE_API_KEY is still the empty string above.
print("✅ API Key 已設定")

✅ API Key 已設定


In [None]:
# Cell 4: Process one specific content list

# Create the enhancer
enhancer = ImageDescriptionEnhancer(
    api_key=GOOGLE_API_KEY,
    max_words=50  # keep each description within 50 words/characters
)

# Content list to enhance, and the directory its img_path values are relative to
JSON_PATH = "/content/drive/MyDrive/11401NTUAIClub/專案組/Output/長榮航_25Q1/vlm/長榮航_25Q1_content_list.json"
BASE_DIR = "/content/drive/MyDrive/11401NTUAIClub/專案組/Output/長榮航_25Q1/vlm"

# Run the enhancement. No output_path is given, so JSON_PATH is overwritten in place.
enhancer.enhance_content_list(
    json_path=JSON_PATH,
    base_dir=BASE_DIR,
    # skip_existing=True,   # skip items that already have a description
    skip_existing=False,  # regenerate the description of every item
    process_tables=True  # also describe items with type='table'
)


🔍 掃描 JSON...
   找到 2 張圖片 (type='image')
   找到 103 個表格圖片 (type='table')
   ✅ 將處理圖片 + 表格，共 105 個項目

開始處理...

🖼️  處理 (1): [表格] images/505e40a9fc0c98de9818c7cb9003dd5bb5188f770dd4bf379892d1b8706236ea.jpg
   ✅ 主題為合併財務報告目錄，涵蓋合併資產負債表、損益表、權益變動表、現金流量表與附註。時間範圍未標示。主要欄位為報告項目與頁...
🖼️  處理 (2): [圖片] images/20561af6c35aa95fd6b9662d2870811c784b3c5282fcff75a8e687a1d62f5338.jpg
   ✅ 一枚紅色方形印章，其上刻有數行篆體漢字。...
🖼️  處理 (3): [圖片] images/b9672f7b8d883ee357bbe8f2980a9c269a2cb1ec734f2e99304c20de4f366b90.jpg
   ✅ 紅色印章，內容為「國立中山大學圖書館」字樣。...
🖼️  處理 (4): [表格] images/bab6c7ef94d41d44ff8a87939fbccb9035c52b8790210da64334f5cdcaa2c360.jpg
   ✅ 這是一份資產負債表的資產項目截圖，時間範圍未提供。主要包含流動資產與非流動資產，細項有現金、金融資產、應收票據、存貨、不...
🖼️  處理 (5): [表格] images/04d08484a0574d44af6e8cf6f51054dabe6c52e2823a8b1dc91c2c2b43fb65d4.jpg
   ✅ 比較性財務狀況表分項數據，涵蓋114.3.31、113.12.31及113.3.31三時點。呈現各項金額及其佔總額百分比...
🖼️  處理 (6): [表格] images/d5d21f81dbb788b1af154fed966c911a9c99c260f8971d391f17cd3c348d6c3d.jpg
   ✅ 這是一份資產負債表項目分析，顯示三個期間的財務狀況。主要數據包含各項目金額及其佔總計的百分比。各期總計金額分別為258,..