In [1]:
import json
import os
import re
import pdfplumber
from PIL import Image, ImageOps
import io


In [2]:
file = os.path.join("question_bank", "101030_101_02_Q.pdf")

# Reassemble the image strips embedded in one PDF page: strips that share the
# same left edge are stacked top-to-bottom into a single PNG under
# "merged_images/", and the bounding box of each stack is printed.
with pdfplumber.open(file) as pdf:
    page = pdf.pages[7]  # zero-based index; page.page_number (used in file names) is 1-based
    as_img = page.to_image()  # only referenced by the disabled debug lines at the bottom

    images = page.images
    print(len(images))

    # Bucket the images by their exact x0 value; buckets keep first-seen order.
    grouped_images = {}
    for img in images:
        grouped_images.setdefault(img['x0'], []).append(img)

    # Make sure the output directory for the merged pictures exists.
    os.makedirs("merged_images", exist_ok=True)

    # Merge every bucket into one picture (numbering starts at 1).
    for group_idx, (group_key, imgs) in enumerate(grouped_images.items(), start=1):
        # pdfplumber measures y0 from the bottom of the page, so a larger y0 is
        # higher up; descending order therefore yields the strips top-first.
        sorted_imgs = list(imgs)
        sorted_imgs.sort(key=lambda entry: entry['y0'], reverse=True)

        # Decode each embedded stream with PIL, then size the canvas:
        # heights add up, the width is that of the widest strip.
        pil_imgs = [
            Image.open(io.BytesIO(entry['stream'].get_data()))
            for entry in sorted_imgs
        ]
        total_height = sum(tile.height for tile in pil_imgs)
        max_width = max(tile.width for tile in pil_imgs)

        # Paste the strips one below the other, all aligned to the left edge.
        merged_img = Image.new('RGB', (max_width, total_height))
        y_offset = 0
        for tile in pil_imgs:
            merged_img.paste(tile, (0, y_offset))
            y_offset += tile.height

        # Write the merged picture to disk.
        output_path = os.path.join("merged_images", f"merged_{page.page_number}_{group_idx}.png")
        merged_img.save(output_path, "PNG")

        # Bounding box of the whole stack in the page coordinates reported by pdfplumber.
        x0 = min(entry['x0'] for entry in sorted_imgs)
        y0 = min(entry['y0'] for entry in sorted_imgs)
        x1 = max(entry['x1'] for entry in sorted_imgs)
        y1 = max(entry['y1'] for entry in sorted_imgs)

        print(f"已合併圖片 {group_idx}，儲存至: {output_path}")
        print(f"原始座標範圍: x0={x0:.2f}, y0={y0:.2f}, x1={x1:.2f}, y1={y1:.2f}")

    # Disabled debug helpers for inspecting the rendered page:
    # as_img.show()
    # as_img.draw_rects(page.extract_text())
    # as_img.save(f"{pdf_name}_page0.png")

13
已合併圖片 1，儲存至: merged_images/merged_8_1.png
原始座標範圍: x0=182.82, y0=458.24, x1=412.85, y1=628.28
已合併圖片 2，儲存至: merged_images/merged_8_2.png
原始座標範圍: x0=168.54, y0=261.68, x1=427.20, y1=431.66
已合併圖片 3，儲存至: merged_images/merged_8_3.png
原始座標範圍: x0=244.74, y0=52.04, x1=473.21, y1=236.36
