# 1. 1차 label

In [2]:
import os
import cv2
from doclayout_yolo import YOLOv10

# Path settings
input_dir = "images"
output_dir = "output1"  # folder that receives the annotated images
annotations_dir = "output_ann1"  # folder that receives the text annotations
model_path = "doclayout_yolo_docstructbench_imgsz1024.pt"

# Create the output folders if they do not exist yet
os.makedirs(output_dir, exist_ok=True)
os.makedirs(annotations_dir, exist_ok=True)

# Load the model
model = YOLOv10(model_path)

# Collect the image files to process (extension compared case-insensitively)
img_extensions = ['.jpg', '.jpeg', '.png', '.bmp']
img_files = [f for f in os.listdir(input_dir) if os.path.splitext(f)[1].lower() in img_extensions]

for img_file in img_files:
    img_path = os.path.join(input_dir, img_file)
    base_name = os.path.splitext(img_file)[0]

    # Run prediction
    det_res = model.predict(
        img_path,
        imgsz=1024,
        conf=0.2,
        device="cuda:0"
    )
    # One image is passed in, so only the first result is used below.
    result = det_res[0]

    # Render the detections onto the image
    annotated_frame = result.plot(
        pil=True,
        line_width=5,
        font_size=20
    )

    # Save the annotated image. cv2.imwrite reports failure through its
    # return value instead of raising, so keep it for the summary below.
    save_path = os.path.join(output_dir, img_file)
    saved = cv2.imwrite(save_path, annotated_frame)

    # Save the detections as a human-readable text file
    txt_path = os.path.join(annotations_dir, base_name + '.txt')
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(f"Image: {img_file}\n")
        # NOTE(review): presumably orig_shape is (height, width), hence [1] first - confirm.
        f.write(f"Original Size: {result.orig_shape[1]} x {result.orig_shape[0]}\n")
        f.write("-" * 50 + "\n")

        if result.boxes is not None and len(result.boxes) > 0:
            boxes = result.boxes
            f.write(f"Total detections: {len(boxes)}\n\n")

            for i, box in enumerate(boxes):
                # Class information; fall back to a synthetic name when the
                # result carries no name table
                class_id = int(box.cls[0])
                class_name = result.names[class_id] if result.names else f"class_{class_id}"
                confidence = float(box.conf[0])

                # Coordinates (pixels) and the derived centre / size
                x1, y1, x2, y2 = box.xyxy[0].tolist()
                x_center = (x1 + x2) / 2
                y_center = (y1 + y2) / 2
                width = x2 - x1
                height = y2 - y1

                f.write(f"Detection {i+1}:\n")
                f.write(f"  Class: {class_name} (ID: {class_id})\n")
                f.write(f"  Confidence: {confidence:.4f}\n")
                f.write(f"  Bounding Box (pixels):\n")
                f.write(f"    Top-left: ({x1:.1f}, {y1:.1f})\n")
                f.write(f"    Bottom-right: ({x2:.1f}, {y2:.1f})\n")
                f.write(f"    Center: ({x_center:.1f}, {y_center:.1f})\n")
                f.write(f"    Size: {width:.1f} x {height:.1f}\n")
                f.write("\n")
        else:
            f.write("No detections found\n")

    print(f"처리 완료: {img_file}")
    if saved:
        print(f"  - 주석된 이미지: {save_path}")
    else:
        # Do not claim the image was written when OpenCV refused to save it.
        print(f"  - [WARN] 주석된 이미지 저장 실패: {save_path}")
    print(f"  - 주석 텍스트 파일: {txt_path}")



image 1/1 c:\Users\helen\Desktop\창의학기제\ocr\images\page_1.jpg: 576x1024 1 title, 4 plain texts, 1 abandon, 38.0ms
Speed: 5.0ms preprocess, 38.0ms inference, 1.0ms postprocess per image at shape (1, 3, 576, 1024)
처리 완료: page_1.jpg
  - 주석된 이미지: output1\page_1.jpg
  - 주석 텍스트 파일: output_ann1\page_1.txt

image 1/1 c:\Users\helen\Desktop\창의학기제\ocr\images\page_10.jpg: 576x1024 1 title, 3 plain texts, 2 abandons, 37.0ms
Speed: 4.0ms preprocess, 37.0ms inference, 1.5ms postprocess per image at shape (1, 3, 576, 1024)
처리 완료: page_10.jpg
  - 주석된 이미지: output1\page_10.jpg
  - 주석 텍스트 파일: output_ann1\page_10.txt

image 1/1 c:\Users\helen\Desktop\창의학기제\ocr\images\page_11.jpg: 576x1024 1 title, 2 plain texts, 1 abandon, 1 figure, 33.5ms
Speed: 4.0ms preprocess, 33.5ms inference, 1.5ms postprocess per image at shape (1, 3, 576, 1024)
처리 완료: page_11.jpg
  - 주석된 이미지: output1\page_11.jpg
  - 주석 텍스트 파일: output_ann1\page_11.txt

image 1/1 c:\Users\helen\Desktop\창의학기제\ocr\images\page_12.jpg: 576x1024 2 titles

# 2. 이미지 외 마스킹

In [4]:
import os
import re
import cv2
import numpy as np

# --------------------------- Settings ---------------------------
IMG_DIR        = "images"   # folder with the original images
ANN_DIR        = "output_ann1"     # folder with the annotation (.txt) files
OUT_DIR        = "masked_output1"   # folder that receives the results
WHITE_COLOR    = (255, 255, 255)   # colour painted over masked boxes (B,G,R)
IMG_EXTS       = {".jpg", ".jpeg", ".png", ".bmp"}  # extensions tried when locating an annotation's image

# Create the result folder if it does not exist yet
os.makedirs(OUT_DIR, exist_ok=True)

# -------------------- Annotation parsing ------------------------
#  * 'Class' line inside a Detection block -> class name
#  * Top-left / Bottom-right lines         -> box corners
# All patterns are applied to lines whose surrounding whitespace was stripped.
DET_START  = re.compile(r"^Detection\s+\d+:")
CLASS_LINE = re.compile(r"^Class:\s*(.*?)\s*\(")
TOP_LINE   = re.compile(r"^Top-left:\s*\(([^)]+)\)")
BOT_LINE   = re.compile(r"^Bottom-right:\s*\(([^)]+)\)")

# Keys a detection must carry before callers can index it safely.
_COORD_KEYS = ("x1", "y1", "x2", "y2")


def _parse_point(text):
    """Convert an 'x, y' string into a pair of ints rounded to whole pixels."""
    x, y = map(float, text.split(","))
    return int(round(x)), int(round(y))


def _is_complete(det):
    """Return True when *det* holds both corners of its bounding box."""
    return all(key in det for key in _COORD_KEYS)


def parse_annotation_file(txt_path):
    """Parse one annotation .txt file into a list of detection dicts.

    Each returned dict has the integer keys ``x1``, ``y1``, ``x2``, ``y2``
    and, when a ``Class:`` line was present, a lower-cased ``class`` name.

    Detection blocks that lack either corner line are dropped: callers index
    the coordinates directly, so a partial block would otherwise surface
    later as a ``KeyError``.

    Raises:
        ValueError: if a corner line does not contain exactly two numbers.
    """
    dets = []
    det = {}
    with open(txt_path, "r", encoding="utf-8") as f:
        for line in map(str.strip, f):
            if not line:
                continue

            if DET_START.match(line):  # "Detection n:" opens a new block
                if _is_complete(det):
                    dets.append(det)
                det = {}
                continue

            # Match each pattern once and reuse the match object.
            m = CLASS_LINE.match(line)
            if m:
                det["class"] = m.group(1).lower()
                continue

            m = TOP_LINE.match(line)
            if m:
                det["x1"], det["y1"] = _parse_point(m.group(1))
                continue

            m = BOT_LINE.match(line)
            if m:
                det["x2"], det["y2"] = _parse_point(m.group(1))

    # The last block has no following "Detection n:" line to flush it.
    if _is_complete(det):
        dets.append(det)
    return dets

# ----------------------- Main loop --------------------------
for ann_file in os.listdir(ANN_DIR):
    if not ann_file.lower().endswith(".txt"):
        continue

    base_name = os.path.splitext(ann_file)[0]
    img_path  = None
    # Locate the image by trying each known extension on the annotation's
    # base name. The extensions are sorted so that the pick is the same on
    # every run when several candidates exist (set order is not stable).
    for ext in sorted(IMG_EXTS):
        p = os.path.join(IMG_DIR, base_name + ext)
        if os.path.exists(p):
            img_path = p
            break
    if img_path is None:
        print(f"[WARN] 이미지가 없음: {base_name}")
        continue

    # ---------- 1) read the annotation and the image ----------
    detections = parse_annotation_file(os.path.join(ANN_DIR, ann_file))
    img        = cv2.imread(img_path)
    if img is None:  # cv2.imread signals failure by returning None
        print(f"[WARN] 읽기 실패: {img_path}")
        continue

    h, w = img.shape[:2]

    # ---------- 2) paint every non-figure box white ----------
    for d in detections:
        if d.get("class") == "figure":
            continue
        # A block without both corners cannot be masked; skip it instead of
        # aborting the whole run with a KeyError.
        if not all(key in d for key in ("x1", "y1", "x2", "y2")):
            continue
        # Clip the coordinates to the image bounds (safety)
        x1 = max(0, min(w, d["x1"]))
        y1 = max(0, min(h, d["y1"]))
        x2 = max(0, min(w, d["x2"]))
        y2 = max(0, min(h, d["y2"]))
        img[y1:y2, x1:x2] = WHITE_COLOR

    # ---------- 3) save the result (always as PNG) ----------
    out_path = os.path.join(OUT_DIR, base_name + ".png")
    # cv2.imwrite returns False on failure instead of raising.
    if cv2.imwrite(out_path, img):
        print(f"[DONE] {base_name} → {out_path}")
    else:
        print(f"[WARN] 저장 실패: {out_path}")


[DONE] page_1 → masked_output1\page_1.png
[DONE] page_10 → masked_output1\page_10.png
[DONE] page_11 → masked_output1\page_11.png
[DONE] page_12 → masked_output1\page_12.png
[DONE] page_13 → masked_output1\page_13.png
[DONE] page_14 → masked_output1\page_14.png
[DONE] page_15 → masked_output1\page_15.png
[DONE] page_16 → masked_output1\page_16.png
[DONE] page_17 → masked_output1\page_17.png
[DONE] page_18 → masked_output1\page_18.png
[DONE] page_19 → masked_output1\page_19.png
[DONE] page_2 → masked_output1\page_2.png
[DONE] page_20 → masked_output1\page_20.png
[DONE] page_21 → masked_output1\page_21.png
[DONE] page_22 → masked_output1\page_22.png
[DONE] page_23 → masked_output1\page_23.png
[DONE] page_24 → masked_output1\page_24.png
[DONE] page_25 → masked_output1\page_25.png
[DONE] page_26 → masked_output1\page_26.png
[DONE] page_27 → masked_output1\page_27.png
[DONE] page_28 → masked_output1\page_28.png
[DONE] page_29 → masked_output1\page_29.png
[DONE] page_3 → masked_output1\page_

# 3. 주석 수정

In [12]:
import re
import os
from pathlib import Path
from typing import List, Tuple

Box = Tuple[float, float, float, float, float, str, int]  # (x1, y1, x2, y2, confidence, label, class_id)

# Patterns are applied with re.match to lines stripped on BOTH sides; the
# annotation format indents these fields, so unstripped lines never match.
# The label group is non-greedy so it does not swallow the blank before "(ID:".
_CLASS_RE = re.compile(r'Class:\s*(.+?)\s*\(ID:\s*(\d+)\)')
_CONF_RE = re.compile(r'Confidence:\s*([\d.]+)')
# Optional sign so negative coordinates keep their sign.
_NUMBER_RE = re.compile(r'-?\d+\.?\d*')


def parse_annotation_file(path: Path) -> List[Box]:
    """Read a text annotation file and return one box per Detection block.

    A box is emitted when a ``Bounding Box`` line is reached after a
    ``Class: <label> (ID: <n>)`` line and a ``Confidence: <value>`` line, and
    the two lines following it supply at least four numbers (top-left, then
    bottom-right). Corners are normalised so that x1 <= x2 and y1 <= y2.

    Blocks missing the class, the confidence or the coordinates are skipped.
    This includes a ``Bounding Box`` line cut off at the end of the file.

    Args:
        path: annotation file to read (UTF-8).

    Returns:
        List of ``(x1, y1, x2, y2, confidence, label, class_id)`` tuples in
        file order.
    """
    boxes: List[Box] = []
    with path.open("r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]

    i = 0
    curr_conf = None
    curr_label = None
    curr_id = None
    while i < len(lines):
        line = lines[i]

        m_cls = _CLASS_RE.match(line)
        if m_cls:
            curr_label = m_cls.group(1)
            curr_id = int(m_cls.group(2))
            i += 1
            continue

        m_conf = _CONF_RE.match(line)
        if m_conf:
            curr_conf = float(m_conf.group(1))
            i += 1
            continue

        if line.startswith("Bounding Box"):
            # Slicing (rather than indexing) tolerates a file that ends right
            # after the "Bounding Box" line.
            corner_lines = lines[i + 1:i + 3]  # Top-left, Bottom-right
            nums = _NUMBER_RE.findall(" ".join(corner_lines))
            if len(nums) >= 4 and curr_conf is not None and curr_label is not None and curr_id is not None:
                x1, y1, x2, y2 = map(float, nums[:4])
                boxes.append((
                    min(x1, x2), min(y1, y2),
                    max(x1, x2), max(y1, y2),
                    curr_conf,
                    curr_label,
                    curr_id
                ))
            # Reset so values cannot leak into the next Detection block.
            curr_conf = None
            curr_label = None
            curr_id = None
            i += 3
        else:
            i += 1

    return boxes


def intersects(b1: Box, b2: Box) -> bool:
    """Return True when the two boxes overlap with a non-empty area.

    Only the first four fields (x1, y1, x2, y2) are read. Boxes that merely
    share an edge or a corner are reported as not intersecting.
    """
    apart_horizontally = b1[2] <= b2[0] or b2[2] <= b1[0]
    apart_vertically = b1[3] <= b2[1] or b2[3] <= b1[1]
    return not (apart_horizontally or apart_vertically)


def merge_two(b1: Box, b2: Box) -> Box:
    """Combine two boxes into the smallest box that encloses both.

    The confidence of the result is the plain average of the two inputs; the
    label and class id are taken from ``b1``.
    """
    left, top = (min(pair) for pair in zip(b1[:2], b2[:2]))
    right, bottom = (max(pair) for pair in zip(b1[2:4], b2[2:4]))
    mean_conf = (b1[4] + b2[4]) / 2
    label, class_id = b1[5], b1[6]
    return (left, top, right, bottom, mean_conf, label, class_id)


def merge_figures(boxes: List[Box]) -> List[Box]:
    """Merge every group of mutually overlapping figure boxes into one box.

    Boxes whose label is ``figure`` are merged with ``merge_two`` whenever
    ``intersects`` reports an overlap, repeating until no two figure boxes
    overlap any more. All other boxes are passed through untouched.

    The label is compared after stripping whitespace and lower-casing, the
    same way the masking step lower-cases class names, so a padded or
    capitalised label is still treated as a figure.

    Args:
        boxes: parsed boxes of one annotation file.

    Returns:
        The merged figure boxes followed by the non-figure boxes.
    """
    def is_figure(box):
        return str(box[5]).strip().lower() == "figure"

    figures = [b for b in boxes if is_figure(b)]
    others = [b for b in boxes if not is_figure(b)]

    merged_any = True
    while merged_any:
        # A box that grew may now reach a box settled earlier in this pass,
        # so passes repeat until one completes without any merge.
        merged_any = False
        settled: List[Box] = []
        while figures:
            current = figures.pop()
            grew = True
            while grew:
                grew = False
                # Iterate over a copy because absorbed boxes are removed.
                for candidate in figures[:]:
                    if intersects(current, candidate):
                        current = merge_two(current, candidate)
                        figures.remove(candidate)
                        grew = True
                        merged_any = True
            settled.append(current)
        figures = settled

    return figures + others


def _read_header_fields(path: Path, default_image: str, default_size: str) -> Tuple[str, str]:
    """Return the ``Image:`` and ``Original Size:`` values found in *path*.

    Scanning stops at the first ``Detection`` line. A field that is absent or
    empty keeps the supplied default.
    """
    image, size = default_image, default_size
    with path.open("r", encoding="utf-8") as f:
        for raw in f:
            text = raw.strip()
            if text.startswith("Image:"):
                image = text[len("Image:"):].strip() or image
            elif text.startswith("Original Size:"):
                size = text[len("Original Size:"):].strip() or size
            elif text.startswith("Detection"):
                break
    return image, size


def process_folder(input_folder: str,
                   output_folder: str,
                   extensions: Tuple[str, ...] = (".txt", ".ann")) -> None:
    """Merge overlapping figure boxes in every annotation file of a folder.

    Each file in ``input_folder`` whose suffix is in ``extensions`` is parsed,
    its figure boxes are merged, and the result is written to
    ``output_folder`` as ``<original file name>.merged.txt`` in the same
    Detection-block layout as the input.

    The header carries over the source file's ``Image:`` and
    ``Original Size:`` values when present (falling back to the file stem and
    the ``WIDTH x HEIGHT`` placeholder). ``Total detections`` reports the
    number of boxes actually written, i.e. the count after merging.

    Args:
        input_folder: folder containing the annotation files.
        output_folder: folder that receives the merged files (created if
            missing).
        extensions: lower-case file suffixes to process.
    """
    in_path = Path(input_folder).expanduser().resolve()
    out_path = Path(output_folder).expanduser().resolve()
    out_path.mkdir(parents=True, exist_ok=True)

    for file_path in in_path.iterdir():
        if not file_path.is_file() or file_path.suffix.lower() not in extensions:
            continue

        raw_boxes = parse_annotation_file(file_path)
        merged_boxes = merge_figures(raw_boxes)
        image_name, original_size = _read_header_fields(
            file_path, file_path.stem, "WIDTH x HEIGHT"
        )

        out_file = out_path / (file_path.name + ".merged.txt")
        with out_file.open("w", encoding="utf-8") as f:
            f.write(f"Image: {image_name}\n")
            f.write(f"Original Size: {original_size}\n")
            f.write("--------------------------------------------------\n")
            # Count the boxes listed below, not the pre-merge detections.
            f.write(f"Total detections: {len(merged_boxes)}\n\n")
            for i, (x1, y1, x2, y2, confidence, class_name, class_id) in enumerate(merged_boxes):
                x_center = (x1 + x2) / 2
                y_center = (y1 + y2) / 2
                width = x2 - x1
                height = y2 - y1
                f.write(f"Detection {i+1}:\n")
                f.write(f"  Class: {class_name} (ID: {class_id})\n")
                f.write(f"  Confidence: {confidence:.4f}\n")
                f.write(f"  Bounding Box (pixels):\n")
                f.write(f"    Top-left: ({x1:.1f}, {y1:.1f})\n")
                f.write(f"    Bottom-right: ({x2:.1f}, {y2:.1f})\n")
                f.write(f"    Center: ({x_center:.1f}, {y_center:.1f})\n")
                f.write(f"    Size: {width:.1f} x {height:.1f}\n")
                f.write("\n")


if __name__ == "__main__":
    # Read the annotations in output_ann1 and write the merged versions to output_ann2.
    process_folder("output_ann1", "output_ann2", extensions=(".txt", ".ann"))


In [23]:
import os
from pathlib import Path
from PIL import Image, ImageDraw

def load_boxes(txt_path: Path):
    """Read a merged annotation file and return its boxes.

    Every line that consists of exactly four comma-separated numbers is
    returned as an ``(x1, y1, x2, y2)`` tuple of floats. All other lines are
    skipped, including four-field lines that are not numeric (such as a
    column header), which would otherwise abort the whole file with a
    ``ValueError``.

    NOTE(review): Detection-block files (``Top-left: (x, y)`` lines) contain
    no four-field lines, so they yield an empty list here - confirm which
    format the input folder is expected to hold.
    """
    boxes = []
    with txt_path.open('r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(',')
            if len(parts) != 4:
                continue
            try:
                x1, y1, x2, y2 = map(float, parts)
            except ValueError:
                # Four fields but not four numbers: not a box line.
                continue
            boxes.append((x1, y1, x2, y2))
    return boxes

def draw_boxes_on_image(img_path: Path, boxes, out_path: Path, color='red', width=3):
    """Draw rectangle outlines on an image and save the result.

    Args:
        img_path: image to open.
        boxes: iterable of ``(x1, y1, x2, y2)`` tuples.
        out_path: destination file.
        color: outline colour passed to ``ImageDraw.rectangle``.
        width: outline width passed to ``ImageDraw.rectangle``.
    """
    # Close the source file as soon as the RGB copy exists instead of leaving
    # the handle open until garbage collection.
    with Image.open(img_path) as src:
        img = src.convert('RGB')
    draw = ImageDraw.Draw(img)
    for (x1, y1, x2, y2) in boxes:
        # Order the corners; recent Pillow versions reject a rectangle whose
        # second corner lies before the first.
        left, right = sorted((x1, x2))
        top, bottom = sorted((y1, y2))
        draw.rectangle([left, top, right, bottom], outline=color, width=width)
    img.save(out_path)

def process_folder(input_folder: str, output_folder: str,
                   img_exts=('.jpg', '.png', '.jpeg'),
                   ann_ext='.txt.merged.txt'):
    """Draw the annotated boxes of every image in a folder.

    For each entry of ``input_folder`` whose suffix is in ``img_exts``, the
    annotation file is looked up next to the image by replacing the image
    suffix with ``ann_ext`` (``page_1.jpg`` -> ``page_1.txt.merged.txt`` with
    the defaults). When it exists, its boxes are drawn on the image and the
    result is saved under the same file name in ``output_folder``; otherwise
    the image is reported as skipped.
    """
    source_dir = Path(input_folder).expanduser().resolve()
    target_dir = Path(output_folder).expanduser().resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    for entry in source_dir.iterdir():
        if entry.suffix.lower() not in img_exts:
            continue

        annotation = entry.with_suffix(ann_ext)
        if not annotation.exists():
            print(f"[SKIP] 주석 파일 없음: {annotation.name}")
            continue

        rectangles = load_boxes(annotation)
        destination = target_dir / entry.name
        draw_boxes_on_image(entry, rectangles, destination)
        print(f"[OK] {entry.name} → {destination.name} ({len(rectangles)} boxes)")

if __name__ == '__main__':
    # Settings: folder holding the images and their annotations, and the output folder
    input_folder = 'merged_results'
    output_folder = './boxed_images'
    process_folder(input_folder, output_folder)


[OK] page_1.jpg → page_1.jpg (0 boxes)
[OK] page_10.jpg → page_10.jpg (0 boxes)
[OK] page_11.jpg → page_11.jpg (1 boxes)
[OK] page_12.jpg → page_12.jpg (1 boxes)
[OK] page_13.jpg → page_13.jpg (1 boxes)
[OK] page_14.jpg → page_14.jpg (2 boxes)
[OK] page_15.jpg → page_15.jpg (2 boxes)
[OK] page_16.jpg → page_16.jpg (2 boxes)
[OK] page_17.jpg → page_17.jpg (2 boxes)
[OK] page_18.jpg → page_18.jpg (1 boxes)
[OK] page_19.jpg → page_19.jpg (1 boxes)
[OK] page_2.jpg → page_2.jpg (1 boxes)
[OK] page_20.jpg → page_20.jpg (2 boxes)
[OK] page_21.jpg → page_21.jpg (2 boxes)
[OK] page_22.jpg → page_22.jpg (1 boxes)
[OK] page_23.jpg → page_23.jpg (1 boxes)
[OK] page_24.jpg → page_24.jpg (2 boxes)
[OK] page_25.jpg → page_25.jpg (1 boxes)
[OK] page_26.jpg → page_26.jpg (1 boxes)
[OK] page_27.jpg → page_27.jpg (1 boxes)
[OK] page_28.jpg → page_28.jpg (1 boxes)
[OK] page_29.jpg → page_29.jpg (2 boxes)
[OK] page_3.jpg → page_3.jpg (1 boxes)
[OK] page_30.jpg → page_30.jpg (1 boxes)
[OK] page_31.jpg → pag