In [1]:
import pdfplumber

def extract_metadata_first_page(pdf_path) -> dict:
    """
    Extract the bill title, proposal date and proposers from a PDF's first page.

    The text of the first page is scanned line by line (blank lines skipped):

    * the FIRST line containing both "법률" and "개정" becomes "법령명";
      later lines that also match are ignored so that body text on the same
      page cannot overwrite the title;
    * a line containing "발의연월일" sets "발의일" to the text after the
      last colon;
    * a line containing "발 의 자" or "발의자" sets "발의자" the same way;
    * otherwise, a line containing "의원" fills "발의자" only while it is
      still empty (a fallback that a labelled line seen later replaces).

    Both the ASCII colon and the full-width colon (U+FF1A) are accepted as
    the separator between a label and its value.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        A dict with the keys "법령명", "발의일" and "발의자". A value stays
        ``None`` when nothing matched, including when the PDF has no pages
        or the first page yields no text.
    """
    metadata = {
        "법령명": None,
        "발의일": None,
        "발의자": None,
    }

    def _value_after_colon(labelled_line):
        # Normalise the full-width colon so one split handles both forms.
        return labelled_line.replace("\uff1a", ":").split(":")[-1].strip()

    with pdfplumber.open(pdf_path) as pdf:
        if not pdf.pages:
            # A PDF without pages has no metadata to read.
            return metadata
        first_text = pdf.pages[0].extract_text() or ""

    for raw_line in first_text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        if "법률" in line and "개정" in line:
            # Keep the first match only; later matches are not the title.
            if metadata["법령명"] is None:
                metadata["법령명"] = line
        elif "발의연월일" in line:
            metadata["발의일"] = _value_after_colon(line)
        elif "발 의 자" in line or "발의자" in line:
            metadata["발의자"] = _value_after_colon(line)
        elif "의원" in line and not metadata["발의자"]:
            metadata["발의자"] = line
    return metadata


In [2]:
def extract_table_from_syn_gu_page(pdf_path) -> list[list[str]]:
    """
    Extract the old/new comparison table from the first page carrying its heading.

    Pages are scanned in order. The first page whose text contains the
    heading "신구조문대비표" is selected; spaces and middle-dot characters
    are ignored while matching, so "신ㆍ구조문대비표" and
    "신 구 조 문 대 비 표" are recognised as well. Only that single page is
    read; tables on later pages are not collected.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        A list of ``[left, right]`` string pairs built from the first two
        cells of every table row on the selected page that has at least two
        cells. Empty or ``None`` cells become ``""`` and cell text is
        stripped. An empty list is returned when no page has the heading.
    """
    # Characters dropped before heading comparison: space plus common
    # middle-dot variants (U+318D, U+00B7, U+30FB, U+2027, U+2022).
    heading_noise = {ord(ch): None for ch in " \u318d\u00b7\u30fb\u2027\u2022"}

    def _clean(cell):
        return cell.strip() if cell else ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            if "신구조문대비표" not in page_text.translate(heading_noise):
                continue
            # First heading page only: return as soon as it has been read.
            return [
                [_clean(row[0]), _clean(row[1])]
                for table in page.extract_tables()
                for row in table
                if row and len(row) >= 2
            ]
    return []


In [3]:
import os
import pandas as pd

# Run both extractors on one sample bill and save the comparison table to Excel.
base_dir = "../data/"
pdf_path = os.path.join(base_dir, "no_upload/2210437_의사국 의안과_의안원문.pdf")

meta = extract_metadata_first_page(pdf_path)
table = extract_table_from_syn_gu_page(pdf_path)

print("📘 메타데이터:", meta)

# Derive the output name from the input file so the stem is written only once,
# and create the target folder first: to_excel does not create directories.
pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
output_dir = os.path.join(base_dir, "processed")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"{pdf_stem}_표출력결과.xlsx")

pd.DataFrame(table, columns=["현행", "개정안"]).to_excel(output_path, index=False)

📘 메타데이터: {'법령명': '헌법재판소법 일부개정법률안', '발의일': '2025. 5. 9.', '발의자': '윤준병ㆍ허성무ㆍ장종태'}


In [None]:
from collections import defaultdict

def extract_syn_gu_by_word(path, line_tol=2):
    """
    Rebuild the old/new comparison table from word positions on its heading page.

    The first page whose text contains the heading "신구조문대비표" is
    selected; spaces and middle-dot characters are ignored while matching,
    so "신ㆍ구조문대비표" is recognised too. Words on that page are grouped
    into lines by rounding their ``top`` coordinate to a multiple of
    ``line_tol``, and assigned to the left or right column depending on
    whether their ``x0`` lies before the horizontal midpoint of the page.
    Only that single page is processed.

    NOTE(review): a later cell in this notebook defines a function with the
    same name (default ``line_tol=3``); when cells run in order, that later
    definition replaces this one.

    Args:
        path: Path handed to ``pdfplumber.open``.
        line_tol: Bucket size applied to each word's ``top`` value.

    Returns:
        A list of ``[left_text, right_text]`` pairs ordered by line bucket;
        words within a column are joined with single spaces. An empty list
        is returned when no page has the heading.
    """
    import pdfplumber

    # Characters dropped before heading comparison: space plus common
    # middle-dot variants (U+318D, U+00B7, U+30FB, U+2027, U+2022).
    heading_noise = {ord(ch): None for ch in " \u318d\u00b7\u30fb\u2027\u2022"}

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            if "신구조문대비표" not in page_text.translate(heading_noise):
                continue

            midpoint = page.width / 2
            # line bucket -> (words in the left column, words in the right column)
            columns_by_top = {}
            for word in page.extract_words():
                top_key = round(word['top'] / line_tol) * line_tol
                left_words, right_words = columns_by_top.setdefault(top_key, ([], []))
                target = left_words if word['x0'] < midpoint else right_words
                target.append(word['text'])

            # First heading page only: return as soon as it has been read.
            return [
                [" ".join(left_words).strip(), " ".join(right_words).strip()]
                for _, (left_words, right_words) in sorted(columns_by_top.items())
            ]

    return []


In [4]:
from collections import defaultdict
import pdfplumber

def extract_syn_gu_by_word(path, line_tol=3):
    """
    Rebuild the old/new comparison table from word positions on its heading page.

    The first page whose text contains the heading "신구조문대비표" is
    selected; spaces and middle-dot characters are ignored while matching,
    so "신ㆍ구조문대비표" is recognised too. Each word on that page is put
    into a line bucket obtained by rounding its ``top`` coordinate to a
    multiple of ``line_tol``, and into the left or right column depending on
    whether its ``x0`` lies before the horizontal midpoint of the page.
    Only that single page is processed.

    Args:
        path: Path handed to ``pdfplumber.open``.
        line_tol: Bucket size applied to each word's ``top`` value.

    Returns:
        A list of ``[left_text, right_text]`` pairs ordered by line bucket;
        words within a column are joined with single spaces. An empty list
        is returned when no page has the heading.
    """
    # Characters dropped before heading comparison: space plus common
    # middle-dot variants (U+318D, U+00B7, U+30FB, U+2027, U+2022).
    heading_noise = {ord(ch): None for ch in " \u318d\u00b7\u30fb\u2027\u2022"}
    result_rows = []

    with pdfplumber.open(path) as pdf:
        # Lazily scan pages and stop at the first one showing the heading.
        heading_page = next(
            (
                page
                for page in pdf.pages
                if "신구조문대비표" in (page.extract_text() or "").translate(heading_noise)
            ),
            None,
        )
        if heading_page is None:
            return result_rows

        midpoint = heading_page.width / 2
        line_map = defaultdict(lambda: {"left": [], "right": []})
        for word in heading_page.extract_words():
            top_key = round(word['top'] / line_tol) * line_tol
            side = "left" if word['x0'] < midpoint else "right"
            line_map[top_key][side].append(word['text'])

        for top in sorted(line_map):
            columns = line_map[top]
            result_rows.append(
                [" ".join(columns["left"]).strip(), " ".join(columns["right"]).strip()]
            )

    return result_rows


In [5]:
from pathlib import Path
import pandas as pd

# Run the word-based extractor over every PDF in the input folder and collect
# one record per extracted row, tagged with the source file name.
pdf_dir = Path("../data/no_upload")
# Sorted because glob yields files in a filesystem-dependent order; sorting
# keeps the order of all_rows reproducible between runs and machines.
pdf_files = sorted(pdf_dir.glob("*.pdf"))
all_rows = []

for path in pdf_files:
    try:
        rows = extract_syn_gu_by_word(str(path))
        for left, right in rows:
            all_rows.append({"파일명": path.name, "현행": left, "개정안": right})
    except Exception as e:
        # Best effort: a failing file is recorded as an error row so that the
        # remaining files are still processed.
        all_rows.append({"파일명": path.name, "현행": f"[ERROR] {e}", "개정안": ""})


In [None]:
# Print every collected record. Each item is a single row dict, so the loop
# variable is named accordingly and no longer rebinds the `rows` list that the
# batch cell leaves behind.
for record in all_rows:
    print(record)