In [9]:
import fitz  # PyMuPDF
import pandas as pd
import os
import re

# ✅ 1. 메타데이터 추출
def extract_meta_info(text: str) -> dict:
    law_name = ""
    revision_type = "일부"
    bill_number = ""
    proposal_date = ""

    lines = text.split("\n")
    for line in lines:
        if "법률안" in line and not law_name:
            law_name = line.strip()
            if "전부개정" in law_name:
                revision_type = "전부"
        if "의안번호" in line:
            bill_number = re.sub(r"[^0-9]", "", line)
        if "발의연월일" in line or "발의일" in line:
            proposal_date = re.sub(r"[^0-9]", "", line)

    return {
        "법령명": law_name,
        "개정유형": revision_type,
        "의안번호": bill_number,
        "발의일": proposal_date
    }

# ✅ 2. 신구조문이 포함된 페이지 탐색
def extract_clause_table_pages(doc):
    result_pages = []
    for page_num in range(len(doc)):
        text = doc[page_num].get_text()
        if "신·구조문대비표" in text or "신·구 조문 대비표" in text:
            result_pages.append(page_num)
    return result_pages

# ✅ 3. 블록 텍스트 추출 (x0 좌표 기준으로)
def extract_text_blocks(doc, pages):
    clauses = []
    for page_num in pages:
        blocks = doc[page_num].get_text("blocks")
        for block in blocks:
            x0, y0, x1, y1, text, block_no = block
            if len(text.strip()) > 5:
                clauses.append((x0, text.strip()))
    return clauses

# ✅ 4. 좌우 구분
def classify_blocks(clauses):
    left_texts = []
    right_texts = []
    for x0, text in clauses:
        if x0 < 250:
            left_texts.append(text)
        elif x0 > 300:
            right_texts.append(text)
    return left_texts, right_texts

# ✅ 5. 변경유형 분류
def classify_change_type(old_text, new_text):
    if "(현행과 같음)" in new_text or "---" in new_text:
        return "변동" if old_text != new_text else "유지"
    elif "(신설)" in old_text or "없음" in old_text:
        return "신설"
    elif "(삭제)" in new_text or "없음" in new_text:
        return "삭제"
    else:
        return "변동" if old_text != new_text else "유지"

# ✅ 6. 조/항/호/목 분리
def split_article_id(article_id):
    parts = {"조": "", "항": "", "호": "", "목": ""}
    if "조" in article_id:
        parts["조"] = re.findall(r"(제?\d+조[\w]*)", article_id)[0] if re.findall(r"(제?\d+조[\w]*)", article_id) else ""
    if "항" in article_id:
        parts["항"] = re.findall(r"(제?\d+항)", article_id)[0] if re.findall(r"(제?\d+항)", article_id) else ""
    if "호" in article_id:
        parts["호"] = re.findall(r"(제?\d+호)", article_id)[0] if re.findall(r"(제?\d+호)", article_id) else ""
    if "목" in article_id:
        parts["목"] = re.findall(r"(제?\d+목)", article_id)[0] if re.findall(r"(제?\d+목)", article_id) else ""
    return parts

# ✅ 7. 메인 처리 함수
def process_single_pdf_v5(pdf_path: str) -> pd.DataFrame:
    doc = fitz.open(pdf_path)
    full_text = "\n".join([page.get_text() for page in doc])
    meta = extract_meta_info(full_text)
    if meta["개정유형"] == "전부":
        print(f"[PASS] 전부개정법률안 제외됨: {os.path.basename(pdf_path)}")
        return pd.DataFrame()

    target_pages = extract_clause_table_pages(doc)
    blocks = extract_text_blocks(doc, target_pages)
    left_texts, right_texts = classify_blocks(blocks)

    data = []
    for old, new in zip(left_texts, right_texts):
        article_id_match = re.search(r"(제?\d+조[\w의\d]*)", new)
        article_id = article_id_match.group(1) if article_id_match else ""
        article_id = article_id.replace("~", "").strip()
        levels = split_article_id(article_id)
        change_type = classify_change_type(old, new)
        if change_type == "유지":
            continue

        data.append({
            "법령명": meta["법령명"],
            "조문ID": article_id,
            "조": levels["조"],
            "항": levels["항"],
            "호": levels["호"],
            "목": levels["목"],
            "기존": old,
            "변경": new,
            "변경유형": change_type,
            "파일이름": os.path.basename(pdf_path)
        })

    return pd.DataFrame(data)


In [10]:
import pandas as pd

df = process_single_pdf_v5("../data/no_upload/2210568_의사국 의안과_의안원문.pdf")
df.to_csv("../data/processed/조문_비교결과_v5.csv", index=False)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [8]:
new_df = pd.read_csv("../data/processed/조문_비교결과_v5.csv")

new_df.info()


EmptyDataError: No columns to parse from file

In [None]:

new_df.head()

new_df.columns