✅ 개선 사항 정리

| 항목              | 반영 여부 | 설명                                                                  |
| --------------- | ----- | ------------------------------------------------------------------- |
| 조문 ID·본문의 `~` 문자 제거 | ✅     | `replace("~", "")`로 `~` 문자만 삭제 (조문 자체는 유지)                          |
| 조문 ID 계층 구조 분리  | ✅     | 조/항/호/목 구분, `의`까지 포함                                                |
| CSV 컬럼 구조 개선    | ✅     | `법령명, 조문ID, 조, 항, 호, 목, 기존, 변경, 변경유형, 파일이름`                         |
| 전부개정 PASS       | ✅     | `전부개정`일 경우 안내 메시지를 `print`하고 비교 및 CSV 저장을 생략                      |
| 파일이름 포함         | ✅     | `os.path.basename(pdf_path)` 활용                                     |
| 함수 명세 일관성       | ✅     | `compare_clauses_v4`, `save_to_csv_v4`, `process_single_pdf_v4`로 변경 |


In [1]:
import re
import pdfplumber
import csv
from difflib import SequenceMatcher
import os


def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at *path*, joined by newlines.

    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are left out of the result.
    """
    with pdfplumber.open(path) as pdf:
        # Run the extraction exactly once per page; the previous version
        # called extract_text() twice (once to filter, once for the value).
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)


def extract_meta(text):
    """Collect bill metadata from the raw text of a bill document.

    The returned dict only contains keys whose pattern matched:

    * "법령명"   - law name captured from the bill title.
    * "개정유형" - "일부" or "전부"; ``None`` when the title matched but
      carried neither word.
    * "의안번호" - bill number.
    * "발의연월일" - proposal date, as the run of digits, dots and hyphens
      following the label.
    """
    found = {}

    title = re.search(r"([\w\d가-힣]+법)\s*(일부|전부)?개정법률안", text)
    if title is not None:
        # Both keys are set together; the optional group yields None.
        found["법령명"], found["개정유형"] = title.groups()

    single_value_patterns = (
        ("의안번호", r"의안[\s:：]*번호[\s:：]*(제?\d+호)"),
        ("발의연월일", r"발의[\s:：]*연월일[\s:：]*([\d\.\-]+)"),
    )
    for key, pattern in single_value_patterns:
        hit = re.search(pattern, text)
        if hit is not None:
            found[key] = hit.group(1)

    return found


def extract_table_section(text):
    """Return *text* starting at the first line containing both "신" and "구".

    When no line contains both characters the whole text is returned. Lines
    are re-joined with "\\n" in either case.
    """
    rows = text.splitlines()
    first = 0
    for idx, row in enumerate(rows):
        if "신" in row and "구" in row:
            first = idx
            break
    return "\n".join(rows[first:])


def split_left_right(text):
    """Split every line of *text* at a fixed column into left and right halves.

    The split column is half the length of the longest line. Each half is
    stripped of surrounding whitespace, and the halves are re-joined with
    newlines.

    Returns:
        A ``(left_text, right_text)`` tuple. Empty input yields ``("", "")``.
    """
    lines = text.splitlines()
    # default=0 keeps text without any lines from raising ValueError.
    midpoint = max((len(line) for line in lines), default=0) // 2
    left_lines = [line[:midpoint].strip() for line in lines]
    right_lines = [line[midpoint:].strip() for line in lines]
    return "\n".join(left_lines), "\n".join(right_lines)


def split_by_clause(text):
    """Break *text* into ``(clause_id, body)`` pairs.

    A clause ID is "제N조", optionally followed by "의N" and by 항/호/목
    parts. The body is the stripped text up to the next clause ID. Anything
    before the first clause ID is discarded.
    """
    clause_re = r"(제\d+조(?:의\d+)?(?:\s*제\d+항)?(?:\s*제\d+호)?(?:\s*제\d+목)?)"
    pieces = re.split(clause_re, text)
    # With a single capture group the pieces alternate as
    # [preamble, id, body, id, body, ...], so IDs sit at odd indexes.
    ids = pieces[1::2]
    bodies = pieces[2::2]
    return [
        (clause_id.strip().replace("~", ""), body.strip())
        for clause_id, body in zip(ids, bodies)
    ]


def clean_text(text):
    """Flatten *text* into one line, dropping blank and filler lines.

    Lines containing "현행과 같음", "------" or "생략" are removed. The
    remaining lines are stripped, joined with single spaces, and every "~"
    character is deleted.
    """
    noise_markers = ("현행과 같음", "------", "생략")
    kept = []
    for raw in text.splitlines():
        stripped = raw.strip()
        if not stripped:
            continue
        if any(marker in raw for marker in noise_markers):
            continue
        kept.append(stripped)
    return " ".join(kept).replace("~", "")


def parse_clause_id(clause_id):
    """Decompose a clause ID into ``(조, 항, 호, 목)`` strings.

    The 조 part carries its "의N" branch number when present (e.g. "2의3").
    Levels that are absent come back as empty strings; an ID that does not
    start with "제N조" yields four empty strings.
    """
    matched = re.match(
        r"제(\d+)조(?:의(\d+))?(?:\s*제(\d+)항)?(?:\s*제(\d+)호)?(?:\s*제(\d+)목)?",
        clause_id,
    )
    if matched is None:
        return "", "", "", ""

    # Unmatched optional groups are None; normalise them to "".
    article, branch, paragraph, item, subitem = (
        part or "" for part in matched.groups()
    )
    if branch:
        article = f"{article}의{branch}"
    return article, paragraph, item, subitem


def compare_clauses_v4(old_clauses, new_clauses, law_name, revision_type, filename, threshold=0.85):
    """Diff old and new clause lists and return one row per detected change.

    Args:
        old_clauses: ``(clause_id, raw_text)`` pairs from the current law.
        new_clauses: ``(clause_id, raw_text)`` pairs from the amendment.
        law_name: Value for the first column of every row.
        revision_type: Accepted for signature compatibility; not used here.
        filename: Value for the last column of every row.
        threshold: Similarity ratio below which a clause present on both
            sides is reported as modified.

    Returns:
        A list of rows shaped
        ``[law_name, clause_id, 조, 항, 호, 목, old_text, new_text, type, filename]``
        where type is "수정", "신설" or "삭제". Rows for new-side clauses come
        first, followed by deletions.
    """

    def make_row(cid, before, after, change_type):
        # Single place that fixes the column order of a result row.
        return [law_name, cid, *parse_clause_id(cid), before, after, change_type, filename]

    cleaned_old = {cid: clean_text(body) for cid, body in old_clauses}
    ids_in_new = {cid for cid, _ in new_clauses}
    rows = []

    for cid, raw_new in new_clauses:
        after = clean_text(raw_new)
        before = cleaned_old.get(cid)
        if not before:
            # Missing on the old side, or cleaned down to an empty string:
            # either way the clause is reported as newly added.
            rows.append(make_row(cid, "", after, "신설"))
            continue
        if SequenceMatcher(None, before, after).ratio() < threshold:
            rows.append(make_row(cid, before, after, "수정"))

    rows.extend(
        make_row(cid, clean_text(raw_old), "", "삭제")
        for cid, raw_old in old_clauses
        if cid not in ids_in_new
    )
    return rows


def save_to_csv_v4(data, path):
    """Write the header row followed by the rows in *data* to a CSV at *path*.

    The file is overwritten and encoded as UTF-8 with a BOM ("utf-8-sig").
    """
    header = ["법령명", "조문ID", "조", "항", "호", "목", "기존", "변경", "변경유형", "파일이름"]
    with open(path, mode="w", encoding="utf-8-sig", newline="") as out:
        csv.writer(out).writerows([header, *data])


def process_single_pdf_v4(pdf_path, output_csv):
    """Run the clause comparison for one bill PDF and save it as CSV.

    The PDF text is read and its metadata extracted. If the amendment type is
    "전부", a notice is printed and nothing is written. Otherwise the text
    from the comparison-table marker onward is split into old/new columns,
    broken into clauses, compared, and the result is written to *output_csv*.

    Returns:
        The metadata dict produced by ``extract_meta`` (in both cases).
    """
    source_name = os.path.basename(pdf_path)
    full_text = extract_text_from_pdf(pdf_path)
    meta = extract_meta(full_text)

    if meta.get("개정유형") == "전부":
        print(f"[예외처리] {meta.get('법령명', '알 수 없음')}은 전부개정으로 비교 생략")
        return meta

    old_column, new_column = split_left_right(extract_table_section(full_text))
    rows = compare_clauses_v4(
        split_by_clause(old_column),
        split_by_clause(new_column),
        meta.get("법령명", ""),
        meta.get("개정유형", ""),
        source_name,
    )
    save_to_csv_v4(rows, output_csv)
    return meta


In [2]:
# Run the comparison for one bill PDF; the change rows land in the CSV.
meta = process_single_pdf_v4(
    pdf_path="../data/no_upload/2210568_의사국 의안과_의안원문.pdf",
    output_csv="../data/processed/조문_비교결과_v4.csv",
)

In [3]:
# Load the comparison CSV written by the previous cell for a quick check.
import pandas as pd

file_path = "../data/processed/조문_비교결과_v4.csv"

df = pd.read_csv(file_path)

# Preview the first rows.
df.head()

# List the column labels.
df.columns

# Print per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   법령명     5 non-null      object 
 1   조문ID    5 non-null      object 
 2   조       5 non-null      object 
 3   항       0 non-null      float64
 4   호       0 non-null      float64
 5   목       0 non-null      float64
 6   기존      5 non-null      object 
 7   변경      1 non-null      object 
 8   변경유형    5 non-null      object 
 9   파일이름    5 non-null      object 
dtypes: float64(3), object(7)
memory usage: 532.0+ bytes


In [4]:
# Convert the comparison CSV into an Excel workbook.
file_path = "../data/processed/조문_비교결과_v4.csv"
# Read every column as text: with default type inference the 항/호/목
# columns become float64, so an identifier such as "1" would be exported
# as 1.0. Empty fields are still read as missing values.
df = pd.read_csv(file_path, dtype=str)

df.to_excel("../data/processed/조문_비교결과_v4.xlsx", index=False)