In [1]:
import re
import pdfplumber
import csv
from difflib import SequenceMatcher

In [3]:
def extract_text_from_pdf(path):
    """Return the text of a PDF, with the text of each page joined by newlines.

    Args:
        path: Location of the PDF file; passed straight to ``pdfplumber.open``.

    Returns:
        str: The extracted text of every page that produced non-empty text,
        separated by a newline character. An empty string if no page
        produced any text.
    """
    with pdfplumber.open(path) as pdf:
        # Call extract_text() exactly once per page and reuse the result,
        # instead of extracting every page twice (once to filter, once to join).
        page_texts = [page.extract_text() for page in pdf.pages]
    # Skip pages whose extraction result is falsy (None or an empty string).
    return "\n".join(page_text for page_text in page_texts if page_text)


def split_by_clause(text):
    """Split statute text into (clause id, clause body) pairs.

    A clause id is an article marker such as "제3조", optionally carrying a
    branch-article suffix ("제3조의2") and/or a paragraph marker
    ("제3조 제1항"). Any text that precedes the first marker is discarded.

    Args:
        text: The full text to split.

    Returns:
        list[tuple[str, str]]: One ``(law_id, law_body)`` pair per marker, in
        document order, with surrounding whitespace stripped from both parts.
        An empty list if the text contains no marker.
    """
    # "(?:의\d+)?" keeps branch articles (제N조의M) as their own id; without it
    # "제1조의2" would be cut to "제1조" and the "의2" would leak into the body.
    pattern = r"(제\d+조(?:의\d+)?(?:\s*제\d+항)?)"
    # With a single capturing group re.split returns
    # [preamble, id_1, body_1, id_2, body_2, ...], so ids sit at the odd
    # indexes and each one is always followed by its body.
    parts = re.split(pattern, text)
    return [
        (law_id.strip(), law_body.strip())
        for law_id, law_body in zip(parts[1::2], parts[2::2])
    ]


def compare_clauses(old_clauses, new_clauses, similarity_threshold=0.8):
    """Classify the differences between two lists of clauses.

    Args:
        old_clauses: Iterable of ``(clause_id, text)`` pairs for the old
            version. It is iterated twice, so it must be re-iterable
            (e.g. a list). If an id occurs more than once, the last text
            is the one used for comparison.
        new_clauses: Iterable of ``(clause_id, text)`` pairs for the new
            version. Also iterated twice.
        similarity_threshold: A clause present in both versions is reported
            as modified only when the ``SequenceMatcher`` ratio of its two
            texts is strictly below this value.

    Returns:
        list[tuple[str, str, str, str]]: ``(clause_id, old_text, new_text,
        change_type)`` rows. Rows for new clauses come first, in
        ``new_clauses`` order, with change type "수정" (modified) or
        "신설" (newly added); rows for clauses missing from the new version
        follow, in ``old_clauses`` order, with change type "삭제" (deleted).
        Clauses whose similarity reaches the threshold produce no row.
    """
    results = []
    old_dict = {cid: text for cid, text in old_clauses}
    new_ids = {cid for cid, _ in new_clauses}

    for cid, new_text in new_clauses:
        old_text = old_dict.get(cid)
        # Test for presence, not truthiness: an old clause whose body is the
        # empty string still exists and must be compared, not reported as new.
        if old_text is not None:
            sim = SequenceMatcher(None, old_text, new_text).ratio()
            if sim < similarity_threshold:
                results.append((cid, old_text, new_text, "수정"))
        else:
            results.append((cid, "", new_text, "신설"))

    for cid, old_text in old_clauses:
        if cid not in new_ids:
            results.append((cid, old_text, "", "삭제"))

    return results

def save_to_csv(data, path):
    """Write comparison rows to a CSV file, preceded by a fixed header row.

    Args:
        data: Iterable of rows; each row is an iterable of cell values.
        path: Destination file. It is opened in write mode, so an existing
            file at this path is overwritten.
    """
    header_row = ["조문 ID", "구조문", "신조문", "변경유형"]
    # utf-8-sig prepends a byte-order mark; newline="" leaves line-ending
    # handling entirely to the csv module.
    with open(path, mode="w", encoding="utf-8-sig", newline="") as csv_file:
        csv_out = csv.writer(csv_file)
        csv_out.writerow(header_row)
        for record in data:
            csv_out.writerow(record)

In [5]:
import os

# NOTE(review): these are machine-specific absolute paths; point them at the
# local data directories before running this cell on another machine.
base_path = r"C:\Jimin\cg_DeltaLaw\data\raw"
output_path = r"C:\Jimin\cg_DeltaLaw\data\processed"
old_pdf_path = os.path.join(base_path, "2205429_의사국 의안과_의안원문.pdf")
new_pdf_path = os.path.join(base_path, "new.pdf")

# Fail fast with a clear message if an input is missing, before any PDF is
# parsed, rather than failing after the first file has already been processed.
for pdf_path in (old_pdf_path, new_pdf_path):
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"Input PDF not found: {pdf_path}")

old_text = extract_text_from_pdf(old_pdf_path)
new_text = extract_text_from_pdf(new_pdf_path)

old_clauses = split_by_clause(old_text)
new_clauses = split_by_clause(new_text)

results = compare_clauses(old_clauses, new_clauses)

# Write the report into output_path (previously defined but never used, so the
# CSV was written to the kernel's current working directory instead).
os.makedirs(output_path, exist_ok=True)
save_to_csv(results, os.path.join(output_path, "조문_비교결과.csv"))

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Jimin\\cg_DeltaLaw\\data\\raw\\new.pdf'