In [1]:
import re
import pdfplumber
import csv
from difflib import SequenceMatcher
import os

In [2]:
def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, joined by newlines.

    Pages for which ``extract_text()`` returns a falsy value (``None`` or an
    empty string) are skipped.

    Args:
        path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        One string with a single newline between the text of consecutive
        non-empty pages, or ``""`` when no page yields any text.
    """
    with pdfplumber.open(path) as pdf:
        # Extract each page exactly once. The earlier version called
        # extract_text() twice per page (once to filter, once to collect),
        # doubling the extraction work.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(chunk for chunk in page_texts if chunk)


def extract_law_name(text):
    """Find the statute name and amendment type in a bill's text.

    Searches for the first title of the form "한국은행법 일부개정법률안".

    Args:
        text: Text to search.

    Returns:
        A ``(name, revision_type)`` tuple. ``name`` is the captured part
        ending in "법"; ``revision_type`` is "일부", "전부", or ``None`` when
        the title has no such qualifier. When no title is found the result
        is ``("법령명 미확인", None)``.
    """
    found = re.search(r"([\w\d가-힣]+법)\s*(일부|전부)?개정법률안", text)
    if found is None:
        return "법령명 미확인", None
    return found.group(1), found.group(2)


def split_by_clause(text):
    """Split text into ``(clause_id, clause_body)`` pairs.

    A clause id is an article marker "제N조", optionally carrying a branch
    number ("제N조의M"), optionally followed by a paragraph ("제N항"), an item
    ("제N호" or "제N호의M") and a sub-item ("제N목") marker. The body is the
    text between one id and the next, with surrounding whitespace removed.

    The earlier pattern ignored branch numbers, so "제6조의2" was filed under
    the id "제6조" with "의2" leaking into the body.

    Note that every match starts a new clause, including cross-references
    that appear inside a clause's body; text before the first id is dropped.

    Args:
        text: Text to split.

    Returns:
        A list of ``(clause_id, clause_body)`` tuples in order of appearance;
        an empty list when no clause id is present.
    """
    pattern = (
        r"(제\d+조(?:의\d+)?"
        r"(?:\s*제\d+항)?"
        r"(?:\s*제\d+호(?:의\d+)?)?"
        r"(?:\s*제\d+목)?)"
    )
    # With a single capturing group re.split returns
    # [prefix, id1, body1, id2, body2, ...], so ids sit at the odd indexes and
    # each body directly follows its id.
    parts = re.split(pattern, text)
    return [
        (law_id.strip(), law_body.strip())
        for law_id, law_body in zip(parts[1::2], parts[2::2])
    ]


def clean_text(text):
    """Flatten text to a single line, dropping blank and boilerplate lines.

    A line is discarded when it is empty after stripping or when it contains
    any of the markers "현행과 같음", "------" or "생략". The remaining lines
    are stripped and joined with single spaces.

    Args:
        text: Possibly multi-line text.

    Returns:
        The cleaned, single-line string (``""`` when nothing survives).
    """
    markers = ("현행과 같음", "------", "생략")
    kept = []
    for raw in text.splitlines():
        stripped = raw.strip()
        if not stripped:
            continue
        if any(marker in raw for marker in markers):
            continue
        kept.append(stripped)
    return " ".join(kept)


def compare_clauses_v2(old_clauses, new_clauses, similarity_threshold=0.85):
    """Classify clauses as modified, added or deleted.

    Clause bodies are normalised with ``clean_text`` before comparison. For
    each new clause:

    * if its id is absent from ``old_clauses`` it is reported as "신설";
    * otherwise it is reported as "수정" when the ``SequenceMatcher`` ratio
      of the two cleaned bodies is below ``similarity_threshold``.

    Every old clause whose id does not occur in ``new_clauses`` is then
    reported as "삭제".

    When an id occurs more than once in ``old_clauses`` only the last body is
    used for the comparison, because the bodies are indexed by id.

    Args:
        old_clauses: Iterable of ``(clause_id, body)`` pairs (current text).
        new_clauses: Iterable of ``(clause_id, body)`` pairs (amended text).
        similarity_threshold: Ratio below which a clause counts as modified.

    Returns:
        A list of ``(clause_id, old_text, new_text, change_type)`` tuples:
        new-clause findings first, in ``new_clauses`` order, followed by
        deletions in ``old_clauses`` order.
    """
    old_dict = {cid: clean_text(text) for cid, text in old_clauses}
    new_ids = {cid for cid, _ in new_clauses}
    results = []

    for cid, new_text_raw in new_clauses:
        new_text = clean_text(new_text_raw)
        # Decide "added" by id membership, not by truthiness of the old text:
        # a clause whose cleaned old body is empty still exists in the old
        # version and must not be labelled "신설".
        if cid not in old_dict:
            results.append((cid, "", new_text, "신설"))
            continue
        old_text = old_dict[cid]
        sim = SequenceMatcher(None, old_text, new_text).ratio()
        if sim < similarity_threshold:
            results.append((cid, old_text, new_text, "수정"))

    for cid, old_text_raw in old_clauses:
        if cid not in new_ids:
            results.append((cid, clean_text(old_text_raw), "", "삭제"))

    return results


def save_to_csv(data, path, law_name=""):
    """Write comparison rows to a CSV file with a BOM-prefixed UTF-8 encoding.

    The file starts with the header
    ``법령명, 조문 ID, 구조문, 신조문, 변경유형``; each data row is prefixed
    with ``law_name``.

    Args:
        data: Iterable of row sequences (clause id, old text, new text,
            change type).
        path: Destination file; overwritten if it exists.
        law_name: Value written in the first column of every data row.
    """
    header = ["법령명", "조문 ID", "구조문", "신조문", "변경유형"]
    with open(path, mode="w", encoding="utf-8-sig", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows([law_name, *record] for record in data)


def batch_compare_folder(raw_folder, output_csv):
    """Compare the "old" and "new" PDFs in a folder and save the differences.

    Every ``*.pdf`` in ``raw_folder`` is read in sorted filename order. A file
    whose name contains "old" supplies the current clauses; otherwise a file
    whose name contains "new" supplies the amended clauses (a later file of
    the same kind replaces an earlier one). The statute name is taken from
    the first PDF in which a title can be recognised.

    Fixes over the earlier version:

    * the placeholder "법령명 미확인" is a truthy string, so the name lookup
      used to stop after the first PDF even when that PDF had no title; the
      lookup is now retried on later files until a title is found;
    * when one side is missing the CSV still contains only the header, but a
      warning is printed instead of failing silently.

    Args:
        raw_folder: Directory containing the PDF files.
        output_csv: Path of the CSV to write.

    Returns:
        ``None``. Nothing is written when the bill is a full revision
        ("전부").
    """
    unknown_name = "법령명 미확인"  # placeholder returned by extract_law_name()
    file_list = sorted(f for f in os.listdir(raw_folder) if f.endswith(".pdf"))

    law_name = ""
    name_resolved = False
    old_clauses, new_clauses = [], []

    for file in file_list:
        filepath = os.path.join(raw_folder, file)
        # Best effort per file: a PDF that cannot be read is reported and
        # skipped so that the remaining files are still processed.
        try:
            text = extract_text_from_pdf(filepath)
            if not name_resolved:
                candidate, revision_type = extract_law_name(text)
                if candidate != unknown_name:
                    law_name, name_resolved = candidate, True
                    if revision_type == "전부":
                        print(f"[예외처리] {law_name}은 전부개정으로 비교 생략")
                        return
                elif not law_name:
                    # Keep the placeholder for the CSV unless a later file
                    # provides a real title.
                    law_name = candidate
            if "old" in file:
                old_clauses = split_by_clause(text)
            elif "new" in file:
                new_clauses = split_by_clause(text)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    results = []
    if old_clauses and new_clauses:
        results = compare_clauses_v2(old_clauses, new_clauses)
    else:
        print(
            "[경고] old/new 조문을 모두 확보하지 못해 헤더만 있는 CSV를 저장합니다 "
            f"(old: {len(old_clauses)}건, new: {len(new_clauses)}건)"
        )

    save_to_csv(results, output_csv, law_name=law_name)

In [4]:
# Run the folder-based comparison: PDFs are read from ../data/no_upload and
# the result is written to the CSV given by output_csv.
batch_compare_folder(
    raw_folder="../data/no_upload", 
    output_csv="../data/processed/조문_비교결과_v2.csv"
)

In [5]:
import pandas as pd

# Load the comparison CSV produced above and re-export it as an Excel file
# (index=False keeps the DataFrame index out of the sheet).
file_path = "../data/processed/조문_비교결과_v2.csv"
df = pd.read_csv(file_path)

df.to_excel("../data/processed/조문_비교결과_v2.xlsx", index=False)

In [6]:
# Reload the CSV and inspect it.
df = pd.read_csv(file_path)

# Values of these two expressions are discarded here; nothing stores or
# prints them.
df.head()

df.columns

# NOTE(review): the recorded output of this call shows "RangeIndex: 0
# entries", i.e. the CSV contained only the header row.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   법령명     0 non-null      object
 1   조문 ID   0 non-null      object
 2   구조문     0 non-null      object
 3   신조문     0 non-null      object
 4   변경유형    0 non-null      object
dtypes: object(5)
memory usage: 132.0+ bytes


In [10]:
import re
import pdfplumber
import csv
from difflib import SequenceMatcher
import os


def extract_text_from_pdf(path):
    """Return the text of all pages of a PDF as one newline-joined string.

    Pages whose ``extract_text()`` result is falsy (``None`` or ``""``) are
    left out.

    Args:
        path: Location of the PDF; handed to ``pdfplumber.open`` as is.

    Returns:
        The page texts separated by single newlines, or ``""`` if no page
        produced text.
    """
    collected = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # One extraction per page; previously each page was extracted
            # twice (once for the filter and once for the join).
            page_text = page.extract_text()
            if page_text:
                collected.append(page_text)
    return "\n".join(collected)


def extract_law_name(text):
    """Extract the statute name and amendment type from a bill's text.

    The first title shaped like "한국은행법 일부개정법률안" is used.

    Args:
        text: Text to search.

    Returns:
        ``(name, revision_type)``: ``name`` is the captured part ending in
        "법" and ``revision_type`` is "일부", "전부" or ``None`` if the title
        has no qualifier. ``("법령명 미확인", None)`` is returned when no
        title matches.
    """
    title = re.search(r"([\w\d가-힣]+법)\s*(일부|전부)?개정법률안", text)
    # groups() yields exactly (name, revision_type) for this two-group pattern.
    return title.groups() if title else ("법령명 미확인", None)


def extract_table_section(text):
    """Return the text from the first line containing both "신" and "구".

    When no line contains both characters the whole text is returned. Lines
    are re-joined with ``"\\n"``, so a trailing newline in the input is not
    preserved.

    Args:
        text: Full document text.

    Returns:
        The selected lines joined by newlines.
    """
    rows = text.splitlines()
    first = 0
    for idx, row in enumerate(rows):
        if "신" in row and "구" in row:
            first = idx
            break
    return "\n".join(rows[first:])


def split_left_right(text):
    """Cut every line at a common character position into two columns.

    The cut position is half the length of the longest line (integer
    division). For each line, the part before the cut goes to the left column
    and the remainder to the right column; both parts are stripped.

    Text without any lines (e.g. ``""``) used to raise ``ValueError`` because
    ``max()`` received an empty sequence; it now yields two empty strings.

    Args:
        text: Multi-line text laid out as two side-by-side columns.

    Returns:
        A ``(left_text, right_text)`` tuple, each with one line per input
        line, joined by newlines.
    """
    lines = text.splitlines()
    # default=0 keeps empty input from crashing; the loop below is then a no-op.
    midpoint = max((len(line) for line in lines), default=0) // 2
    left_lines = [line[:midpoint].strip() for line in lines]
    right_lines = [line[midpoint:].strip() for line in lines]
    return "\n".join(left_lines), "\n".join(right_lines)


def split_by_clause(text):
    """Break text into ``(clause_id, clause_body)`` pairs.

    Recognised ids start with an article marker "제N조", which may carry a
    branch number ("제N조의M"), and may continue with paragraph ("제N항"), item
    ("제N호" / "제N호의M") and sub-item ("제N목") markers. Each body is the
    stripped text up to the next id.

    Branch numbers were not recognised before, so "제6조의2" was recorded as
    "제6조" and its body began with "의2".

    Every match opens a new clause, so a cross-reference inside a body also
    splits it; anything before the first id is discarded.

    Args:
        text: Text to split.

    Returns:
        ``(clause_id, clause_body)`` tuples in order of appearance, or an
        empty list when the text has no clause id.
    """
    pattern = (
        r"(제\d+조(?:의\d+)?"
        r"(?:\s*제\d+항)?"
        r"(?:\s*제\d+호(?:의\d+)?)?"
        r"(?:\s*제\d+목)?)"
    )
    pieces = re.split(pattern, text)
    # re.split with one capturing group alternates text and captured ids:
    # pieces[1], pieces[3], ... are ids and the element after each is its body.
    clauses = []
    for position in range(1, len(pieces), 2):
        clauses.append((pieces[position].strip(), pieces[position + 1].strip()))
    return clauses


def clean_text(text):
    """Reduce text to one space-joined line without boilerplate lines.

    Lines that are blank after stripping, or that contain "현행과 같음",
    "------" or "생략", are removed; the rest are stripped and joined with a
    single space.

    Args:
        text: Possibly multi-line text.

    Returns:
        The cleaned single-line string, ``""`` if every line is removed.
    """
    remove_keywords = ("현행과 같음", "------", "생략")

    def is_kept(candidate):
        # Keep a line only if it has content and mentions no boilerplate marker.
        if not candidate.strip():
            return False
        return not any(keyword in candidate for keyword in remove_keywords)

    return " ".join(entry.strip() for entry in text.splitlines() if is_kept(entry))


def compare_clauses_v2(old_clauses, new_clauses, similarity_threshold=0.85):
    """Label each clause as modified ("수정"), added ("신설") or deleted ("삭제").

    Bodies are normalised with ``clean_text`` first. A new clause whose id is
    missing from ``old_clauses`` is "신설". A new clause whose id exists on
    the old side is "수정" when the ``SequenceMatcher`` ratio of the cleaned
    bodies is below ``similarity_threshold``, and is otherwise not reported.
    Old clauses whose id is missing from ``new_clauses`` are "삭제".

    Old bodies are indexed by id, so for a repeated id only the last old body
    takes part in the comparison.

    Args:
        old_clauses: Iterable of ``(clause_id, body)`` pairs (current text).
        new_clauses: Iterable of ``(clause_id, body)`` pairs (amended text).
        similarity_threshold: Ratio below which a clause counts as modified.

    Returns:
        ``(clause_id, old_text, new_text, change_type)`` tuples: findings for
        ``new_clauses`` in their order, then deletions in ``old_clauses``
        order.
    """
    old_dict = {cid: clean_text(text) for cid, text in old_clauses}
    new_ids = {cid for cid, _ in new_clauses}
    results = []

    for cid, new_text_raw in new_clauses:
        new_text = clean_text(new_text_raw)
        if cid in old_dict:
            # Membership, not truthiness: an old clause whose cleaned body is
            # empty still exists and must be compared rather than reported
            # as newly added.
            old_text = old_dict[cid]
            sim = SequenceMatcher(None, old_text, new_text).ratio()
            if sim < similarity_threshold:
                results.append((cid, old_text, new_text, "수정"))
        else:
            results.append((cid, "", new_text, "신설"))

    results.extend(
        (cid, clean_text(old_text_raw), "", "삭제")
        for cid, old_text_raw in old_clauses
        if cid not in new_ids
    )
    return results


def save_to_csv(data, path, law_name="", revision_type=""):
    """Write comparison rows to a CSV file with a BOM-prefixed UTF-8 encoding.

    The header is ``법령명, 개정유형, 조문 ID, 구조문, 신조문, 변경유형``.
    Each data row is prefixed with ``law_name`` and ``revision_type``.

    Args:
        data: Iterable of row sequences (clause id, old text, new text,
            change type).
        path: Destination file; overwritten if it exists.
        law_name: Value for the first column of every data row.
        revision_type: Value for the second column of every data row.
    """
    header = ["법령명", "개정유형", "조문 ID", "구조문", "신조문", "변경유형"]
    with open(path, mode="w", encoding="utf-8-sig", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows([law_name, revision_type, *record] for record in data)


def process_single_pdf(pdf_path, output_csv):
    """Run the comparison pipeline on one bill PDF and save the result.

    Steps: extract the text, detect the statute name and amendment type,
    take the section from the comparison-table heading onward, cut it into a
    left and a right column, split each column into clauses (left is treated
    as the old text, right as the new text), compare them and write the CSV.

    For a full revision ("전부") only a notice is printed and no CSV is
    written.

    Args:
        pdf_path: Path of the bill PDF.
        output_csv: Path of the CSV to write.
    """
    full_text = extract_text_from_pdf(pdf_path)
    law_name, revision_type = extract_law_name(full_text)

    if revision_type == "전부":
        print(f"[예외처리] {law_name}은 전부개정으로 비교 생략")
    else:
        left_column, right_column = split_left_right(
            extract_table_section(full_text)
        )
        differences = compare_clauses_v2(
            split_by_clause(left_column),
            split_by_clause(right_column),
        )
        save_to_csv(differences, output_csv, law_name, revision_type)



In [11]:
# Run the single-PDF pipeline on one bill and write its comparison CSV.
process_single_pdf(
    "../data/no_upload/2205429_의사국 의안과_의안원문.pdf", 
    "../data/processed/조문_비교결과_v2_2.csv"
)

In [12]:
import pandas as pd

# Load the single-PDF comparison CSV and re-export it as an Excel file
# (index=False keeps the DataFrame index out of the sheet).
file_path = "../data/processed/조문_비교결과_v2_2.csv"
df = pd.read_csv(file_path)

df.to_excel("../data/processed/조문_비교결과_v2_2.xlsx", index=False)

# 🧪 조문 비교 파이프라인 v2

## 📌 목적
개정 법률안 PDF에서 신·구조문대비표를 추출하고,
- **법령명**
- **개정유형** (일부/전부)
- **조문 ID**
- **구조문 / 신조문**
- **변경유형** (신설/수정/삭제)

을 판별해 CSV로 저장하는 파이프라인

---

## 📂 전체 처리 흐름

1. **PDF 열기 및 텍스트 추출**  
   → `extract_text_from_pdf()`

2. **법령명 / 개정유형 추출**  
   → `extract_law_name()`

3. **신·구조문대비표 구간 추출**  
   → `extract_table_section()`

4. **신조문/구조문 좌우 분리**  
   → `split_left_right()`

5. **조문별 분리 (제X조·제X항·제X호·제X목 기준)**  
   → `split_by_clause()`

6. **불필요 문구 제거**  
   → `clean_text()`

7. **유사도 기반 비교 (수정/신설/삭제)**  
   → `compare_clauses_v2()`

8. **CSV 저장**  
   → `save_to_csv()`

---

## 📄 주요 컬럼 예시 (CSV 출력)

| 법령명       | 개정유형 | 조문 ID  | 구조문             | 신조문             | 변경유형 |
|--------------|----------|-----------|---------------------|---------------------|----------|
| 한국은행법    | 일부     | 제5조     | 자본금 30조         | 자본금 60조         | 수정     |
| 한국은행법    | 일부     | 제6조의2  | (없음)              | 한국은행은 다음을… | 신설     |
| 한국은행법    | 일부     | 제7조     | 지급준비금          | (없음)              | 삭제     |

---

## 🔧 특이사항 및 한계점

- "전부개정" 법률안은 비교하지 않고 안내 메시지만 출력함 (예외처리, CSV 미생성)
- 좌/우 분리는 가장 긴 줄 길이의 절반을 기준으로 모든 줄을 같은 **문자 위치**에서 자름 → **PDF 레이아웃 변화에 취약**
- 조문 매칭은 정규식 기반이며, 유사도는 `difflib.SequenceMatcher` 사용

---

## ✅ 실행 예시

```python
process_single_pdf(
    "./data/raw/2210568_의사국 의안과_의안원문.pdf",
    "./data/processed/조문_비교결과_v2.csv"
)
```

---

## 📦 추후 개선 방향 (v3 예고)

- 좌/우 위치를 **PDF 좌표 기준**으로 분리 (bounding box 활용)
- 조문 비교 정밀도 향상: BLIP-2, KoSimCSE 등 도입 검토
- 부칙 처리 및 부속 조항 매칭 향상
- `전부개정법률안`도 분석 가능하도록 확장

