# EPUB 영어-한국어 문장 정렬 도구

영어와 한국어 EPUB 파일을 챕터 및 문장 단위로 정렬하여
번역 학습 데이터셋(JSONL)을 생성합니다.

```
EpubAligner
├── TextProcessor (텍스트 전처리 및 문장 분리)
│   ├── split_sentences (언어별 문장 분리)
│   └── clean_html (HTML에서 텍스트 추출)
├── EpubLoader (EPUB 파일 로더)
│   ├── load (EPUB에서 챕터 추출)
│   └── _extract_chapter (단일 챕터 추출)
├── DPAligner (동적 프로그래밍 기반 정렬)
│   ├── align_chapters (챕터 레벨 정렬)
│   ├── align_sentences (문장 레벨 정렬 (1:N, N:1 매칭 지원))
│   ├── _compute_chapter_embeddings (챕터 임베딩 계산 (처음 N문장 요약))
│   ├── _dp_align (기본 DP 정렬 (1:1 매칭))
│   ├── _build_sentence_dp (문장 정렬용 DP 테이블 (1:N, N:1 매칭))
│   └── _backtrack_sentences (문장 정렬 역추적)
├── OutputWriter (결과 파일 작성기)
│   ├── write_header
│   ├── write_chapter_header
│   └── write_result
├── Chapter (챕터 데이터)
├── AlignConfig (정렬 설정)
├── AlignmentResult (정렬 결과)
├── process_pair (단일 책 쌍 처리)
└── run (전체 파이프라인 실행)
```

In [1]:
import os
import glob
import json
import argparse
import re
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterator

import numpy as np
import torch
from bs4 import BeautifulSoup
from tqdm import tqdm
from colorama import init, Fore, Style

In [2]:
import ebooklib
from ebooklib import epub
from nltk.tokenize import sent_tokenize
import nltk
from kiwipiepy import Kiwi
from sentence_transformers import SentenceTransformer, util

In [15]:
# Initialization
# colorama setup; autoreset=True is passed so colour codes do not have to be reset by hand.
init(autoreset=True)
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Ensure the NLTK tokenizer data is present (SSL verification is bypassed for the download only)
def ensure_nltk_data():
    """Download the NLTK ``punkt_tab`` tokenizer data if it is not installed.

    When the data is already available nothing happens. Otherwise the data is
    downloaded with HTTPS certificate verification disabled, as a workaround
    for environments where the default certificate store is broken. The
    unverified context is installed only for the duration of the download and
    the previous default is always restored afterwards, so later HTTPS
    requests in the same process are verified again.
    """
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        pass
    else:
        return

    import ssl

    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_factory is None:
        # No unverified context available on this build: plain download.
        nltk.download('punkt_tab', quiet=True)
        return

    original_factory = ssl._create_default_https_context
    ssl._create_default_https_context = unverified_factory
    try:
        nltk.download('punkt_tab', quiet=True)
    finally:
        # Never leave certificate verification disabled process-wide.
        ssl._create_default_https_context = original_factory

# Runs when this cell/module is executed, before any text is tokenized with sent_tokenize.
ensure_nltk_data()

## 설정 및 상수

In [4]:
@dataclass
class AlignConfig:
    """Alignment settings."""
    # Directory searched for en_*.epub files and their kr_* counterparts.
    input_dir: str = 'data'
    # Torch device name; defaults to 'cuda' when available, otherwise 'cpu'.
    device: str = field(default_factory=lambda: 'cuda' if torch.cuda.is_available() else 'cpu')
    # Minimum similarity for a sentence pair to be written to the JSONL dataset.
    threshold: float = 0.65
    # Largest number of sentences merged on one side of a 1:k / k:1 match.
    max_merge_n: int = 5
    # Score bonus added per extra merged sentence in the sentence-level DP.
    merge_bonus: float = 0.001
    
    # Internal constants
    # Score added when a chapter is skipped in the chapter-level DP.
    chapter_skip_penalty: float = -0.2
    # Score added when a sentence is skipped in the sentence-level DP.
    sentence_skip_penalty: float = -0.3
    # Chapter matches whose similarity is not above this value are dropped.
    chapter_sim_threshold: float = 0.3
    # Documents whose cleaned text has fewer characters than this are ignored.
    min_chapter_length: int = 50
    # Number of leading sentences joined to form a chapter's summary text.
    chapter_summary_sentences: int = 10


@dataclass
class Chapter:
    """Chapter data."""
    # Chapter index; EpubLoader.load overwrites it with the position among kept chapters.
    idx: int
    # Cleaned plain text of the chapter.
    text: str
    # The chapter text split into sentences.
    sentences: list[str]
    # Optional embedding; nothing in the visible code assigns it.
    embedding: torch.Tensor | None = None


@dataclass
class AlignmentResult:
    """Alignment result for one group of sentences."""
    # '1:1', '1:k' or 'k:1' (k from 2 up to max_merge_n), or 'SKIP_EN'.
    # NOTE(review): 'SKIP_KR' was listed here originally, but the visible
    # backtracking code never emits a result for skipped Korean sentences.
    match_type: str
    # 0-based indices of the English sentences in this group.
    en_indices: list[int]
    # 0-based indices of the Korean sentences in this group (empty for 'SKIP_EN').
    kr_indices: list[int]
    # Similarity of the group; 0 for 'SKIP_EN'.
    similarity: float

## 텍스트 처리

In [5]:
class TextProcessor:
    """Text clean-up and sentence splitting helpers."""

    # Tags whose whole subtree is dropped before text extraction.
    REMOVE_TAGS = ['script', 'style', 'nav', 'sup', 'footer', 'header', 'table']

    def __init__(self):
        # Korean sentence splitter, created once and reused.
        self.kiwi = Kiwi()

    def clean_html(self, html: str) -> str:
        """Return the text content of *html* with whitespace collapsed.

        Falsy input yields an empty string. Elements listed in
        ``REMOVE_TAGS`` are removed first; every run of whitespace in the
        remaining text becomes a single space and the ends are stripped.
        """
        if html:
            document = BeautifulSoup(html, 'html.parser')
            for unwanted in document(self.REMOVE_TAGS):
                unwanted.decompose()

            raw_text = document.get_text()
            collapsed = re.sub(r'\s+', ' ', raw_text)
            return collapsed.strip()

        return ""

    def split_sentences(self, text: str, lang: str) -> list[str]:
        """Split *text* into sentences using the splitter for *lang*.

        ``'en'`` uses NLTK's ``sent_tokenize``; any other value uses Kiwi.
        Sentences with fewer than two non-whitespace-trimmed characters are
        dropped; the kept sentences are returned unmodified.
        """
        pieces = (
            sent_tokenize(text)
            if lang == 'en'
            else [part.text for part in self.kiwi.split_into_sents(text)]
        )
        return [piece for piece in pieces if len(piece.strip()) >= 2]

## EPUB 로더

In [6]:
class EpubLoader:
    """Reads an EPUB file and turns its spine documents into chapters."""

    def __init__(self, processor: TextProcessor, config: AlignConfig):
        self.config = config
        self.processor = processor

    def load(self, filepath: str, lang: str) -> list[Chapter]:
        """Extract the chapters of the EPUB at *filepath*.

        Spine entries are visited in order. Entries that do not produce a
        chapter are dropped, and the kept chapters are renumbered
        consecutively from 0. If the file cannot be read, an error line is
        printed and an empty list is returned.
        """
        try:
            book = epub.read_epub(filepath)
        except Exception as e:
            print(f"{Fore.RED}[Error] Failed to read {filepath}: {e}{Style.RESET_ALL}")
            return []

        kept: list[Chapter] = []
        for position, spine_entry in enumerate(book.spine):
            item_id, _ = spine_entry
            candidate = self._extract_chapter(book, item_id, position, lang)
            if not candidate:
                continue
            # Renumber so indices are dense over the chapters actually kept.
            candidate.idx = len(kept)
            kept.append(candidate)

        return kept

    def _extract_chapter(self, book, item_id: str, idx: int, lang: str) -> Chapter | None:
        """Build a chapter from one spine item, or return ``None``.

        ``None`` is returned when the item is missing, is not a document,
        its cleaned text is shorter than ``config.min_chapter_length``, or
        no sentences remain after splitting.
        """
        item = book.get_item_with_id(item_id)
        is_document = bool(item) and item.get_type() == ebooklib.ITEM_DOCUMENT
        if not is_document:
            return None

        markup = item.get_content().decode('utf-8', errors='ignore')
        plain = self.processor.clean_html(markup)
        if len(plain) < self.config.min_chapter_length:
            return None

        sentences = self.processor.split_sentences(plain, lang)
        return Chapter(idx=idx, text=plain, sentences=sentences) if sentences else None

## 동적 프로그래밍 정렬기

In [7]:
class DPAligner:
    """Dynamic-programming aligner for chapters and sentences.

    Chapters are aligned 1:1 (with skips) on the similarity of summary
    embeddings. Sentences are aligned with 1:1, 1:k and k:1 groups (k up to
    ``config.max_merge_n``) plus skips on either side.
    """

    def __init__(self, model: SentenceTransformer, config: AlignConfig):
        self.model = model
        self.config = config

    def align_chapters(
        self,
        en_chapters: list[Chapter],
        kr_chapters: list[Chapter]
    ) -> list[tuple[Chapter, Chapter]]:
        """Align chapters and return the matched (EN, KR) pairs in order.

        Returns an empty list when either side has no chapters.
        """
        if not en_chapters or not kr_chapters:
            return []

        n_en, n_kr = len(en_chapters), len(kr_chapters)
        print(f"{Fore.YELLOW}[Chapter] Aligning {n_en} EN ↔ {n_kr} KR chapters{Style.RESET_ALL}")

        # Embed the chapter summaries and compare every EN/KR combination.
        en_embs = self._compute_chapter_embeddings(en_chapters)
        kr_embs = self._compute_chapter_embeddings(kr_chapters)
        sim_matrix = util.cos_sim(en_embs, kr_embs).cpu().numpy()

        # Monotonic DP alignment over the similarity matrix.
        matches = self._dp_align(
            sim_matrix,
            skip_penalty=self.config.chapter_skip_penalty,
            sim_threshold=self.config.chapter_sim_threshold
        )

        return [(en_chapters[i], kr_chapters[j]) for i, j in matches]

    def align_sentences(
        self,
        en_sents: list[str],
        kr_sents: list[str]
    ) -> Iterator[AlignmentResult]:
        """Align sentences and yield the results in document order.

        Yields nothing when either list is empty.
        """
        if not en_sents or not kr_sents:
            return

        # Sentence embeddings and the plain 1:1 similarity matrix.
        en_emb = self.model.encode(en_sents, convert_to_tensor=True, device=self.config.device)
        kr_emb = self.model.encode(kr_sents, convert_to_tensor=True, device=self.config.device)
        sim_matrix = util.cos_sim(en_emb, kr_emb).cpu().numpy()

        # Fill the DP table.
        n_en, n_kr = len(en_sents), len(kr_sents)
        dp, backtrack = self._build_sentence_dp(en_emb, kr_emb, sim_matrix, n_en, n_kr)

        # Walk the table backwards to produce the results.
        yield from self._backtrack_sentences(dp, backtrack, sim_matrix, n_en, n_kr)

    def _compute_chapter_embeddings(self, chapters: list[Chapter]) -> torch.Tensor:
        """Embed each chapter from its first ``chapter_summary_sentences`` sentences."""
        summaries = [
            " ".join(c.sentences[:self.config.chapter_summary_sentences])
            for c in chapters
        ]
        return self.model.encode(summaries, convert_to_tensor=True, device=self.config.device)

    def _dp_align(
        self,
        sim_matrix: np.ndarray,
        skip_penalty: float,
        sim_threshold: float = 0.0
    ) -> list[tuple[int, int]]:
        """Basic monotonic DP alignment with 1:1 matches and skips.

        Args:
            sim_matrix: similarity of every (row, column) combination.
            skip_penalty: score added for skipping a row or a column.
            sim_threshold: matched cells whose similarity is not strictly
                above this value are left out of the returned list.

        Returns:
            Matched (row, column) index pairs in ascending order.
        """
        n_en, n_kr = sim_matrix.shape

        dp = np.full((n_en + 1, n_kr + 1), -np.inf)
        backtrack = np.zeros((n_en + 1, n_kr + 1), dtype=int)
        dp[0, 0] = 0

        # Actions: 0=Stop, 1=Match, 2=Skip_EN, 3=Skip_KR
        for i in range(n_en + 1):
            for j in range(n_kr + 1):
                if i == 0 and j == 0:
                    continue

                candidates = []

                if i > 0 and j > 0:
                    candidates.append((dp[i-1, j-1] + sim_matrix[i-1, j-1], 1))
                if i > 0:
                    candidates.append((dp[i-1, j] + skip_penalty, 2))
                if j > 0:
                    candidates.append((dp[i, j-1] + skip_penalty, 3))

                if candidates:
                    best_score, best_action = max(candidates, key=lambda x: x[0])
                    dp[i, j] = best_score
                    backtrack[i, j] = best_action

        # Backtrack from the bottom-right corner.
        matches = []
        i, j = n_en, n_kr

        while i > 0 or j > 0:
            action = backtrack[i, j]
            if action == 0:
                break
            elif action == 1:
                if sim_matrix[i-1, j-1] > sim_threshold:
                    matches.append((i-1, j-1))
                i -= 1
                j -= 1
            elif action == 2:
                i -= 1
            else:
                j -= 1

        matches.reverse()
        return matches

    @staticmethod
    def _window_means(emb: torch.Tensor, k: int) -> torch.Tensor | None:
        """Return the mean of every run of *k* consecutive rows of *emb*.

        Row ``s`` of the result is the mean of ``emb[s:s+k]``. Returns
        ``None`` when *emb* has fewer than *k* rows.
        """
        if emb.shape[0] < k:
            return None
        # unfold appends a trailing axis of length k holding each window.
        return emb.unfold(0, k, 1).mean(dim=-1)

    def _merged_similarities(
        self,
        en_emb: torch.Tensor,
        kr_emb: torch.Tensor
    ) -> tuple[dict[int, np.ndarray], dict[int, np.ndarray]]:
        """Precompute the similarities used by the 1:k and k:1 candidates.

        Returns:
            ``(kr_merge, en_merge)`` keyed by merge width ``k``:
            ``kr_merge[k][i, s]`` compares EN sentence ``i`` with the mean of
            KR sentences ``s .. s+k-1``; ``en_merge[k][s, j]`` compares the
            mean of EN sentences ``s .. s+k-1`` with KR sentence ``j``.
            A width is absent when that side has fewer than ``k`` sentences.
        """
        kr_merge: dict[int, np.ndarray] = {}
        en_merge: dict[int, np.ndarray] = {}

        for k in range(2, self.config.max_merge_n + 1):
            kr_windows = self._window_means(kr_emb, k)
            if kr_windows is not None:
                kr_merge[k] = util.cos_sim(en_emb, kr_windows).cpu().numpy()

            en_windows = self._window_means(en_emb, k)
            if en_windows is not None:
                en_merge[k] = util.cos_sim(en_windows, kr_emb).cpu().numpy()

        return kr_merge, en_merge

    def _build_sentence_dp(
        self,
        en_emb: torch.Tensor,
        kr_emb: torch.Tensor,
        sim_matrix: np.ndarray,
        n_en: int,
        n_kr: int
    ) -> tuple[np.ndarray, np.ndarray]:
        """Fill the sentence-alignment DP table (1:1, 1:k, k:1 and skips).

        Args:
            en_emb: EN sentence embeddings, one row per sentence.
            kr_emb: KR sentence embeddings, one row per sentence.
            sim_matrix: 1:1 similarities between EN and KR sentences.
            n_en: number of EN sentences.
            n_kr: number of KR sentences.

        Returns:
            ``(dp, backtrack)``: best cumulative score and chosen action for
            every prefix pair.
        """
        max_n = self.config.max_merge_n
        skip_penalty = self.config.sentence_skip_penalty
        merge_bonus = self.config.merge_bonus

        # The merged-window similarities are computed once, in batches,
        # instead of one mean + cosine call per DP cell and merge width.
        kr_merge, en_merge = self._merged_similarities(en_emb, kr_emb)

        dp = np.full((n_en + 1, n_kr + 1), -np.inf)
        backtrack = np.zeros((n_en + 1, n_kr + 1), dtype=int)
        dp[0, 0] = 0

        # Actions:
        # 1: 1:1, 2: Skip_EN, 3: Skip_KR
        # 10+k: 1:k (merge k KR), 20+k: k:1 (merge k EN)

        for r in range(n_en + 1):
            for c in range(n_kr + 1):
                if r == 0 and c == 0:
                    continue

                candidates = []

                # Skip
                if r > 0:
                    candidates.append((dp[r-1, c] + skip_penalty, 2))
                if c > 0:
                    candidates.append((dp[r, c-1] + skip_penalty, 3))

                # 1:1 match
                if r > 0 and c > 0:
                    candidates.append((dp[r-1, c-1] + sim_matrix[r-1, c-1], 1))

                # 1:k match (merge the k KR sentences ending at c)
                if r > 0:
                    for k in range(2, min(max_n, c) + 1):
                        sim = float(kr_merge[k][r-1, c-k])
                        bonus = merge_bonus * (k - 1)
                        candidates.append((dp[r-1, c-k] + sim + bonus, 10 + k))

                # k:1 match (merge the k EN sentences ending at r)
                if c > 0:
                    for k in range(2, min(max_n, r) + 1):
                        sim = float(en_merge[k][r-k, c-1])
                        bonus = merge_bonus * (k - 1)
                        candidates.append((dp[r-k, c-1] + sim + bonus, 20 + k))

                if candidates:
                    # max() keeps the first candidate on ties, so the
                    # append order above doubles as the tie-break order.
                    best_score, best_action = max(candidates, key=lambda x: x[0])
                    dp[r, c] = best_score
                    backtrack[r, c] = best_action

        return dp, backtrack

    def _backtrack_sentences(
        self,
        dp: np.ndarray,
        backtrack: np.ndarray,
        sim_matrix: np.ndarray,
        n_en: int,
        n_kr: int
    ) -> Iterator[AlignmentResult]:
        """Walk the DP table backwards and yield results in forward order.

        The similarity of each match is recovered from the difference of DP
        scores (minus the merge bonus for merged groups). Skipped EN
        sentences yield a ``'SKIP_EN'`` result with similarity 0; skipped KR
        sentences yield nothing. *sim_matrix* is accepted but not read.
        """
        results = []
        r, c = n_en, n_kr
        merge_bonus = self.config.merge_bonus

        while r > 0 or c > 0:
            action = backtrack[r, c]
            if action == 0:
                break

            if action == 1:  # 1:1
                sim = dp[r, c] - dp[r-1, c-1]
                results.append(AlignmentResult('1:1', [r-1], [c-1], sim))
                r -= 1
                c -= 1

            elif action == 2:  # Skip EN
                results.append(AlignmentResult('SKIP_EN', [r-1], [], 0))
                r -= 1

            elif action == 3:  # Skip KR
                c -= 1

            elif action >= 20:  # k:1 (merge EN)
                k = action - 20
                sim = dp[r, c] - dp[r-k, c-1] - merge_bonus * (k - 1)
                results.append(AlignmentResult(f'{k}:1', list(range(r-k, r)), [c-1], sim))
                r -= k
                c -= 1

            elif action >= 10:  # 1:k (merge KR)
                k = action - 10
                sim = dp[r, c] - dp[r-1, c-k] - merge_bonus * (k - 1)
                results.append(AlignmentResult(f'1:{k}', [r-1], list(range(c-k, c)), sim))
                r -= 1
                c -= k

        results.reverse()
        yield from results

## 출력 작성기

In [8]:
class OutputWriter:
    """Writes accepted pairs to a JSONL file and every decision to a log file.

    Both files are opened for writing (UTF-8) on construction. The writer can
    be used as a context manager, which closes both files on exit; ``close``
    may also be called directly.
    """

    def __init__(self, json_path: str, log_path: str, config: AlignConfig):
        self.config = config
        self.match_count = 0
        self.json_file = open(json_path, 'w', encoding='utf-8')
        try:
            self.log_file = open(log_path, 'w', encoding='utf-8')
        except Exception:
            # Do not leak the first handle when the second open fails.
            self.json_file.close()
            raise

    def __enter__(self) -> "OutputWriter":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Always close; exceptions from the with-body are not suppressed.
        self.close()

    def write_header(self, title: str):
        """Write the log banner for *title*."""
        self.log_file.write(f"=== Log for {title} (Max Merge: {self.config.max_merge_n}) ===\n\n")

    def write_chapter_header(self, en_idx: int, kr_idx: int):
        """Write the log heading for one matched chapter pair."""
        self.log_file.write(f"\n### Chapter Pair: EN[{en_idx}] ↔ KR[{kr_idx}] ###\n")

    def write_result(
        self,
        result: AlignmentResult,
        en_sents: list[str],
        kr_sents: list[str]
    ):
        """Log one alignment result and, if accepted, add it to the dataset.

        ``'SKIP_EN'`` results are only logged. Any other result is logged as
        MATCH or REJECT; it is written to the JSONL file and counted in
        ``match_count`` when its similarity is at least ``config.threshold``.
        """
        en_text = " ".join(en_sents[i] for i in result.en_indices)

        if result.match_type == 'SKIP_EN':
            self._write_skip(en_text, result.similarity)
            return

        kr_text = " ".join(kr_sents[i] for i in result.kr_indices)
        is_match = result.similarity >= self.config.threshold

        self._write_log(result, en_text, kr_text, is_match)

        if is_match:
            self._write_json(en_text, kr_text)
            self.match_count += 1

    def _write_skip(self, en_text: str, sim: float):
        """Log an EN sentence that has no KR counterpart."""
        self.log_file.write(
            f"[SKIP] (Score: {sim:.4f})\n"
            f" EN: {en_text}\n"
            f" KR: (No Match)\n"
            f"{'-'*50}\n"
        )

    def _write_log(self, result: AlignmentResult, en_text: str, kr_text: str, is_match: bool):
        """Log one candidate pair with its score, type and decision."""
        status = "MATCH" if is_match else "REJECT"
        self.log_file.write(
            f"[{status}] (Score: {result.similarity:.4f} | Type: {result.match_type})\n"
            f" EN: {en_text}\n"
            f" KR: {kr_text}\n"
            f"{'-'*50}\n"
        )

    def _write_json(self, en_text: str, kr_text: str):
        """Append one training record to the JSONL file."""
        data = {
            "system": "전문 번역가 스타일로 번역하세요.",
            "user": en_text,
            "assistant": kr_text
        }
        # NOTE(review): the log (not the JSONL file) is flushed here;
        # kept as in the original, confirm that this is intended.
        self.log_file.flush()
        self.json_file.write(json.dumps(data, ensure_ascii=False) + "\n")

    def close(self):
        """Close both output files; safe to call more than once."""
        try:
            self.json_file.close()
        finally:
            # Close the log even if closing the JSONL file raised.
            self.log_file.close()


## 메인 파이프라인

In [9]:
class EpubAligner:
    """EPUB alignment pipeline: load, align chapters, align sentences, write."""

    def __init__(self, config: AlignConfig):
        self.config = config

        print(f"{Fore.CYAN}[System] Loading models...{Style.RESET_ALL}")

        self.processor = TextProcessor()
        self.loader = EpubLoader(self.processor, config)

        model = SentenceTransformer('sentence-transformers/LaBSE', device=config.device)
        model.eval()
        self.aligner = DPAligner(model, config)

    def process_pair(self, en_path: str, kr_path: str):
        """Align one EN/KR book pair and write its dataset and log files.

        The title is the EN file stem without its leading ``en_``; output
        goes to ``dataset_<title>.jsonl`` and ``dataset_<title>.log`` in the
        current working directory.
        """
        # Only the leading prefix is removed; an 'en_' occurring inside the
        # title itself must be kept.
        title = Path(en_path).stem.removeprefix('en_')
        print(f"\n{Fore.BLUE}=== Processing: {title} ==={Style.RESET_ALL}")

        # Load chapters
        en_chapters = self.loader.load(en_path, 'en')
        kr_chapters = self.loader.load(kr_path, 'kr')
        print(f" - Loaded: EN={len(en_chapters)}, KR={len(kr_chapters)} chapters")

        # Align chapters
        matched_chapters = self.aligner.align_chapters(en_chapters, kr_chapters)
        print(f" - Matched: {len(matched_chapters)} chapter pairs")

        # Align sentences and write the output
        writer = OutputWriter(
            f"dataset_{title}.jsonl",
            f"dataset_{title}.log",
            self.config
        )
        try:
            writer.write_header(title)

            for en_chap, kr_chap in tqdm(matched_chapters, desc="Aligning"):
                writer.write_chapter_header(en_chap.idx, kr_chap.idx)

                for result in self.aligner.align_sentences(en_chap.sentences, kr_chap.sentences):
                    writer.write_result(result, en_chap.sentences, kr_chap.sentences)
        finally:
            # Close the files even when alignment fails part-way, so the
            # records written so far are not lost in unflushed buffers.
            writer.close()

        print(f"{Fore.GREEN} >> Done! Matches: {writer.match_count}{Style.RESET_ALL}")
        print(f" >> Log: dataset_{title}.log")

    def run(self):
        """Process every ``en_*.epub`` in the input directory that has a ``kr_*`` twin.

        The input directory is created when missing. EN files without a
        matching KR file are reported and skipped.
        """
        input_dir = Path(self.config.input_dir)
        input_dir.mkdir(parents=True, exist_ok=True)

        en_files = sorted(input_dir.glob("en_*.epub"))

        for en_file in en_files:
            # Swap only the leading prefix; the rest of the name is untouched.
            kr_file = en_file.with_name("kr_" + en_file.name.removeprefix("en_"))

            if kr_file.exists():
                self.process_pair(str(en_file), str(kr_file))
            else:
                print(f"{Fore.YELLOW}[Skip] No pair for {en_file.name}{Style.RESET_ALL}")


In [11]:
def parse_args(argv: list[str] | None = None) -> AlignConfig:
    """Build an ``AlignConfig`` from command-line arguments.

    Args:
        argv: argument list to parse. ``None`` (the default) parses
            ``sys.argv[1:]``. Pass an explicit list, e.g. ``[]``, when
            running inside a Jupyter kernel, whose own launcher arguments
            are otherwise rejected as unrecognized.

    Returns:
        The configuration; ``device`` keeps its auto-detected default unless
        ``--device`` was given.
    """
    parser = argparse.ArgumentParser(
        description="EPUB 영어-한국어 문장 정렬 도구",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--input_dir', default='data', help='EPUB 파일 디렉토리')
    parser.add_argument('--device', default=None, help='PyTorch device (auto-detect if not set)')
    parser.add_argument('--threshold', type=float, default=0.65, help='유사도 임계값')
    parser.add_argument('--max_merge_n', type=int, default=5, help='최대 병합 문장 수')
    parser.add_argument('--merge_bonus', type=float, default=0.001, help='병합 보너스')

    args = parser.parse_args(argv)

    config = AlignConfig(
        input_dir=args.input_dir,
        threshold=args.threshold,
        max_merge_n=args.max_merge_n,
        merge_bonus=args.merge_bonus
    )

    # Only override the auto-detected device when one was requested.
    if args.device:
        config.device = args.device

    return config


def main():
    """Parse the command line, build the pipeline and run it."""
    EpubAligner(parse_args()).run()


# Script entry point.
# NOTE(review): this guard is also true inside a notebook kernel, where
# argparse then rejects the kernel's own "-f <connection file>" argument.
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--input_dir INPUT_DIR] [--device DEVICE]
                             [--threshold THRESHOLD]
                             [--max_merge_n MAX_MERGE_N]
                             [--merge_bonus MERGE_BONUS]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/seongjungkim/Library/Jupyter/runtime/kernel-b8804d04-143c-4f78-9a90-87485c472642.json


SystemExit: 2

In [14]:
# Notebook driver: build the configuration by hand (no argparse) and run.
config = AlignConfig(input_dir="data", threshold=0.65, max_merge_n=5, merge_bonus=0.001)
if torch.cuda.is_available():
    config.device = 'cuda'
else:
    config.device = 'cpu'

aligner = EpubAligner(config)
aligner.run()

[System] Loading models...


Quantization is not supported for ArchType::neon. Fall back to non-quantized model.



=== Processing: Harry Potter and the Goblet of Fire ===
 - Loaded: EN=38, KR=43 chapters
[Chapter] Aligning 38 EN ↔ 43 KR chapters
 - Matched: 38 chapter pairs


Aligning: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 38/38 [19:49<00:00, 31.31s/it]


 >> Done! Matches: 5566
 >> Log: dataset_Harry Potter and the Goblet of Fire.log

=== Processing: Harry Potter and the Sorcerors Stone ===
 - Loaded: EN=18, KR=22 chapters
[Chapter] Aligning 18 EN ↔ 22 KR chapters
 - Matched: 17 chapter pairs


Aligning: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 17/17 [08:12<00:00, 28.99s/it]


 >> Done! Matches: 4485
 >> Log: dataset_Harry Potter and the Sorcerors Stone.log
