In [1]:
import os
import re
import unicodedata
from collections import defaultdict
import difflib
import pandas as pd
from paddleocr import PaddleOCR




In [2]:
# Options for the single PaddleOCR engine reused by the cells below.
# The language model is 'ch', angle classification is on, engine logging is
# off, and inference is requested on GPU 0.
OCR_OPTIONS = dict(
    use_angle_cls=True,
    lang='ch',
    show_log=False,
    use_gpu=True,
    gpu_id=0,
    gpu_mem=4096,  # presumably a GPU memory budget in MB -- confirm against the PaddleOCR docs
)
ocr = PaddleOCR(**OCR_OPTIONS)


In [3]:
def strip_accents(s: str) -> str:
    """Return ``s`` with its diacritics removed.

    The text is decomposed with NFD and every combining mark (Unicode
    category ``Mn``) is dropped.  The Vietnamese letters ``đ``/``Đ`` carry a
    stroke instead of a combining mark and have no canonical decomposition,
    so they are folded to ``d``/``D`` explicitly; otherwise they would
    survive "accent stripping" as non-ASCII characters.

    Args:
        s: Text to normalise.

    Returns:
        The text without combining marks and with ``đ``/``Đ`` replaced by
        ``d``/``D``.  Characters without diacritics (including CJK) are
        returned unchanged.
    """
    decomposed = unicodedata.normalize("NFD", s)
    without_marks = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
    # NFD leaves the stroked d untouched, so fold it by hand.
    return without_marks.replace("đ", "d").replace("Đ", "D")

def is_chinese(text: str) -> bool:
    """Tell whether ``text`` has at least one character in U+4E00..U+9FFF."""
    for ch in text:
        if '\u4e00' <= ch <= '\u9fff':
            return True
    return False

def is_page_number_line(text: str) -> bool:
    """Tell whether ``text``, ignoring surrounding whitespace, is only 1-4 digits."""
    candidate = text.strip()
    matched = re.fullmatch(r"\d{1,4}", candidate)
    return matched is not None

# Detect the "Bức thư ..." title line that marks the start of the Vietnamese section
def looks_like_vietnamese(text: str) -> bool:
    """Fuzzy-check that ``text`` contains all six title keywords.

    The text is accent-stripped, lower-cased and split into ``[a-z0-9]+``
    tokens.  The line qualifies only when every keyword (``buc``, ``thu``,
    ``viet``, ``cho``, ``chinh``, ``minh``) has at least one token whose
    ``difflib.SequenceMatcher`` ratio against it is 0.5 or higher, which
    tolerates OCR misreadings.
    """
    words = re.findall(r"[a-z0-9]+", strip_accents(text).lower())
    keywords = ("buc", "thu", "viet", "cho", "chinh", "minh")
    cutoff = 0.5

    return all(
        any(
            difflib.SequenceMatcher(None, word, keyword).ratio() >= cutoff
            for word in words
        )
        for keyword in keywords
    )

# Extract the ID of a letter
def extract_id(text: str):
    """Return the letter ID announced in ``text`` as a digit string, or ``None``.

    Two anchors are recognised: the Chinese ordinal marker ``第`` and, in the
    accent-stripped lower-cased text, ``so`` (also accepted as ``s0``/``s6``,
    with or without a space before the number).  The characters following an
    anchor are collected, OCR look-alikes are mapped to digits
    (``O``/``o`` -> ``0``; ``I``/``i``/``l``/``|``/``!`` -> ``1``), whitespace
    inside the number is removed, and the result is accepted only when it has
    at least three digits.

    Every anchor in the line is tried in order, Chinese anchors first, so a
    rejected candidate (for example a two-digit number) does not hide a valid
    ID later in the same line.

    Args:
        text: One OCR line.

    Returns:
        The ID as a string of digits, or ``None`` when no anchor is followed
        by an acceptable number.
    """
    # OCR look-alikes and the digit each one stands for.
    lookalike_digits = str.maketrans("OoIil|!", "0011111")

    def norm_digits(tok: str):
        chunks = tok.split()
        # A trailing chunk without any real digit is the beginning of the
        # next word ("504 la", "504 o"), not part of the number.
        while chunks and not re.search(r"[0-9]", chunks[-1]):
            chunks.pop()

        joined = "".join(chunks).translate(lookalike_digits)  # "50 4" -> "504"
        digits = "".join(re.findall(r"\d+", joined))
        if len(digits) < 3:
            return None
        return digits

    # Chinese: 第xxx.  The captured run must start on a digit-like character,
    # so a whitespace-only capture can no longer count as a match.
    for m in re.finditer(r"第\s*([0-9OoIl|!][0-9OoIl|!\s]{0,19})", text):
        found = norm_digits(m.group(1))
        if found:
            return found

    # Vietnamese: so/s0/s6 ...  The text is lower-cased here, so the
    # look-alike class is written in lower case ("I" arrives as "i").
    norm = strip_accents(text).lower()
    for m in re.finditer(r"s[0o6]\s*([0-9oil|!][0-9oil|!\s]{0,19})", norm):
        found = norm_digits(m.group(1))
        if found:
            return found

    return None

In [4]:
def _new_letter_entry():
    """Build the empty record stored per letter ID: source lines, target lines, and a flag for whether Vietnamese text has started."""
    return {"src": [], "tgt": [], "vi_started": False}


# Letter ID -> collected lines; entries are created on first access.
DATA_MAP = defaultdict(_new_letter_entry)
# Parser state carried from one page image to the next.
STATE = dict(current_id=None, mode=None, pending_src=[])

# OCR
def process_image_to_data(img_path: str, data_map=DATA_MAP, state=STATE):
    """OCR one page image and fold its text lines into ``data_map``.

    The OCR blocks of the page are ordered by their top edge, then their left
    edge, and handled line by line:

    * lines that are empty or only a page number are skipped;
    * a line from which ``extract_id`` gets an ID becomes the current anchor:
      Chinese lines waiting in ``pending_src`` are moved to that ID's ``src``,
      and the mode becomes ``"zh"`` when the line contains Chinese and is not
      a Vietnamese title, ``"vi"`` otherwise (the first switch to ``"vi"``
      for an ID resets its ``tgt`` list);
    * while no ID is active, Chinese lines are parked in ``pending_src`` and
      everything else is dropped;
    * otherwise Chinese lines go to ``src`` in ``"zh"`` mode, non-Chinese
      lines go to ``tgt`` in ``"vi"`` mode, and a Chinese line met in ``"vi"``
      mode is parked and the current ID is cleared until the next anchor.

    Args:
        img_path: Path of the page image handed to the OCR engine.
        data_map: Mapping of letter ID to ``{"src", "tgt", "vi_started"}``;
            updated in place.
        state: Dict with ``current_id``, ``mode`` and ``pending_src``; read at
            the start and written back at the end so parsing continues across
            pages.

    Returns:
        None.  Returns early, leaving ``state`` untouched, when the OCR engine
        reports nothing for the page.
    """
    print(f"Đang xử lý: {img_path}...")
    result = ocr.ocr(img_path, cls=True)
    if not result or result[0] is None:
        return

    def reading_order(block):
        # block[0] is indexed as a sequence of (x, y) points: top edge first, then left edge.
        corners = block[0]
        return (min(pt[1] for pt in corners), min(pt[0] for pt in corners))

    # block[1][0] is used as the recognised text of a block.
    lines = [
        block[1][0].strip()
        for block in sorted(result[0], key=reading_order)
        if block and block[1] and block[1][0]
    ]

    current_id = state["current_id"]
    mode = state["mode"]
    pending_src = state["pending_src"]

    for text in lines:
        if not text or is_page_number_line(text):
            continue

        # Anchor line: either a Vietnamese title or any line carrying an ID.
        found_id = extract_id(text)
        if found_id:
            current_id = found_id
            entry = data_map[current_id]
            if pending_src:
                entry["src"].extend(pending_src)
                pending_src.clear()

            if is_chinese(text) and not looks_like_vietnamese(text):
                mode = "zh"
            else:
                mode = "vi"
                if not entry["vi_started"]:
                    entry["tgt"] = []
                    entry["vi_started"] = True
            continue

        # No ID yet: keep Chinese lines for the next anchor, drop the rest (e.g. Pinyin).
        if current_id is None:
            if is_chinese(text):
                pending_src.append(text)
            continue

        # Content lines, routed by the current mode.
        if not is_chinese(text):
            if mode == "vi":
                data_map[current_id]["tgt"].append(text)
            continue

        if mode == "vi":
            # Chinese after Vietnamese: park it and wait for a new ID.
            pending_src.append(text)
            current_id = None
            mode = "zh"
        else:
            data_map[current_id]["src"].append(text)

    state["current_id"] = current_id
    state["mode"] = mode
    state["pending_src"] = pending_src


In [5]:
IMG_DIR = "../data/image/PDF1"
START_PAGE = 203
END_PAGE = 250
OUT_DIR = "../data/preprocessing_data"
os.makedirs(OUT_DIR, exist_ok=True)

# OCR the pages of the range in order; pages whose image file is missing are skipped.
for page in range(START_PAGE, END_PAGE + 1):
    img_path = os.path.join(IMG_DIR, f"PDF1-{page}.png")
    if not os.path.exists(img_path):
        continue
    process_image_to_data(img_path, DATA_MAP, STATE)

# Join each letter's collected lines into one row; letters with no text on either side are dropped.
rows = []
for letter_id, parts in DATA_MAP.items():
    record = {
        "src_id": letter_id,
        "src_lang": " ".join(parts["src"]).strip(),
        "tgt_lang": " ".join(parts["tgt"]).strip(),
    }
    if record["src_lang"] or record["tgt_lang"]:
        rows.append(record)

df = pd.DataFrame(rows, columns=["src_id", "src_lang", "tgt_lang"])

if df.empty:
    min_id = max_id = 0
else:
    # IDs are digit strings; sort by their integer value through a temporary column.
    df = (
        df.assign(src_id_int=df["src_id"].astype(int))
        .sort_values("src_id_int")
        .drop(columns=["src_id_int"])
        .reset_index(drop=True)
    )
    min_id = int(df["src_id"].astype(int).min())
    max_id = int(df["src_id"].astype(int).max())

# The output file name carries the smallest and largest ID found.
out_csv = os.path.join(OUT_DIR, f"PDF1_{min_id}_{max_id}.csv")
df.to_csv(out_csv, index=False, encoding="utf-8-sig")
print("Saved:", out_csv)


Đang xử lý: ../data/image/PDF1\PDF1-203.png...
Đang xử lý: ../data/image/PDF1\PDF1-204.png...
Đang xử lý: ../data/image/PDF1\PDF1-205.png...
Đang xử lý: ../data/image/PDF1\PDF1-206.png...
Đang xử lý: ../data/image/PDF1\PDF1-207.png...
Đang xử lý: ../data/image/PDF1\PDF1-208.png...
Đang xử lý: ../data/image/PDF1\PDF1-209.png...
Đang xử lý: ../data/image/PDF1\PDF1-210.png...
Đang xử lý: ../data/image/PDF1\PDF1-211.png...
Đang xử lý: ../data/image/PDF1\PDF1-212.png...
Đang xử lý: ../data/image/PDF1\PDF1-213.png...
Đang xử lý: ../data/image/PDF1\PDF1-214.png...
Đang xử lý: ../data/image/PDF1\PDF1-215.png...
Đang xử lý: ../data/image/PDF1\PDF1-216.png...
Đang xử lý: ../data/image/PDF1\PDF1-217.png...
Đang xử lý: ../data/image/PDF1\PDF1-218.png...
Đang xử lý: ../data/image/PDF1\PDF1-219.png...
Đang xử lý: ../data/image/PDF1\PDF1-220.png...
Đang xử lý: ../data/image/PDF1\PDF1-221.png...
Đang xử lý: ../data/image/PDF1\PDF1-222.png...
Đang xử lý: ../data/image/PDF1\PDF1-223.png...
Đang xử lý: .

In [6]:
# Preview the first rows of the exported table.
# NOTE(review): in the recorded output the tgt_lang column has no Vietnamese
# diacritics and contains misread words, and ID 500 has an empty src_lang --
# presumably a limit of the lang='ch' OCR model on Vietnamese text; confirm
# before using these rows as translation pairs.
df.head()

Unnamed: 0,src_id,src_lang,tgt_lang
0,500,,yu ' y en q o nu d nu n nu xem lai xem minh...
1,501,懂得让步的人是聪明的，这是把决定事态走向的主动权握在了自己手 上。感情对抗战中，赢了面子就输...,"Nguoi thong minh la nguoi biet lui buoc, dieu ..."
2,502,说到底，女人还是要自强：不容易生病的身体、够用的收入、养心的 爱好、强大的小宇宙。拥有这些不...,"Suy cho cung, phu nu van can tu lap, can co su..."
3,503,感情就是这样，没有失去你不会成熟，生活也是这样，没有遇到点险 恶，你不会长大。你想要的，老天...,"Tinh cam la nhu vay d6, khong danh mat thi kho..."
4,504,能打动我的从来不是花言巧语，而是恰到好处的温柔以及真挚的内心。,Thur thyrc su lay dong trai tim em khong phai ...
