In [1]:
import os, re, json, math, random
from typing import List, Dict, Tuple, Optional
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Config
INPUT_PATH = "../data/preprocessing_data/json2.csv" # Thay đường dẫn input_path để chạy các file khác nhau
OUT_DIR = "../output"
os.makedirs(OUT_DIR, exist_ok=True)

MAX_N = 5                 # 1-n alignment (1-1, 1-2, 1-3)
BATCH_SIZE = 16           # encode batch size
ALPHA_LEN = 0.05          # penalty độ dài 
END_TGT_PENALTY = 0.05    # phạt nhẹ phần tgt dư không dùng
AUDIT_SEED = 0
AUDIT_RATE = 0.10

In [3]:
# Load input
def load_input_any(path: str) -> pd.DataFrame:
    ext = os.path.splitext(path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(path)
    elif ext == ".json":
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict) and "data" in data and isinstance(data["data"], list):
            data = data["data"]
        df = pd.DataFrame(data)
    elif ext in [".jsonl", ".jl"]:
        rows = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    rows.append(json.loads(line))
        df = pd.DataFrame(rows)
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    df = df[["src_id", "src_lang", "tgt_lang"]].copy()
    df["src_id"] = df["src_id"].astype(str)
    df["src_lang"] = df["src_lang"].astype(str)
    df["tgt_lang"] = df["tgt_lang"].astype(str)
    return df

In [4]:
# Embedding (LaBSE)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
labse = SentenceTransformer("sentence-transformers/LaBSE", device=DEVICE)

def embed_sentences(sents: List[str], batch_size: int = 64) -> np.ndarray:
    if not sents:
        return np.zeros((0, 768), dtype=np.float32)
    emb = labse.encode(
        sents,
        batch_size=batch_size,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
        device=DEVICE
    )
    return emb.astype(np.float32)

In [5]:
# Tách câu (ZH / VI)
def _compact_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def split_sentences_zh(text: str) -> List[str]:
    text = _compact_spaces(text)
    if not text:
        return []
    parts = re.split(r"(?<=[。！？!?；;])\s*", text)
    return [p.strip() for p in parts if p and p.strip()]

def split_sentences_vi_basic(text: str) -> List[str]:
    text = _compact_spaces(text)
    if not text:
        return []
    parts = re.split(r"(?:(?<=[\.\?\!…])\s+)|(?:(?<=\.\.\.)\s+)", text)
    return [p.strip() for p in parts if p and p.strip()]

def _pick_split_point(s: str, seps: List[str]) -> Optional[int]:
    L = len(s)
    best = None
    for sep in seps:
        for m in re.finditer(re.escape(sep), s):
            cut = m.start() + len(sep)
            if cut < 5 or cut > L - 5:
                continue
            score = abs(cut - L / 2)
            if best is None or score < best[0]:
                best = (score, cut)
    return best[1] if best else None

def refine_vi_segments(sents: List[str], min_count: int) -> List[str]:
    sents = list(sents)
    seps = [";", ":", ",", " - ", " – ", " — "]
    while len(sents) < min_count:
        best_idx, best_cut, best_len = None, None, -1
        for i, s in enumerate(sents):
            cut = _pick_split_point(s, seps)
            if cut is None:
                continue
            if len(s) > best_len:
                best_len = len(s)
                best_idx = i
                best_cut = cut
        if best_idx is None:
            break
        s = sents[best_idx]
        left = s[:best_cut].strip()
        right = s[best_cut:].strip()
        if not left or not right:
            break
        sents = sents[:best_idx] + [left, right] + sents[best_idx+1:]
    return sents

def split_sentences_vi(text: str, min_count: Optional[int] = None) -> List[str]:
    sents = split_sentences_vi_basic(text)
    if min_count is not None and len(sents) < min_count:
        sents = refine_vi_segments(sents, min_count)
    return sents


In [6]:
# Alignment (Monotonic 1-n DP with LaBSE cosine)
def text_len(s: str) -> int:
    return len(re.sub(r"\s+", "", s.strip()))

def align_1_to_n_labse(
    src_sents: List[str],
    tgt_sents: List[str],
    max_n: int = 3,
    alpha_len: float = 0.15,
    batch_size: int = 64,
    end_tgt_penalty: float = 0.20,
):
    """
    DP monotonic: mỗi src_i -> concat tgt[j:k], 1 <= k-j <= max_n
    Score dựa trên:
      cost = -cosine_sim + alpha_len * length_penalty
    Cho phép align rỗng khi tgt đã hết.
    """
    m, n = len(src_sents), len(tgt_sents)
    if m == 0:
        return []
    if n == 0:
        return [(src_sents[i], "", 0, 0) for i in range(m)]

    src_emb = embed_sentences(src_sents, batch_size=batch_size)  # normalized
    tgt_emb = embed_sentences(tgt_sents, batch_size=batch_size)  # normalized

    # prefix sum để lấy mean segment nhanh
    d = tgt_emb.shape[1]
    ps = np.zeros((n + 1, d), dtype=np.float32)
    ps[1:] = np.cumsum(tgt_emb, axis=0)

    def seg_mean_norm(j0: int, j1: int) -> np.ndarray:
        v = (ps[j1] - ps[j0]) / max(1, (j1 - j0))
        norm = np.linalg.norm(v) + 1e-9
        return (v / norm).astype(np.float32)

    # ratio kỳ vọng để penalty độ dài
    sum_src = sum(text_len(s) for s in src_sents) or 1
    sum_tgt = sum(text_len(t) for t in tgt_sents) or 1
    r0 = sum_tgt / sum_src

    INF = 1e18
    dp = [[INF] * (n + 1) for _ in range(m + 1)]
    back = [[None] * (n + 1) for _ in range(m + 1)]
    dp[0][0] = 0.0

    for i in range(m):
        for j in range(n + 1):
            if dp[i][j] >= INF:
                continue

            # Nếu tgt đã hết, các src còn lại align rỗng (cost nhỏ cố định)
            if j == n:
                ndp = dp[i][j] + 0.50  # penalty align empty
                if ndp < dp[i + 1][j]:
                    dp[i + 1][j] = ndp
                    back[i + 1][j] = (j, j)
                continue

            # thử ghép 1..max_n câu tgt
            for k in range(j + 1, min(n, j + max_n) + 1):
                v = seg_mean_norm(j, k)
                sim = float(np.dot(src_emb[i], v))  # cosine because normalized

                ls = text_len(src_sents[i]) or 1
                lt = text_len(" ".join(tgt_sents[j:k]))
                ratio = lt / ls
                len_pen = abs(math.log((ratio + 1e-9) / (r0 + 1e-9)))

                cost = (-sim) + alpha_len * len_pen
                ndp = dp[i][j] + cost

                if ndp < dp[i + 1][k]:
                    dp[i + 1][k] = ndp
                    back[i + 1][k] = (j, k)

    # chọn endpoint tốt nhất, phạt nhẹ tgt dư
    best_j, best = None, INF
    for j in range(n, -1, -1):
        if dp[m][j] < INF:
            score = dp[m][j] + end_tgt_penalty * (n - j)
            if score < best:
                best = score
                best_j = j

    if best_j is None:
        # fallback greedy
        aligns = []
        j = 0
        for i in range(m):
            if j < n:
                aligns.append((src_sents[i], tgt_sents[j], j, j + 1))
                j += 1
            else:
                aligns.append((src_sents[i], "", n, n))
        return aligns

    # backtrack
    j = best_j
    aligns = []
    for i in range(m, 0, -1):
        j0, j1 = back[i][j]
        aligns.append((src_sents[i - 1], " ".join(tgt_sents[j0:j1]).strip(), j0, j1))
        j = j0
    aligns.reverse()
    return aligns

In [7]:
# Metrics
def parse_numeric_id(x: str) -> Optional[int]:
    m = re.search(r"\d+", str(x))
    return int(m.group(0)) if m else None

def align_dataframe_labse(df_in: pd.DataFrame):
    out_rows = []
    n_dist = {}
    ratios = []
    sims = []

    tgt_used_total = 0
    tgt_total = 0
    empty_tgt = 0

    for _, row in df_in.iterrows():
        base_id = str(row["src_id"])
        src_text = str(row["src_lang"])
        tgt_text = str(row["tgt_lang"])

        src_sents = split_sentences_zh(src_text)
        tgt_sents = split_sentences_vi(tgt_text, min_count=len(src_sents))

        aligns = align_1_to_n_labse(
            src_sents, tgt_sents,
            max_n=MAX_N,
            alpha_len=ALPHA_LEN,
            batch_size=BATCH_SIZE,
            end_tgt_penalty=END_TGT_PENALTY
        )

        used = set()
        for i, (s, t, j0, j1) in enumerate(aligns, start=1):
            n = j1 - j0
            n_dist[n] = n_dist.get(n, 0) + 1
            used.update(range(j0, j1))

            ls = text_len(s)
            lt = text_len(t)
            if ls > 0 and lt > 0:
                ratios.append(lt / ls)
            if not t.strip():
                empty_tgt += 1

            out_rows.append({
                "src_id": f"{base_id}_{i}",
                "src_lang": s,
                "tgt_lang": t
            })

        tgt_used_total += len(used)
        tgt_total += len(tgt_sents)

    df_out = pd.DataFrame(out_rows, columns=["src_id", "src_lang", "tgt_lang"])

    metrics = {
        "input_rows": int(len(df_in)),
        "output_rows_aligned": int(len(df_out)),
        "alignment_type_distribution": {f"1-{k}": int(v) for k, v in sorted(n_dist.items())},
        "tgt_sentence_usage_coverage": float(tgt_used_total / tgt_total) if tgt_total > 0 else None,
        "avg_tgt_src_len_ratio": float(np.mean(ratios)) if ratios else None,
        "std_tgt_src_len_ratio": float(np.std(ratios)) if ratios else None,
        "empty_tgt_pairs": int(empty_tgt),
        "max_n": int(MAX_N),
        "alpha_len": float(ALPHA_LEN),
        "model": "sentence-transformers/LaBSE"
    }

    return df_out, metrics

def save_outputs(df_out: pd.DataFrame, metrics: dict, input_path: str, out_dir: str):
    src_name = os.path.splitext(os.path.basename(input_path))[0]
    base_ids = df_out["src_id"].astype(str).str.rsplit("_", n=1).str[0]
    nums = [parse_numeric_id(x) for x in base_ids.tolist()]
    nums = [x for x in nums if x is not None]
    from_id = min(nums) if nums else 0
    to_id = max(nums) if nums else 0

    out_csv = os.path.join(out_dir, f"{src_name}_{from_id}_{to_id}.csv")
    df_out.to_csv(out_csv, index=False, encoding="utf-8-sig")

    out_metrics = os.path.join(out_dir, f"{src_name}_{from_id}_{to_id}_metrics.json")
    with open(out_metrics, "w", encoding="utf-8") as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)

    return out_csv, out_metrics

def export_audit(df_out: pd.DataFrame, input_path: str, out_dir: str, rate: float = 0.10, seed: int = 0):
    src_name = os.path.splitext(os.path.basename(input_path))[0]
    base_ids = df_out["src_id"].astype(str).str.rsplit("_", n=1).str[0]
    nums = [parse_numeric_id(x) for x in base_ids.tolist()]
    nums = [x for x in nums if x is not None]
    from_id = min(nums) if nums else 0
    to_id = max(nums) if nums else 0
    
    n = max(1, int(rate * len(df_out)))
    
    sample = df_out.sample(n=n, random_state=seed).reset_index(drop=True)
    out_audit = os.path.join(out_dir, f"{src_name}_{from_id}_{to_id}_audit10pct.csv")
    sample.to_csv(out_audit, index=False, encoding="utf-8-sig")
    return out_audit

In [8]:
df_in = load_input_any(INPUT_PATH)
df_out, metrics = align_dataframe_labse(df_in)

out_csv, out_metrics = save_outputs(df_out, metrics, INPUT_PATH, OUT_DIR)
out_audit = export_audit(df_out, INPUT_PATH, OUT_DIR, rate=AUDIT_RATE, seed=AUDIT_SEED)

print("Saved aligned CSV:", out_csv)
print("Saved metrics JSON:", out_metrics)
print("Saved audit 10%:", out_audit)
print(json.dumps(metrics, ensure_ascii=False, indent=2))

Saved aligned CSV: ../output\json2_4653_9255.csv
Saved metrics JSON: ../output\json2_4653_9255_metrics.json
Saved audit 10%: ../output\json2_4653_9255_audit10pct.csv
{
  "input_rows": 4603,
  "output_rows_aligned": 7953,
  "alignment_type_distribution": {
    "1-0": 15,
    "1-1": 7666,
    "1-2": 246,
    "1-3": 19,
    "1-4": 6,
    "1-5": 1
  },
  "tgt_sentence_usage_coverage": 0.9997574581615328,
  "avg_tgt_src_len_ratio": 2.7560139154944947,
  "std_tgt_src_len_ratio": 1.0316128716097732,
  "empty_tgt_pairs": 15,
  "max_n": 5,
  "alpha_len": 0.05,
  "model": "sentence-transformers/LaBSE"
}


In [9]:
df_out.head()

Unnamed: 0,src_id,src_lang,tgt_lang
0,4653_1_1,我认为这将是主要的事情。,Tôi nghĩ rằng đó sẽ là điều chính.
1,4653_1_2,我认为，同意某人所说，应该选择HSV-1血清阳性且携带APOE4等位基因的人。,"Tôi nghĩ, đồng ý với người nào đó nói rằng nên..."
2,4654_1_1,我非常想说这个。,Tôi rất muốn nói điều này.
3,4654_1_2,我相信这是从许多问题中得出的结论。,Tôi chắc chắn điều này được tìm kiếm từ rất nh...
4,4654_1_3,史蒂文·雅各布森：但是马克，我可以提个建议吗？,"Steven Jacobson: Nhưng Mack, tôi có thể đưa ra..."
