In [1]:
import os, json, re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch

# Change ALIGNED_CSV to run the notebook on a different aligned file.
# The metrics path is derived from it (same folder, same stem, "_metrics.json"
# suffix) so the two can never point at different datasets by accident.
ALIGNED_CSV = "../output/json2_4653_9255.csv"
OUT_METRICS_JSON = os.path.splitext(ALIGNED_CSV)[0] + "_metrics.json"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def char_len(s: str) -> int:
    """Count the characters of ``s`` that are not whitespace.

    The argument is coerced with ``str()`` before counting, so a non-string
    value is measured by its string form. Every run of whitespace, wherever
    it occurs in the text, is removed before the length is taken.
    """
    trimmed = str(s).strip()
    squeezed = re.sub(r"\s+", "", trimmed)
    return len(squeezed)


# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Sentence encoder used to embed both sides of every pair.
model = SentenceTransformer("sentence-transformers/LaBSE", device=DEVICE)

df = pd.read_csv(ALIGNED_CSV)

# read_csv parses empty cells as NaN, and astype(str) alone would turn those
# into the literal text "nan": a non-empty string that gets embedded, measured
# as 3 characters and never flagged as empty. Replace missing values with ""
# first so blank rows really are blank downstream.
df["src_lang"] = df["src_lang"].fillna("").astype(str)
df["tgt_lang"] = df["tgt_lang"].fillna("").astype(str)

# Plain Python lists of the source / target sentences, in row order.
src_sents = df["src_lang"].tolist()
tgt_sents = df["tgt_lang"].tolist()

# Both sides are encoded with identical settings: normalized embeddings
# returned as numpy arrays.
_ENCODE_OPTS = dict(batch_size=16, normalize_embeddings=True, convert_to_numpy=True)
src_emb = model.encode(src_sents, **_ENCODE_OPTS)
tgt_emb = model.encode(tgt_sents, **_ENCODE_OPTS)

# Row-wise dot product; this is the cosine similarity because the
# embeddings are normalized.
sims = (src_emb * tgt_emb).sum(axis=1)
df["semantic_sim"] = sims

# Target/source length ratio in non-whitespace characters. The source length
# is floored at 1 so a blank source cannot cause a division by zero.
_src_lens = [max(1, char_len(text)) for text in src_sents]
_tgt_lens = [char_len(text) for text in tgt_sents]
df["len_ratio"] = [t_len / s_len for s_len, t_len in zip(_src_lens, _tgt_lens)]

# Flag rows whose text is blank once surrounding whitespace is stripped.
df["src_empty"] = df["src_lang"].str.strip() == ""
df["tgt_empty"] = df["tgt_lang"].str.strip() == ""

# Pairs whose similarity is below this value are counted as outliers.
THR = 0.1


def _distribution_summary(values) -> dict:
    """Return mean, median, std and the 10th/90th percentiles of ``values`` as floats."""
    return {
        "mean": float(np.mean(values)),
        "median": float(np.median(values)),
        "std": float(np.std(values)),
        "p10": float(np.percentile(values, 10)),
        "p90": float(np.percentile(values, 90)),
    }


# Everything is cast to built-in int/float so the dict can be dumped to JSON.
metrics = {
    "n_pairs": int(len(df)),
    "semantic_sim": {
        **_distribution_summary(sims),
        # Fraction of pairs with similarity strictly below THR.
        "outlier_rate_sim_lt_thr": float(np.mean(sims < THR)),
        "threshold": THR,
    },
    "length_ratio": _distribution_summary(df["len_ratio"]),
    "empties": {
        "src_empty_rate": float(df["src_empty"].mean()),
        "tgt_empty_rate": float(df["tgt_empty"].mean()),
    },
}

# Serialize once so the file on disk and the printed report are the same text.
metrics_text = json.dumps(metrics, ensure_ascii=False, indent=2)

with open(OUT_METRICS_JSON, "w", encoding="utf-8") as f:
    f.write(metrics_text)

print(metrics_text)

# Last expression of the cell: show the first rows with the new columns.
df.head()


{
  "n_pairs": 7953,
  "semantic_sim": {
    "mean": 0.8106551766395569,
    "median": 0.8239011764526367,
    "std": 0.08908974379301071,
    "p10": 0.7077550411224365,
    "p90": 0.9048098087310791,
    "outlier_rate_sim_lt_thr": 0.0003772161448509996,
    "threshold": 0.1
  },
  "length_ratio": {
    "mean": 2.7520115627431463,
    "median": 2.6923076923076925,
    "std": 1.0352157452574413,
    "p10": 2.03125,
    "p90": 3.4285714285714284
  },
  "empties": {
    "src_empty_rate": 0.0,
    "tgt_empty_rate": 0.0
  }
}


Unnamed: 0,src_id,src_lang,tgt_lang,semantic_sim,len_ratio,src_empty,tgt_empty
0,4653_1_1,我认为这将是主要的事情。,Tôi nghĩ rằng đó sẽ là điều chính.,0.914596,2.25,False,False
1,4653_1_2,我认为，同意某人所说，应该选择HSV-1血清阳性且携带APOE4等位基因的人。,"Tôi nghĩ, đồng ý với người nào đó nói rằng nên...",0.890567,2.487179,False,False
2,4654_1_1,我非常想说这个。,Tôi rất muốn nói điều này.,0.924004,2.625,False,False
3,4654_1_2,我相信这是从许多问题中得出的结论。,Tôi chắc chắn điều này được tìm kiếm từ rất nh...,0.766083,2.705882,False,False
4,4654_1_3,史蒂文·雅各布森：但是马克，我可以提个建议吗？,"Steven Jacobson: Nhưng Mack, tôi có thể đưa ra...",0.890003,2.478261,False,False
