# person_name PM + NL

In [1]:
import pandas as pd

# Load the semicolon-separated CSV file into a DataFrame
df = pd.read_csv("df_pairs_final.csv", sep=";")

In [4]:
import pandas as pd
import numpy as np
from jellyfish import metaphone, levenshtein_distance
from sklearn.metrics import precision_recall_fscore_support

# Parameters
# Default absolute cap on the Levenshtein distance between two phonetic codes.
MAX_LEV_ABS = 4
# Default cap on the distance as a fraction of the shorter code's length.
MAX_LEV_PCT = 0.25

# --- Phonetic score (normalised Levenshtein) ---
def phonetic_score(name1, name2, min_code_len=3, max_lev_abs=MAX_LEV_ABS, max_lev_pct=MAX_LEV_PCT):
    """Return a phonetic similarity score in [0.0, 1.0] for two names.

    Each name is lower-cased and split on whitespace, every token is encoded
    with Metaphone, and codes shorter than ``min_code_len`` are dropped.
    Every code of ``name1`` is then greedily paired, in order, with the best
    still-unused code of ``name2``:

    * identical codes score 1.0 and end the search for that code;
    * otherwise the pair is accepted only if its Levenshtein distance does
      not exceed ``min(max_lev_abs, ceil(shorter_len * max_lev_pct))`` and
      then scores ``1 - distance / longer_len``.

    The result is the sum of the pair scores divided by the number of codes
    kept for ``name1``, so unmatched ``name1`` codes count as zero and the
    score is not symmetric in its arguments.

    Args:
        name1: Name whose codes are matched (defines the denominator).
        name2: Name whose codes serve as match candidates.
        min_code_len: Minimum Metaphone code length for a token to be kept.
        max_lev_abs: Absolute upper bound on the accepted edit distance.
        max_lev_pct: Upper bound on the edit distance as a fraction of the
            shorter code's length.

    Returns:
        The score as a float; 0.0 if either argument is not a string, if
        either name yields no usable codes, or if nothing matches.
    """
    # Missing values (NaN/None) arrive as non-strings; treat them as
    # "no similarity" instead of failing on ``.lower()``.
    if not isinstance(name1, str) or not isinstance(name2, str):
        return 0.0

    def encode(name):
        # Encode each token exactly once, then filter on the code length.
        codes = (metaphone(tok) for tok in name.lower().split())
        return [code for code in codes if len(code) >= min_code_len]

    tokens1 = encode(name1)
    tokens2 = encode(name2)
    if not tokens1 or not tokens2:
        return 0.0

    matches, used = [], set()
    for t1 in tokens1:
        best_match, best_score = None, 0.0
        for i, t2 in enumerate(tokens2):
            if i in used:
                # Each code of name2 may be paired at most once.
                continue
            if t1 == t2:
                # A perfect match cannot be beaten; stop searching.
                best_match, best_score = i, 1.0
                break
            dist = levenshtein_distance(t1, t2)
            allowed = min(max_lev_abs, int(np.ceil(min(len(t1), len(t2)) * max_lev_pct)))
            if dist <= allowed:
                score = 1.0 - (dist / max(len(t1), len(t2)))
                if score > best_score:
                    best_match, best_score = i, score
        if best_match is not None:
            matches.append(best_score)
            used.add(best_match)
    return sum(matches) / len(tokens1) if matches else 0.0

# --- Computing phonetic scores ---
def compute_phonetic_scores(df, target_col, compare_cols):
    """Add a ``<col>_score`` column to ``df`` for every column in ``compare_cols``.

    Each value of a compare column is scored against the value of
    ``target_col`` in the same row via ``phonetic_score(compare, target)``.

    Rows where either value is not a string (e.g. NaN/None for missing data)
    score 0.0 without calling the scorer. If the scorer itself raises, the
    row still scores 0.0 (best effort), but the number of such failures is
    reported with a ``RuntimeWarning`` so they do not go unnoticed.

    Args:
        df: DataFrame holding the name columns; it is modified in place.
        target_col: Name of the reference column.
        compare_cols: Iterable of column names to score against the target.

    Returns:
        The same ``df`` object, with the score columns added.
    """
    import warnings  # local import keeps this block self-contained

    for col in compare_cols:
        scores = []
        failures = 0
        for name1, name2 in zip(df[col], df[target_col]):
            if not (isinstance(name1, str) and isinstance(name2, str)):
                # Missing or non-text cell: nothing to compare.
                scores.append(0.0)
                continue
            try:
                score = phonetic_score(name1, name2)
            except Exception:
                # Keep the batch going, but remember that something failed.
                failures += 1
                score = 0.0
            scores.append(score)
        if failures:
            warnings.warn(
                f"phonetic_score failed for {failures} row(s) of column "
                f"{col!r}; those rows were scored 0.0",
                RuntimeWarning,
                stacklevel=2,
            )
        df[f"{col}_score"] = scores
    return df

# --- Finding the threshold with the best F1 ---
def find_best_threshold(df, score_col, label_col="label", thresholds=np.arange(0.0, 1.01, 0.01)):
    """Scan ``thresholds`` and return the one that maximises the binary F1.

    For every threshold, rows with ``df[score_col] >= threshold`` are
    predicted positive and compared with ``df[label_col]``.

    Args:
        df: DataFrame containing the score and label columns.
        score_col: Name of the column holding the scores.
        label_col: Name of the column holding the true binary labels.
        thresholds: Iterable of candidate thresholds, tried in order.

    Returns:
        A tuple ``(threshold, f1, precision, recall)`` for the best
        threshold. On ties the earliest threshold wins, because only a
        strictly higher F1 replaces the current best. All four values are
        0.0 if no threshold reaches a positive F1.
    """
    # Current winner, stored in return order: threshold, F1, precision, recall.
    winner = (0.0, 0.0, 0.0, 0.0)
    for cutoff in thresholds:
        predicted = df[score_col] >= cutoff
        prec, rec, f_score, _support = precision_recall_fscore_support(
            df[label_col], predicted, average="binary", zero_division=0
        )
        if f_score > winner[1]:
            winner = (cutoff, f_score, prec, rec)
    return winner


In [5]:
# Transliteration scheme columns to evaluate, and the reference column
ascii_columns = [
    "iso9_ascii",
    "dstu_a_ascii",
    "icao_ascii",
    "dstu_b_ascii",
    "gost_b_ascii",
    "bgn_ascii",
    "rt_translit_ascii",
    "eki_ascii",
    "OS_ascii",
]
target_col = "name_ascii"
score_cols = [f"{col}_score" for col in ascii_columns]

# Compute the phonetic scores (adds one "<scheme>_score" column per scheme)
df = compute_phonetic_scores(df, target_col=target_col, compare_cols=ascii_columns)

# Find the best-threshold metrics for every scheme
results = []
for scheme, score_col in zip(ascii_columns, score_cols):
    best_t, best_f1, best_p, best_r = find_best_threshold(df, score_col)
    # Keys are the (Estonian) column headers of the printed table.
    results.append(
        {
            "Skeem": scheme,
            "Parim lävend": round(best_t, 2),
            "Täpsus": round(best_p, 4),
            "Saagis": round(best_r, 4),
            "F1-skoor": round(best_f1, 4),
        }
    )

# Results table, best F1 first
result_df = pd.DataFrame(results).sort_values(by="F1-skoor", ascending=False)
print(result_df.to_string(index=False))

            Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
        bgn_ascii          0.26  0.9794  0.9317    0.9550
       icao_ascii          0.26  0.9777  0.9299    0.9532
         OS_ascii          0.25  0.9758  0.9289    0.9518
       iso9_ascii          0.25  0.9757  0.9256    0.9500
        eki_ascii          0.25  0.9757  0.9257    0.9500
rt_translit_ascii          0.20  0.9699  0.9292    0.9491
     gost_b_ascii          0.25  0.9761  0.9228    0.9487
     dstu_b_ascii          0.24  0.9782  0.9001    0.9375
     dstu_a_ascii          0.20  0.9710  0.9050    0.9368


In [None]:
# !jupyter nbconvert --to html person_name_PM_NL.ipynb --log-level=ERROR > nul 2>&1