# person_name PM

In [3]:
import pandas as pd

# Load the semicolon-separated CSV file into `df`
df = pd.read_csv("df_pairs_final.csv", sep=";")

In [9]:
import pandas as pd
import numpy as np
from jellyfish import metaphone, levenshtein_distance
from sklearn.metrics import precision_recall_fscore_support

# Parameters: default fuzzy-matching tolerance used by phonetic_score.
# Two phonetic codes may differ by at most
#   min(MAX_LEV_ABS, ceil(MAX_LEV_PCT * length of the shorter code))
# Levenshtein edits to still count as a match.
MAX_LEV_ABS = 4
MAX_LEV_PCT = 0.25

# --- Foneetiline skoor ---
def phonetic_score(name1, name2, min_code_len=3, max_lev_abs=MAX_LEV_ABS, max_lev_pct=MAX_LEV_PCT):
    """Return the share of ``name1`` tokens that have a phonetic match in ``name2``.

    Both names are lower-cased and split on whitespace, and every token is
    converted to its Metaphone code; codes shorter than ``min_code_len`` are
    discarded. Each remaining code of ``name1`` is then paired greedily with at
    most one still-unused code of ``name2``: an identical code is accepted
    immediately, otherwise the closest code within the Levenshtein tolerance
    is taken (the first one wins on equal distance).

    The score is asymmetric: it is normalised by the number of usable codes of
    ``name1`` only.

    Parameters
    ----------
    name1, name2 : str
        Names to compare. Values without a ``.lower()`` method (e.g. float
        NaN) raise ``AttributeError``.
    min_code_len : int
        Minimum length of a Metaphone code for the token to take part.
    max_lev_abs : int
        Absolute cap on the edit distance allowed between two codes.
    max_lev_pct : float
        Allowed edit distance as a fraction of the shorter code's length
        (rounded up), capped by ``max_lev_abs``.

    Returns
    -------
    float
        ``matches / number of usable name1 codes``, or ``0.0`` when either
        name yields no usable code.
    """
    def encode(name):
        # Encode every token exactly once, then filter on the code length.
        codes = [metaphone(token) for token in name.lower().split()]
        return [code for code in codes if len(code) >= min_code_len]

    codes1 = encode(name1)
    codes2 = encode(name2)
    if not codes1 or not codes2:
        return 0.0

    matches, used = 0, set()
    for code1 in codes1:
        best_match, best_dist = None, float("inf")
        for i, code2 in enumerate(codes2):
            if i in used:
                # Each code of name2 may be consumed by one code of name1 only.
                continue
            if code1 == code2:
                # An exact match cannot be beaten; stop searching.
                best_match = i
                break
            # Tolerance scales with the shorter code, capped by max_lev_abs.
            allowed = min(max_lev_abs, int(np.ceil(min(len(code1), len(code2)) * max_lev_pct)))
            dist = levenshtein_distance(code1, code2)
            if dist <= allowed and dist < best_dist:
                best_match, best_dist = i, dist
        if best_match is not None:
            matches += 1
            used.add(best_match)
    return matches / len(codes1)

# --- Skooride arvutus ---
def compute_phonetic_scores(df, target_col, compare_cols):
    """Add a ``<col>_score`` column to ``df`` for every column in ``compare_cols``.

    Each value of a compared column is scored against the value of
    ``target_col`` in the same row with ``phonetic_score(compared, target)``.

    Rows where either value is not a string (e.g. a missing value) are scored
    ``0.0`` without calling ``phonetic_score``. Any other failure is still
    scored ``0.0`` so one bad row cannot abort the run, but such failures are
    counted and reported once per column through a ``RuntimeWarning`` instead
    of being dropped silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``target_col`` and all ``compare_cols``. It is modified
        in place.
    target_col : str
        Column every compared column is scored against.
    compare_cols : iterable of str
        Columns to score.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new score columns added.
    """
    import warnings  # local import so the notebook's import cell needs no change

    for col in compare_cols:
        scores = []
        n_failed = 0
        first_error = None
        for name1, name2 in zip(df[col], df[target_col]):
            if not (isinstance(name1, str) and isinstance(name2, str)):
                # Missing / non-text cell: nothing to compare.
                scores.append(0.0)
                continue
            try:
                score = phonetic_score(name1, name2)
            except Exception as exc:
                # Best effort: keep going, but remember that something broke.
                score = 0.0
                n_failed += 1
                if first_error is None:
                    first_error = exc
            scores.append(score)
        if n_failed:
            warnings.warn(
                f"phonetic_score failed for {n_failed} row(s) of column {col!r}; "
                f"those rows were scored 0.0 (first error: {first_error!r})",
                RuntimeWarning,
                stacklevel=2,
            )
        df[f"{col}_score"] = scores
    return df

# --- Parima lävendi otsing ---
def find_best_threshold(df, score_col, label_col="label", thresholds=np.arange(0.0, 1.01, 0.01)):
    """Scan ``thresholds`` and return the cut-off with the highest binary F1.

    For every candidate cut-off a row is predicted positive when
    ``df[score_col] >= cut-off``; the prediction is compared with
    ``df[label_col]`` using binary precision / recall / F1.

    Returns
    -------
    tuple
        ``(threshold, f1, precision, recall)`` of the best cut-off. All four
        values are ``0.0`` when no cut-off reaches an F1 above zero.
    """
    # Running optimum as (threshold, f1, precision, recall).
    best = (0.0, 0.0, 0.0, 0.0)
    for cutoff in thresholds:
        is_positive = df[score_col] >= cutoff
        prec, rec, f_score, _support = precision_recall_fscore_support(
            df[label_col], is_positive, average="binary", zero_division=0
        )
        # Strict ">" keeps the earliest cut-off when several tie on F1.
        if f_score > best[1]:
            best = (cutoff, f_score, prec, rec)
    return best


In [10]:
# Define the transliteration schemes to evaluate and the target column
ascii_columns = [
    "iso9_ascii",
    "dstu_a_ascii",
    "icao_ascii",
    "dstu_b_ascii",
    "gost_b_ascii",
    "bgn_ascii",
    "rt_translit_ascii",
    "eki_ascii",
    "OS_ascii"
]
target_col = "name_ascii"
score_cols = [col + "_score" for col in ascii_columns]

# Compute the phonetic scores (adds one "<scheme>_score" column per scheme)
df = compute_phonetic_scores(df, target_col=target_col, compare_cols=ascii_columns)

# Find the best threshold for every scheme.
# Scheme names are taken straight from ascii_columns rather than recovered
# from the score column name with str.replace, which would also strip a
# "_score" occurring inside a scheme name.
results = []
for scheme, score_col in zip(ascii_columns, score_cols):
    best_t, best_f1, best_p, best_r = find_best_threshold(df, score_col)
    results.append({
        "Skeem": scheme,
        "Parim lävend": round(best_t, 2),
        "Täpsus": round(best_p, 4),
        "Saagis": round(best_r, 4),
        "F1-skoor": round(best_f1, 4)
    })

# Results table, best F1 first
result_df = pd.DataFrame(results).sort_values(by="F1-skoor", ascending=False)
print(result_df.to_string(index=False))


            Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
        bgn_ascii          0.01  0.9642  0.9370    0.9504
         OS_ascii          0.01  0.9668  0.9342    0.9503
       iso9_ascii          0.01  0.9670  0.9310    0.9487
rt_translit_ascii          0.01  0.9677  0.9304    0.9487
        eki_ascii          0.01  0.9662  0.9310    0.9483
       icao_ascii          0.01  0.9611  0.9351    0.9479
     gost_b_ascii          0.01  0.9642  0.9272    0.9453
     dstu_a_ascii          0.01  0.9687  0.9067    0.9367
     dstu_b_ascii          0.01  0.9679  0.9060    0.9359


In [None]:
# !jupyter nbconvert --to html person_name_PM.ipynb --log-level=ERROR > nul 2>&1