# person_name: Jaro-Winkler (JW) + Monge-Elkan (ME)

In [4]:
import pandas as pd

# Load the CSV file (fields are separated by semicolons)
df = pd.read_csv(
    "df_pairs_final.csv",
    sep=";",
)

In [5]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from sklearn.metrics import precision_recall_fscore_support

# --- Jaro-Winkler-style base score ---
def jaro_winkler_std(s1, s2):
    """Return a Winkler-boosted similarity score for two strings.

    NOTE(review): the base similarity is ``SequenceMatcher.ratio()`` from
    ``difflib``, not a true Jaro similarity; only the Winkler prefix bonus
    (scaling factor 0.1, at most 4 leading characters) follows the
    Jaro-Winkler formula.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        1.0 when the inputs compare equal, otherwise
        ``base + 0.1 * shared_prefix_len * (1 - base)``.
    """
    if s1 == s2:
        return 1.0

    base = SequenceMatcher(None, s1, s2).ratio()

    # Count matching leading characters, looking at no more than four.
    limit = min(4, len(s1), len(s2))
    shared = 0
    while shared < limit and s1[shared] == s2[shared]:
        shared += 1

    return base + 0.1 * shared * (1 - base)

# --- Nimeosade tükeldus ---
def _name_parts(name: str) -> list[str]:
    return name.strip().split()

# --- Monge-Elkan + Jaro-Winkler ---
def _monge_elkan_name_score(query: list[str], result: list[str]) -> float:
    if not query or not result:
        return 0.0
    max_scores = []
    for q in query:
        best_score = max(jaro_winkler_std(q, r) for r in result)
        max_scores.append(best_score)
    return np.mean(max_scores)

def person_name_jaro_winkler_std(query_name: str, result_name: str) -> float:
    """Score how well ``result_name`` matches ``query_name``.

    Two scores are computed and the larger one is returned:

    * the ``jaro_winkler_std`` score of the two names with all whitespace
      removed, raised to the power of the length of the joined query name
      (so any score below 1.0 shrinks as the query gets longer);
    * the part-wise ``_monge_elkan_name_score`` of the split names.

    Args:
        query_name: The name being searched for.
        result_name: The candidate name.

    Returns:
        The maximum of the whole-name score and the part-wise score.
    """
    query_tokens = _name_parts(query_name)
    result_tokens = _name_parts(result_name)

    query_compact = "".join(query_tokens)
    result_compact = "".join(result_tokens)

    whole_name_score = jaro_winkler_std(query_compact, result_compact) ** len(query_compact)
    part_wise_score = _monge_elkan_name_score(query_tokens, result_tokens)
    return max(whole_name_score, part_wise_score)


In [6]:
# --- Transliteration schemes and the name of the reference column ---
columns = [
    "iso9", "dstu_a", "icao", "dstu_b",
    "gost_b", "bgn", "rt_translit", "eki", "OS_ascii"
]
target_col = "name_final"
score_cols = [col + "_score" for col in columns]

# --- Score computation ---
# Adds one "<scheme>_score" column to df per scheme. A pair in which either
# value is not a string (presumably a missing value in the CSV) is scored 0.0
# explicitly; any other failure now surfaces instead of being silently turned
# into a zero score by a blanket `except Exception`.
for col, score_col in zip(columns, score_cols):
    df[score_col] = [
        person_name_jaro_winkler_std(name1, name2)
        if isinstance(name1, str) and isinstance(name2, str)
        else 0.0
        for name1, name2 in zip(df[col], df[target_col])
    ]

# --- Search for the threshold giving the best F1 ---
results = []
for col, score_col in zip(columns, score_cols):
    best_f1 = best_threshold = best_precision = best_recall = 0.0
    for threshold in np.arange(0.0, 1.01, 0.01):
        # A pair is predicted as a match when its score reaches the threshold.
        preds = df[score_col] >= threshold
        precision, recall, f1, _ = precision_recall_fscore_support(
            df["label"], preds, average="binary", zero_division=0
        )
        # Strict ">" keeps the lowest threshold when several reach the same F1.
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision
            best_recall = recall

    # The scheme name is the source column itself; no need to strip the
    # "_score" suffix back off the derived column name.
    results.append({
        "Skeem": col,
        "Parim lävend": round(best_threshold, 2),
        "Täpsus": round(best_precision, 4),
        "Saagis": round(best_recall, 4),
        "F1-skoor": round(best_f1, 4)
    })

# --- Results table ---
result_df = pd.DataFrame(results).sort_values(by="F1-skoor", ascending=False)
# Header is built from target_col so it always names the column actually used.
print(f"TULEMUSED (võrdlus: {target_col}):\n")
print(result_df.to_string(index=False))


TULEMUSED (võrdlus: name_final):

      Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
        bgn          0.53  0.9811  0.9665    0.9738
   OS_ascii          0.53  0.9803  0.9668    0.9735
       icao          0.54  0.9816  0.9641    0.9728
     gost_b          0.52  0.9793  0.9659    0.9726
rt_translit          0.51  0.9796  0.9644    0.9720
        eki          0.51  0.9780  0.9654    0.9716
     dstu_b          0.49  0.9752  0.9644    0.9698
       iso9          0.49  0.9730  0.9617    0.9673
     dstu_a          0.47  0.9714  0.9535    0.9624


In [7]:
# --- ASCII variants of the transliteration schemes and the reference column ---
columns = [
    "iso9_ascii", "dstu_a_ascii", "icao_ascii", "dstu_b_ascii",
    "gost_b_ascii", "bgn_ascii", "rt_translit_ascii", "eki_ascii", "OS_ascii"
]
target_col = "name_ascii"
score_cols = [col + "_score" for col in columns]

# --- Score computation ---
# Adds one "<scheme>_score" column to df per scheme. A pair in which either
# value is not a string (presumably a missing value in the CSV) is scored 0.0
# explicitly; any other failure now surfaces instead of being silently turned
# into a zero score by a blanket `except Exception`.
for col, score_col in zip(columns, score_cols):
    df[score_col] = [
        person_name_jaro_winkler_std(name1, name2)
        if isinstance(name1, str) and isinstance(name2, str)
        else 0.0
        for name1, name2 in zip(df[col], df[target_col])
    ]

# --- Search for the threshold giving the best F1 ---
results = []
for col, score_col in zip(columns, score_cols):
    best_f1 = best_threshold = best_precision = best_recall = 0.0
    for threshold in np.arange(0.0, 1.01, 0.01):
        # A pair is predicted as a match when its score reaches the threshold.
        preds = df[score_col] >= threshold
        precision, recall, f1, _ = precision_recall_fscore_support(
            df["label"], preds, average="binary", zero_division=0
        )
        # Strict ">" keeps the lowest threshold when several reach the same F1.
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision
            best_recall = recall

    # The scheme name is the source column itself; no need to strip the
    # "_score" suffix back off the derived column name.
    results.append({
        "Skeem": col,
        "Parim lävend": round(best_threshold, 2),
        "Täpsus": round(best_precision, 4),
        "Saagis": round(best_recall, 4),
        "F1-skoor": round(best_f1, 4)
    })

# --- Results table ---
result_df = pd.DataFrame(results).sort_values(by="F1-skoor", ascending=False)
# The header previously said "name_final" (copied from the preceding cell)
# although this cell compares against name_ascii; derive it from target_col.
print(f"TULEMUSED (võrdlus: {target_col}):\n")
print(result_df.to_string(index=False))


TULEMUSED (võrdlus: name_final):

            Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
         OS_ascii          0.54  0.9831  0.9677    0.9753
       iso9_ascii          0.53  0.9809  0.9684    0.9746
        bgn_ascii          0.53  0.9806  0.9687    0.9746
        eki_ascii          0.53  0.9818  0.9672    0.9745
rt_translit_ascii          0.52  0.9796  0.9696    0.9745
       icao_ascii          0.54  0.9813  0.9663    0.9737
     gost_b_ascii          0.52  0.9789  0.9675    0.9732
     dstu_b_ascii          0.50  0.9780  0.9661    0.9720
     dstu_a_ascii          0.50  0.9789  0.9634    0.9711


In [None]:
# !jupyter nbconvert --to html person_name_JW_ME.ipynb --log-level=ERROR > nul 2>&1