# person_name JW (Jaro-Winkler)

In [1]:
import pandas as pd

# Load the semicolon-delimited CSV file into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="df_pairs_final.csv",
    sep=";",
)

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from itertools import product
from sklearn.metrics import precision_recall_fscore_support

def jaro_winkler_std(s1, s2):
    """Return a prefix-boosted similarity score for two strings.

    The base similarity is ``SequenceMatcher(None, s1, s2).ratio()`` (a
    difflib ratio, not a true Jaro distance). It is then boosted in the
    Winkler fashion by ``0.1`` per matching leading character, counting at
    most the first four characters.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``1.0`` when the inputs are equal, otherwise
        ``base + 0.1 * prefix * (1 - base)``.
    """
    if s1 == s2:
        return 1.0

    base = SequenceMatcher(None, s1, s2).ratio()

    # Length of the common prefix, capped at four characters.
    prefix_len = 0
    for left, right in zip(s1[:4], s2[:4]):
        if left != right:
            break
        prefix_len += 1

    return base + 0.1 * prefix_len * (1 - base)

def _name_parts(name):
    return name.strip().split()

def _align_name_parts_std(query, result):
    """Greedily pair query name parts with result name parts and score the pairing.

    Every distinct (query part, result part) combination is scored with
    ``jaro_winkler_std``; combinations scoring ``0.0`` are discarded. Pairs
    are then consumed from the highest score down: each consumed pair uses
    up one occurrence of each part, and its score is multiplied into the
    running total.

    Args:
        query: List of name parts of the query name.
        result: List of name parts of the candidate name.

    Returns:
        The product of the scores of the chosen pairs, or ``0.0`` if either
        list is empty or if fewer pairs were formed than there are query
        parts.

    Notes:
        The inputs are not modified; the function works on private copies.
        Ties between equally scored pairs are broken by the part strings
        themselves, so the result does not depend on set iteration order
        (which varies between interpreter runs for strings).
    """
    if len(query) == 0 or len(result) == 0:
        return 0.0

    # Work on copies so the caller's lists are left untouched.
    remaining_query = list(query)
    remaining_result = list(result)

    # Pairwise scores for every distinct combination of parts.
    scores = {}
    for qn, rn in product(set(remaining_query), set(remaining_result)):
        score = jaro_winkler_std(qn, rn)
        if score > 0.0:
            scores[(qn, rn)] = score

    # Highest score first; equal scores are ordered by (qn, rn) to make the
    # greedy pairing reproducible.
    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))

    length = len(remaining_query)
    matched = 0
    total_score = 1.0
    for (qn, rn), score in ranked:
        # The while loop handles repeated parts (e.g. the same part twice).
        while qn in remaining_query and rn in remaining_result:
            remaining_query.remove(qn)
            remaining_result.remove(rn)
            total_score *= score
            matched += 1

    # Every query part must have found a partner.
    if matched < length:
        return 0.0
    return total_score

def person_name_jaro_winkler_std(query_name, result_name):
    """Score how well ``result_name`` matches ``query_name``.

    Both names are split into whitespace-separated parts. Two scores are
    computed and the larger one is returned:

    * the ``jaro_winkler_std`` similarity of the parts joined without
      spaces, raised to the power of the joined query length;
    * the part-by-part alignment score from ``_align_name_parts_std``.

    Args:
        query_name: The query name as a string.
        result_name: The candidate name as a string.

    Returns:
        The larger of the two scores, or ``0.0`` when either name contains
        no parts (empty or whitespace only).
    """
    qn_parts = _name_parts(query_name)
    rn_parts = _name_parts(result_name)

    # A blank name cannot match anything. Without this guard a blank query
    # gives an exponent of 0 below, and ``x ** 0`` is 1.0 for every x
    # (including 0.0), i.e. a perfect score against any candidate.
    if not qn_parts or not rn_parts:
        return 0.0

    qn_joined = "".join(qn_parts)
    rn_joined = "".join(rn_parts)

    joined_score = jaro_winkler_std(qn_joined, rn_joined) ** len(qn_joined)
    # Pass copies so the alignment step cannot alter the lists built above.
    aligned_score = _align_name_parts_std(list(qn_parts), list(rn_parts))
    return max(joined_score, aligned_score)


In [None]:
# Columns holding the ASCII variants of each scheme; each one is scored
# against `target_col`.
columns = [
    "iso9_ascii", "dstu_a_ascii", "icao_ascii", "dstu_b_ascii",
    "gost_b_ascii", "bgn_ascii", "rt_translit_ascii", "eki_ascii", "OS_ascii"
]
target_col = "name_ascii"

score_cols = [col + "_score" for col in columns]

# --- Score computation ---
# A pair is scored only when both values are non-empty strings; anything
# else (e.g. NaN for a missing cell, None, an empty string) scores 0.0.
# Validating the inputs explicitly, instead of catching every exception,
# lets genuine errors in the scoring code surface rather than silently
# turning into a 0.0 score.
for col, score_col in zip(columns, score_cols):
    scores = []
    for name1, name2 in zip(df[col], df[target_col]):
        both_names_present = (
            isinstance(name1, str) and isinstance(name2, str)
            and bool(name1) and bool(name2)
        )
        scores.append(
            person_name_jaro_winkler_std(name1, name2) if both_names_present else 0.0
        )
    df[score_col] = scores

# --- Threshold sweep and results ---
results = []

for score_col in score_cols:
    # Keep the first threshold that reaches the highest F1 (strict `>`).
    best_f1 = 0.0
    best_threshold = 0.0
    best_precision = 0.0
    best_recall = 0.0
    for threshold in np.arange(0.0, 1.01, 0.01):
        # A pair is predicted positive when its score reaches the threshold.
        preds = df[score_col] >= threshold
        precision, recall, f1, _ = precision_recall_fscore_support(
            df["label"], preds, average='binary', zero_division=0
        )
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision
            best_recall = recall

    scheme_name = score_col.replace("_score", "")
    # Keys: scheme, best threshold, precision, recall, F1 score.
    results.append({
        "Skeem": scheme_name,
        "Parim lävend": round(best_threshold, 2),
        "Täpsus": round(best_precision, 4),
        "Saagis": round(best_recall, 4),
        "F1-skoor": round(best_f1, 4)
    })

# Best-performing scheme first.
result_df = pd.DataFrame(results).sort_values(by="F1-skoor", ascending=False)

print(f"TULEMUSED (võrdlus {target_col}):\n")
print(result_df.to_string(index=False))


TULEMUSED (võrdlus name_ascii):

            Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
       iso9_ascii          0.24  0.9752  0.7743    0.8632
        eki_ascii          0.22  0.9698  0.7771    0.8628
rt_translit_ascii          0.22  0.9702  0.7763    0.8625
       icao_ascii          0.24  0.9737  0.7740    0.8624
        bgn_ascii          0.24  0.9748  0.7725    0.8619
         OS_ascii          0.24  0.9739  0.7716    0.8610
     gost_b_ascii          0.22  0.9705  0.7735    0.8609
     dstu_b_ascii          0.21  0.9722  0.7697    0.8592
     dstu_a_ascii          0.19  0.9658  0.7736    0.8591


In [None]:
# Columns scored against `target_col` in this cell.
# NOTE(review): "OS_ascii" is reused here, so an "OS_ascii_score" column
# already present in `df` is overwritten by this cell — confirm intended.
columns = [
    "iso9", "dstu_a", "icao", "dstu_b",
    "gost_b", "bgn", "rt_translit", "eki", "OS_ascii"
]
target_col = "name_final"

score_cols = [col + "_score" for col in columns]

# --- Score computation ---
# A pair is scored only when both values are non-empty strings; anything
# else (e.g. NaN for a missing cell, None, an empty string) scores 0.0.
# Validating the inputs explicitly, instead of catching every exception,
# lets genuine errors in the scoring code surface rather than silently
# turning into a 0.0 score.
for col, score_col in zip(columns, score_cols):
    scores = []
    for name1, name2 in zip(df[col], df[target_col]):
        both_names_present = (
            isinstance(name1, str) and isinstance(name2, str)
            and bool(name1) and bool(name2)
        )
        scores.append(
            person_name_jaro_winkler_std(name1, name2) if both_names_present else 0.0
        )
    df[score_col] = scores

# --- Threshold sweep and results ---
results = []

for score_col in score_cols:
    # Keep the first threshold that reaches the highest F1 (strict `>`).
    best_f1 = 0.0
    best_threshold = 0.0
    best_precision = 0.0
    best_recall = 0.0
    for threshold in np.arange(0.0, 1.01, 0.01):
        # A pair is predicted positive when its score reaches the threshold.
        preds = df[score_col] >= threshold
        precision, recall, f1, _ = precision_recall_fscore_support(
            df["label"], preds, average='binary', zero_division=0
        )
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision
            best_recall = recall

    scheme_name = score_col.replace("_score", "")
    # Keys: scheme, best threshold, precision, recall, F1 score.
    results.append({
        "Skeem": scheme_name,
        "Parim lävend": round(best_threshold, 2),
        "Täpsus": round(best_precision, 4),
        "Saagis": round(best_recall, 4),
        "F1-skoor": round(best_f1, 4)
    })

# Best-performing scheme first.
result_df = pd.DataFrame(results).sort_values(by="F1-skoor", ascending=False)

print(f"TULEMUSED (võrdlus {target_col}):\n")
print(result_df.to_string(index=False))


TULEMUSED (võrdlus name_final):

      Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
       icao          0.24  0.9743  0.7712    0.8609
        bgn          0.22  0.9696  0.7733    0.8604
     gost_b          0.22  0.9712  0.7710    0.8596
        eki          0.21  0.9693  0.7708    0.8587
rt_translit          0.21  0.9700  0.7697    0.8583
     dstu_b          0.19  0.9663  0.7687    0.8563
       iso9          0.19  0.9632  0.7658    0.8533
   OS_ascii          0.24  0.9742  0.7591    0.8533
     dstu_a          0.17  0.9599  0.7617    0.8494


In [None]:
# !jupyter nbconvert --to html person_name_JW.ipynb #--log-level=ERROR > nul 2>&1