# Name_based

In [1]:
import pandas as pd

# Load the name-pair dataset; the file is semicolon-delimited.
pairs_csv_path = "df_pairs_final.csv"
df = pd.read_csv(pairs_csv_path, sep=";")

In [2]:
import pandas as pd
import numpy as np
from jellyfish import jaro_winkler_similarity, soundex
from sklearn.metrics import precision_recall_fscore_support

# --- Manually listed columns, one per ASCII transliteration scheme ---
ascii_columns = [
    "iso9_ascii", "dstu_a_ascii", "icao_ascii",
    "dstu_b_ascii", "gost_b_ascii", "bgn_ascii",
    "rt_translit_ascii", "eki_ascii", "OS_ascii",
]

# --- Matching score-column names: "<scheme column>_score" ---
ascii_score_cols = [f"{scheme_col}_score" for scheme_col in ascii_columns]

# --- Name-based score computation ---
def _name_similarity(candidate, reference):
    """Score a pair of names by combining Jaro-Winkler and Soundex.

    Parameters
    ----------
    candidate, reference : str or missing value
        The two names to compare. Both are lower-cased and stripped of
        surrounding whitespace before comparison.

    Returns
    -------
    float
        ``0.5 * jaro_winkler + 0.5 * soundex_match``, where ``soundex_match``
        is 1.0 when both names have the same Soundex code and 0.0 otherwise.
        Returns 0.0 when either name is missing or blank.
    """
    # Empty CSV cells are loaded by pandas as float NaN, which is truthy, so
    # a plain `not value` test lets them through and `.lower()` then raises
    # AttributeError. Check for missing values explicitly first; the falsy
    # test is kept so None / "" keep scoring 0.0 as before.
    if pd.isna(candidate) or pd.isna(reference) or not candidate or not reference:
        return 0.0

    candidate = candidate.lower().strip()
    reference = reference.lower().strip()

    # A name that is empty after stripping carries nothing to compare.
    if not candidate or not reference:
        return 0.0

    jaro = jaro_winkler_similarity(candidate, reference)
    sound_score = 1.0 if soundex(candidate) == soundex(reference) else 0.0
    return 0.5 * jaro + 0.5 * sound_score


# Score every transliteration column against the reference "name_ascii" column
# and store the result next to it as "<column>_score".
for col, score_col in zip(ascii_columns, ascii_score_cols):
    df[score_col] = [
        _name_similarity(name1, name2)
        for name1, name2 in zip(df[col], df["name_ascii"])
    ]

# --- Find the best decision threshold for each scheme ---
results = []

# Candidate cut-offs from 0.00 to 1.00 in steps of 0.01.
candidate_thresholds = np.arange(0.0, 1.01, 0.01)

for score_col in ascii_score_cols:
    # Best (f1, threshold, precision, recall) seen so far for this scheme.
    best = (0.0, 0.0, 0.0, 0.0)

    for cutoff in candidate_thresholds:
        # A pair is predicted as a match when its score reaches the cut-off.
        predicted = df[score_col] >= cutoff
        prec, rec, f1, _ = precision_recall_fscore_support(
            df["label"], predicted, average='binary', zero_division=0
        )
        # Strict ">" means the lowest threshold wins among equal F1 values.
        if f1 > best[0]:
            best = (f1, cutoff, prec, rec)

    top_f1, top_threshold, top_precision, top_recall = best
    results.append({
        "Skeem": score_col.replace("_score", ""),
        "Parim lävend": round(top_threshold, 2),
        "Täpsus": round(top_precision, 4),
        "Saagis": round(top_recall, 4),
        "F1-skoor": round(top_f1, 4)
    })

# --- Results table: one row per scheme, best F1 first ---
results_ascii = pd.DataFrame(results)
results_ascii = results_ascii.sort_values(by="F1-skoor", ascending=False)

# Heading, a blank line, then the table without the index column.
print("ASCII skeemide tulemused:\n", results_ascii.to_string(index=False), sep="\n")

ASCII skeemide tulemused:

            Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
        bgn_ascii          0.32  0.9430  0.8636    0.9015
         OS_ascii          0.32  0.9468  0.8564    0.8994
     gost_b_ascii          0.31  0.9210  0.8710    0.8953
        eki_ascii          0.31  0.9193  0.8726    0.8953
       iso9_ascii          0.31  0.9198  0.8703    0.8944
       icao_ascii          0.32  0.9379  0.8544    0.8942
rt_translit_ascii          0.31  0.9259  0.8638    0.8937
     dstu_b_ascii          0.30  0.9116  0.8741    0.8924
     dstu_a_ascii          0.30  0.9176  0.8592    0.8874


In [None]:
#!jupyter nbconvert --to html name_based.ipynb --log-level=ERROR > nul 2>&1