# Name_based + ME + DL

In [1]:
import pandas as pd

# Load the pair dataset; the file uses semicolons as the field separator.
df = pd.read_csv("df_pairs_final.csv", delimiter=";")

In [4]:
import pandas as pd
import numpy as np
from jellyfish import jaro_winkler_similarity, soundex, damerau_levenshtein_distance
from sklearn.metrics import precision_recall_fscore_support

# --- ASCII scheme columns, listed by hand ---
ascii_columns = [
    "iso9_ascii", "dstu_a_ascii", "icao_ascii",
    "dstu_b_ascii", "gost_b_ascii", "bgn_ascii",
    "rt_translit_ascii", "eki_ascii", "OS_ascii",
]

# One "<column>_score" name per scheme column, in the same order.
ascii_score_cols = [f"{scheme_col}_score" for scheme_col in ascii_columns]

# --- Monge-Elkan score computation ---
def _soundex_similarity(code_a, code_b):
    """Return the similarity of two Soundex codes in the range [0, 1].

    The similarity is 1 minus the Damerau-Levenshtein distance between the
    codes divided by the length of the longer code. Two empty codes yield 0.0.
    """
    max_len = max(len(code_a), len(code_b))
    if max_len == 0:
        return 0.0
    return 1 - (damerau_levenshtein_distance(code_a, code_b) / max_len)


def _monge_elkan_score(name1, name2):
    """Score how well the tokens of ``name1`` are matched by tokens of ``name2``.

    Both names are lower-cased and split on whitespace. For every token of
    ``name1`` the best similarity against any token of ``name2`` is taken,
    and the score is the mean of those best matches. The token similarity is
    the equal-weight average of Jaro-Winkler similarity and the Soundex-code
    similarity from ``_soundex_similarity``.

    Returns 0.0 when either value is not a string (e.g. ``None`` or the float
    NaN that pandas uses for empty CSV cells) or contains no tokens.
    """
    # NaN is truthy, so a plain `not name` check would let it through and
    # crash on `.lower()`; test the type explicitly instead.
    if not isinstance(name1, str) or not isinstance(name2, str):
        return 0.0

    tokens1 = name1.lower().strip().split()
    tokens2 = name2.lower().strip().split()
    if not tokens1 or not tokens2:
        return 0.0

    # Encode each token of name2 once instead of once per token pair.
    coded_tokens2 = [(token_b, soundex(token_b)) for token_b in tokens2]

    best_matches = []
    for token_a in tokens1:
        code_a = soundex(token_a)
        best_match = 0.0
        for token_b, code_b in coded_tokens2:
            jaro = jaro_winkler_similarity(token_a, token_b)
            sound_score = _soundex_similarity(code_a, code_b)
            sim = 0.5 * jaro + 0.5 * sound_score
            best_match = max(best_match, sim)
        best_matches.append(best_match)
    return sum(best_matches) / len(best_matches)


# Compare every scheme column against "name_ascii" and store the scores in
# the matching "<column>_score" column of df.
for col, score_col in zip(ascii_columns, ascii_score_cols):
    df[score_col] = [
        _monge_elkan_score(name1, name2)
        for name1, name2 in zip(df[col], df["name_ascii"])
    ]

# --- Threshold search and results table ---
results = []
candidate_thresholds = np.arange(0.0, 1.01, 0.01)

for score_col in ascii_score_cols:
    # Best candidate so far as (f1, threshold, precision, recall). Starting
    # from all zeros means a threshold is only recorded if its F1 exceeds 0.
    best = (0.0, 0.0, 0.0, 0.0)

    for threshold in candidate_thresholds:
        precision, recall, f1, _ = precision_recall_fscore_support(
            df["label"],
            df[score_col] >= threshold,
            average='binary',
            zero_division=0,
        )
        # Strict comparison: on ties the earliest (lowest) threshold is kept.
        if f1 > best[0]:
            best = (f1, threshold, precision, recall)

    top_f1, top_threshold, top_precision, top_recall = best
    results.append({
        "Skeem": score_col.replace("_score", ""),
        "Parim lävend": round(top_threshold, 2),
        "Täpsus": round(top_precision, 4),
        "Saagis": round(top_recall, 4),
        "F1-skoor": round(top_f1, 4)
    })

# One row per scheme, best F1 first.
ascii_df = pd.DataFrame(results).sort_values("F1-skoor", ascending=False)

# --- Print the results ---
print("ASCII skeemide tulemused:\n")
print(ascii_df.to_string(index=False))


ASCII skeemide tulemused:

            Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
         OS_ascii          0.60  0.9831  0.9717    0.9774
        bgn_ascii          0.60  0.9843  0.9689    0.9765
       icao_ascii          0.60  0.9834  0.9689    0.9761
       iso9_ascii          0.59  0.9813  0.9684    0.9748
     gost_b_ascii          0.59  0.9824  0.9667    0.9745
        eki_ascii          0.58  0.9783  0.9705    0.9744
rt_translit_ascii          0.58  0.9788  0.9689    0.9739
     dstu_b_ascii          0.57  0.9776  0.9655    0.9715
     dstu_a_ascii          0.57  0.9777  0.9643    0.9710


In [None]:
# Convert name_based_ME_DL.ipynb to HTML with nbconvert, logging errors only.
# stdout and stderr are redirected to `nul`, which is the null device on
# Windows; NOTE(review): on a POSIX shell this would instead create a file
# named "nul" — confirm the target platform.
!jupyter nbconvert --to html name_based_ME_DL.ipynb --log-level=ERROR > nul 2>&1

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

