# Vastendaja

In [4]:
import pandas as pd

# Load the pair dataset; the CSV file uses ";" as its field separator.
df = pd.read_csv(
    "df_pairs_final.csv",
    sep=";",
)

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support

# --- Tokeniseerimine n-märkide kaupa ---
def tokenize_with_spaces(text: str, token_length: int = 3) -> list:
    """Cut every whitespace-separated word of ``text`` into fixed-width chunks.

    Each word is sliced left to right into consecutive, non-overlapping pieces
    of ``token_length`` characters. Pieces shorter than two characters (which
    can only be a trailing remainder, or everything when ``token_length`` is 1)
    are discarded.

    Args:
        text: Value to tokenize. It is converted with ``str()`` before
            splitting; a missing value (per ``pd.isna``) yields ``[]``.
        token_length: Width of each chunk.

    Returns:
        The kept chunks, in order of appearance.
    """
    if pd.isna(text):
        return []
    pieces = (
        word[start:start + token_length]
        for word in str(text).split()
        for start in range(0, len(word), token_length)
    )
    # A one-character leftover carries too little signal, so it is dropped.
    return [piece for piece in pieces if len(piece) >= 2]

# --- Cosine-skoor TF-IDF-iga ---
def ngram_similarity(str1: str, str2: str, n_min: int = 3, n_max: int = 4) -> float:
    """Cosine similarity of two strings over TF-IDF weighted character n-grams.

    Both inputs are first chunked with ``tokenize_with_spaces`` and the chunks
    are re-joined with single spaces. A ``TfidfVectorizer`` with
    ``analyzer='char'`` is then fitted on just these two strings, and the
    cosine similarity of the two resulting vectors is returned.

    Args:
        str1: First string (missing values are tokenized to nothing).
        str2: Second string (missing values are tokenized to nothing).
        n_min: Smallest character n-gram length passed to the vectorizer.
        n_max: Largest character n-gram length passed to the vectorizer.

    Returns:
        The similarity score as a plain ``float``; ``0.0`` when either input
        yields no tokens or the vectorizer cannot build a vocabulary.
    """
    tokens1 = ' '.join(tokenize_with_spaces(str1))
    tokens2 = ' '.join(tokenize_with_spaces(str2))

    # An empty side can only produce a zero vector (score 0.0) or an
    # empty-vocabulary error, so skip the vectorizer entirely.
    if not tokens1 or not tokens2:
        return 0.0

    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(n_min, n_max))
    try:
        tfidf = vectorizer.fit_transform([tokens1, tokens2])
    except ValueError:
        # Raised by scikit-learn when no n-gram can be extracted (e.g. both
        # strings are shorter than n_min). Only this case is treated as
        # "no similarity"; KeyboardInterrupt and real bugs are not swallowed.
        return 0.0
    return float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0])

# --- Parima F1 lävendi leidmine ---
def find_best_threshold(df, score_col: str, label_col: str = "label", 
                        thresholds: np.ndarray = np.arange(0.0, 1.01, 0.01)) -> tuple:
    """Scan candidate cut-offs and keep the one with the highest F1.

    For every value in ``thresholds`` a row is predicted positive when its
    score is greater than or equal to that value; the predictions are scored
    against ``df[label_col]`` with ``precision_recall_fscore_support``
    (``average='binary'``, ``zero_division=0``).

    Args:
        df: Frame holding the score and label columns.
        score_col: Name of the column with similarity scores.
        label_col: Name of the column with the true labels.
        thresholds: Candidate cut-offs, evaluated in the given order.

    Returns:
        ``(threshold, precision, recall, f1)`` for the best cut-off. On ties
        the earliest candidate wins; if no candidate reaches an F1 above
        zero, all four values are ``0.0``.
    """
    # Layout of the running winner: (threshold, precision, recall, f1).
    winner = (0.0, 0.0, 0.0, 0.0)
    for cutoff in thresholds:
        predicted = df[score_col] >= cutoff
        prec, rec, f_score, _ = precision_recall_fscore_support(
            df[label_col], predicted, average='binary', zero_division=0)
        # Strict comparison keeps the first of several equally good cut-offs.
        if f_score > winner[3]:
            winner = (cutoff, prec, rec, f_score)
    return winner

# --- Skeemide hindamine ilma graafikuteta ---
def evaluate_all_schemes(df: pd.DataFrame, schemes: list, name_col: str = "name_final") -> pd.DataFrame:
    """Score every scheme column against ``name_col`` and rank them by best F1.

    For each scheme, the similarity between ``df[name_col]`` and ``df[scheme]``
    is computed row by row with ``ngram_similarity`` and stored in a new column
    ``sim_<scheme>``; ``find_best_threshold`` then picks the cut-off with the
    highest F1 for that column.

    Note: ``df`` is modified in place. A ``sim_<scheme>`` column is added (or
    overwritten) for every scheme that is processed.

    Args:
        df: Frame containing ``name_col``, the scheme columns and the label
            column expected by ``find_best_threshold``.
        schemes: Names of the columns to evaluate.
        name_col: Name of the reference column each scheme is compared with.

    Returns:
        One row per successfully evaluated scheme with the columns
        ``Skeem``, ``Lävend``, ``Täpsus``, ``Saagis`` and ``F1``, sorted by
        ``F1`` in descending order. When no scheme could be evaluated the
        frame is empty but still carries these columns.
    """
    result_columns = ["Skeem", "Lävend", "Täpsus", "Saagis", "F1"]
    results = []
    for scheme in schemes:
        print(f"→ Töötlen skeemi: {scheme} ...")
        sim_col = f"sim_{scheme}"
        try:
            # Iterating the two columns in lockstep avoids the per-row Series
            # construction of DataFrame.apply(axis=1).
            df[sim_col] = [
                ngram_similarity(name, candidate)
                for name, candidate in zip(df[name_col], df[scheme])
            ]
            threshold, precision, recall, f1 = find_best_threshold(df, sim_col)
            results.append({
                "Skeem": scheme,
                "Lävend": round(threshold, 2),
                "Täpsus": round(precision, 4),
                "Saagis": round(recall, 4),
                "F1": round(f1, 4)
            })
        except Exception as e:
            # Best effort: report the failing scheme and carry on with the rest.
            print(f"Viga skeemiga {scheme}: {e}")
            continue
    # Explicit columns keep sort_values from raising KeyError when nothing
    # was evaluated (empty ``schemes`` or every scheme failed).
    return pd.DataFrame(results, columns=result_columns).sort_values(by="F1", ascending=False)


In [5]:
# Scheme columns to be scored against the "name_final" column.
columns = [
    "iso9",
    "dstu_a",
    "icao",
    "dstu_b",
    "gost_b",
    "bgn",
    "rt_translit",
    "eki",
    "OS_ascii",
]
result_df = evaluate_all_schemes(df, columns, name_col="name_final")

# Plain-text rendering of the ranking without the index column.
print(result_df.to_string(index=False))

→ Töötlen skeemi: iso9 ...
→ Töötlen skeemi: dstu_a ...
→ Töötlen skeemi: icao ...
→ Töötlen skeemi: dstu_b ...
→ Töötlen skeemi: gost_b ...
→ Töötlen skeemi: bgn ...
→ Töötlen skeemi: rt_translit ...
→ Töötlen skeemi: eki ...
→ Töötlen skeemi: OS_ascii ...
      Skeem  Lävend  Täpsus  Saagis     F1
       icao    0.06  0.9588  0.9286 0.9434
        bgn    0.06  0.9615  0.9241 0.9424
   OS_ascii    0.05  0.9546  0.9197 0.9369
     gost_b    0.05  0.9526  0.9211 0.9366
        eki    0.05  0.9590  0.9139 0.9359
rt_translit    0.05  0.9606  0.9116 0.9355
       iso9    0.04  0.9486  0.9078 0.9278
     dstu_a    0.03  0.9408  0.8948 0.9172
     dstu_b    0.04  0.9485  0.8878 0.9171


In [6]:
# Scheme columns to be scored against the "name_ascii" column.
# "OS_ascii" is listed here as well, so its "sim_OS_ascii" column in df is
# recomputed against "name_ascii" by this call.
ascii_columns = [
    "iso9_ascii",
    "dstu_a_ascii",
    "icao_ascii",
    "dstu_b_ascii",
    "gost_b_ascii",
    "bgn_ascii",
    "rt_translit_ascii",
    "eki_ascii",
    "OS_ascii",
]
result_df_ascii = evaluate_all_schemes(df, ascii_columns, name_col="name_ascii")

# Plain-text rendering of the ranking without the index column.
print(result_df_ascii.to_string(index=False))

→ Töötlen skeemi: iso9_ascii ...
→ Töötlen skeemi: dstu_a_ascii ...
→ Töötlen skeemi: icao_ascii ...
→ Töötlen skeemi: dstu_b_ascii ...
→ Töötlen skeemi: gost_b_ascii ...
→ Töötlen skeemi: bgn_ascii ...
→ Töötlen skeemi: rt_translit_ascii ...
→ Töötlen skeemi: eki_ascii ...
→ Töötlen skeemi: OS_ascii ...
            Skeem  Lävend  Täpsus  Saagis     F1
       icao_ascii    0.06  0.9584  0.9310 0.9445
        bgn_ascii    0.06  0.9611  0.9268 0.9436
        eki_ascii    0.05  0.9565  0.9266 0.9413
rt_translit_ascii    0.05  0.9575  0.9249 0.9409
         OS_ascii    0.06  0.9647  0.9138 0.9386
     gost_b_ascii    0.06  0.9626  0.9143 0.9379
       iso9_ascii    0.05  0.9557  0.9191 0.9371
     dstu_a_ascii    0.04  0.9529  0.9058 0.9288
     dstu_b_ascii    0.04  0.9481  0.8942 0.9203


In [None]:
# !jupyter nbconvert --to html vastendaja.ipynb --log-level=ERROR > nul 2>&1