# LaBSE

In [1]:
import pandas as pd

# CSV-faili laadimine
df = pd.read_csv("df_pairs_final.csv", sep=";")

In [2]:
pip install sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [5]:
# --- Impordid ---
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support
from sentence_transformers import SentenceTransformer
from tabulate import tabulate

# --- Parima F1 lävendi leidmine ---
def find_best_threshold(df: pd.DataFrame, score_col: str, label_col: str = "label"):
    y_true = df[label_col].astype(int).values
    y_scores = df[score_col].values
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    best_idx = np.argmax(f1_scores)
    best_thresh = thresholds[best_idx]
    best_f1 = f1_scores[best_idx]
    best_prec = precisions[best_idx]
    best_rec = recalls[best_idx]
    f1_curve = list(zip(thresholds, f1_scores[:-1]))
    return best_thresh, best_f1, best_prec, best_rec, f1_curve

# --- Kõikide skeemide tulemused ---
def find_best_thresholds_for_all(df: pd.DataFrame, score_cols: list[str], label_col: str = "label"):
    results = []
    all_curves = {}
    for col in score_cols:
        best_thresh, best_f1, best_prec, best_rec, f1_curve = find_best_threshold(df, col, label_col)
        scheme_name = col.replace("_score", "")
        if "_ascii" in col:
            scheme_name += "_ascii"
        results.append({
            "Skeem": scheme_name,
            "Parim lävend": round(best_thresh, 2),
            "Täpsus": round(best_prec, 4),
            "Saagis": round(best_rec, 4),
            "F1-skoor": round(best_f1, 4)
        })
        all_curves[col] = f1_curve
    return pd.DataFrame(results).sort_values(by="F1-skoor", ascending=False), all_curves

# --- LaBSE sarnasuste arvutamine ---
def calculate_similarities(text_pairs, model):
    similarities = []
    for text1, text2 in text_pairs:
        emb1 = model.encode(text1, convert_to_tensor=True)
        emb2 = model.encode(text2, convert_to_tensor=True)
        sim = torch.nn.functional.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0))
        similarities.append(sim.item())
    return np.array(similarities)

# --- Andmevalim ja mudel ---
print("Laadin andmevalimi ja mudeli...")
df_sample = pd.concat([
    df[df['label'] == True].sample(5000, random_state=42),
    df[df['label'] == False].sample(5000, random_state=42)
])
model = SentenceTransformer('LaBSE')


  from .autonotebook import tqdm as notebook_tqdm


Laadin andmevalimi ja mudeli...


In [6]:
columns_to_compare = [
                     "OS_ascii",
                     "alias_final",
                      
                     "iso9",
                     "dstu_a",
                     "icao",
                     "dstu_b",
                     "gost_b",
                     "bgn",
                     "rt_translit",
                     "eki",
                     
                     "iso9_ascii",
                     "dstu_a_ascii",
                     "icao_ascii",
                     "dstu_b_ascii",
                     "gost_b_ascii",
                     "bgn_ascii",
                     "rt_translit_ascii",
                     "eki_ascii"
                     ]
score_columns = []

for col in columns_to_compare:
    print(f"Töötlen skeemi: {col}")
    text_pairs = list(zip(df_sample["name_final"], df_sample[col]))
    similarities = calculate_similarities(text_pairs, model)
    df_sample[f"{col}_score"] = similarities
    score_columns.append(f"{col}_score")

results_df, _ = find_best_thresholds_for_all(df_sample, score_columns)

print(tabulate(results_df, headers='keys', tablefmt='plain', floatfmt=".4f"))


Töötlen skeemi: OS_ascii
Töötlen skeemi: alias_final
Töötlen skeemi: iso9
Töötlen skeemi: dstu_a
Töötlen skeemi: icao
Töötlen skeemi: dstu_b
Töötlen skeemi: gost_b
Töötlen skeemi: bgn
Töötlen skeemi: rt_translit
Töötlen skeemi: eki
Töötlen skeemi: iso9_ascii
Töötlen skeemi: dstu_a_ascii
Töötlen skeemi: icao_ascii
Töötlen skeemi: dstu_b_ascii
Töötlen skeemi: gost_b_ascii
Töötlen skeemi: bgn_ascii
Töötlen skeemi: rt_translit_ascii
Töötlen skeemi: eki_ascii

TULEMUSED (LaBSE + name_final):
    Skeem                      Parim lävend    Täpsus    Saagis    F1-skoor
12  icao_ascii_ascii                 0.5200    0.9534    0.9360      0.9446
 4  icao                             0.5200    0.9534    0.9360      0.9446
15  bgn_ascii_ascii                  0.5000    0.9456    0.9416      0.9436
 7  bgn                              0.5000    0.9463    0.9406      0.9434
 6  gost_b                           0.5100    0.9497    0.9260      0.9377
 0  OS_ascii_ascii                   0.5000    0.938

In [7]:
columns_to_compare = [
                     "alias_final",
                     "iso9_ascii",
                     "dstu_a_ascii",
                     "icao_ascii",
                     "dstu_b_ascii",
                     "gost_b_ascii",
                     "bgn_ascii",
                     "rt_translit_ascii",
                     "eki_ascii",
                     "OS_ascii"
                     ]
score_columns = []

for col in columns_to_compare:
    print(f"Töötlen skeemi: {col}")
    text_pairs = list(zip(df_sample["name_ascii"], df_sample[col]))
    similarities = calculate_similarities(text_pairs, model)
    df_sample[f"{col}_score"] = similarities
    score_columns.append(f"{col}_score")

results_df, _ = find_best_thresholds_for_all(df_sample, score_columns)

print(tabulate(results_df, headers='keys', tablefmt='plain', floatfmt=".4f"))


Töötlen skeemi: alias_final
Töötlen skeemi: iso9_ascii
Töötlen skeemi: dstu_a_ascii
Töötlen skeemi: icao_ascii
Töötlen skeemi: dstu_b_ascii
Töötlen skeemi: gost_b_ascii
Töötlen skeemi: bgn_ascii
Töötlen skeemi: rt_translit_ascii
Töötlen skeemi: eki_ascii
Töötlen skeemi: OS_ascii
    Skeem                      Parim lävend    Täpsus    Saagis    F1-skoor
 3  icao_ascii_ascii                 0.5000    0.9416    0.9484      0.9450
 6  bgn_ascii_ascii                  0.5000    0.9458    0.9414      0.9436
 9  OS_ascii_ascii                   0.5000    0.9389    0.9370      0.9379
 8  eki_ascii_ascii                  0.5000    0.9475    0.9280      0.9377
 4  dstu_b_ascii_ascii               0.4700    0.9393    0.9352      0.9373
 5  gost_b_ascii_ascii               0.5200    0.9568    0.9170      0.9365
 2  dstu_a_ascii_ascii               0.4600    0.9274    0.9456      0.9364
 7  rt_translit_ascii_ascii          0.5000    0.9503    0.9212      0.9355
 1  iso9_ascii_ascii                

In [None]:
#!jupyter nbconvert --to html labse.ipynb --log-level=ERROR > nul 2>&1