In [1]:
import pandas as pd

# Load your data: the CSV file is read with ";" as the field separator.
df = pd.read_csv("df_pairs_final.csv", sep=";")

In [18]:
pip install jellyfish

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd
import numpy as np
from jellyfish import levenshtein_distance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

# --- Tokenization and metric computation ---
def tokenize(text):
    """Return the set of distinct lowercase tokens in ``text``.

    The string is lowercased and split on runs of whitespace; duplicates
    collapse because the result is a set. An empty or whitespace-only string
    gives an empty set.
    """
    lowered = text.lower()
    return {token for token in lowered.split()}

def calculate_all_metrics(row, column_name, target_col):
    """Compute three similarity features between two string fields of ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide string values under ``column_name`` and ``target_col``.
    column_name : str
        Key of the value being scored.
    target_col : str
        Key of the value it is compared against.

    Returns
    -------
    pd.Series
        Three positional values, in order:
        token overlap, Levenshtein similarity, exact-match flag (1 or 0).
    """
    candidate = row[column_name]
    reference = row[target_col]

    # Token overlap: shared tokens divided by the smaller token count.
    # The divisor is floored at 2, so two identical single-token strings
    # score 0.5 rather than 1.0.
    # NOTE(review): presumably a deliberate damping of one-token matches - confirm.
    candidate_tokens = tokenize(candidate)
    reference_tokens = tokenize(reference)
    shared_count = len(candidate_tokens & reference_tokens)
    smaller_count = min(len(candidate_tokens), len(reference_tokens))
    token_overlap = shared_count / max(smaller_count, 2)

    # Levenshtein similarity: edit distance normalised by the longer string's
    # length; two empty strings are treated as a perfect match.
    edit_distance = levenshtein_distance(candidate, reference)
    longest = max(len(candidate), len(reference))
    if longest > 0:
        levenshtein_score = 1 - edit_distance / longest
    else:
        levenshtein_score = 1.0

    # Exact (case-sensitive) equality of the raw strings.
    full_match = int(candidate == reference)

    return pd.Series([token_overlap, levenshtein_score, full_match])

def run_logistic_regression_component_model(df, columns, target_col):
    """Fit one logistic regression per column and report its best-F1 threshold.

    For each name in ``columns`` the three features returned by
    ``calculate_all_metrics`` (token overlap, Levenshtein similarity, exact
    match) are computed row by row against ``target_col``. A logistic
    regression is fitted on those features with ``df["label"]`` as the target,
    and the probability threshold on a 0.00-1.00 grid (step 0.01) that
    maximises F1 is selected. On ties the lowest threshold wins.

    The model is fitted and scored on the same rows, so the reported
    precision, recall and F1 are in-sample figures.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain every column in ``columns``, the ``target_col`` column
        and a ``"label"`` column castable to int. ``df`` is not modified:
        features are built in a separate frame, so repeated calls (including
        calls that share a column name) cannot overwrite each other's state.
    columns : iterable of str
        Columns to evaluate, one model per column.
    target_col : str
        Column every entry of ``columns`` is compared against.

    Returns
    -------
    pd.DataFrame
        One row per evaluated column with the columns
        "Skeem", "Parim lävend", "Täpsus", "Saagis", "F1-skoor"
        (column name, best threshold, precision, recall, F1), rounded to
        2 / 4 / 4 / 4 decimals respectively.
    """
    y_true = df["label"].astype(int)
    # linspace gives exactly 101 grid points 0.00 ... 1.00; a float-step
    # arange can gain or lose its end point depending on rounding.
    thresholds = np.linspace(0.0, 1.0, 101)
    results = []

    for col in columns:
        feature_cols = [f"{col}_token_overlap", f"{col}_levenshtein", f"{col}_full_match"]

        # Build the feature matrix locally instead of writing new columns
        # into the caller's DataFrame.
        X = df.apply(calculate_all_metrics, axis=1, args=(col, target_col))
        X.columns = feature_cols

        model = LogisticRegression(penalty='l2', solver='liblinear', random_state=123)
        model.fit(X, y_true)
        proba = model.predict_proba(X)[:, 1]  # probability of the positive class

        best_f1 = 0
        best_threshold = 0
        best_precision = 0
        best_recall = 0

        for threshold in thresholds:
            y_pred = (proba >= threshold).astype(int)
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_true, y_pred, average='binary', zero_division=0
            )
            # Strict ">" keeps the first (lowest) threshold among equal F1 scores.
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
                best_precision = precision
                best_recall = recall

        results.append([
            col,
            round(best_threshold, 2),
            round(best_precision, 4),
            round(best_recall, 4),
            round(best_f1, 4)
        ])

    return pd.DataFrame(results, columns=["Skeem", "Parim lävend", "Täpsus", "Saagis", "F1-skoor"])


In [9]:
# Columns evaluated in this run; each one is compared against "name_final".
columns = [
    "iso9",
    "dstu_a",
    "icao",
    "dstu_b",
    "gost_b",
    "bgn",
    "rt_translit",
    "eki",
    "OS_ascii",
]
target_col = "name_final"

# One logistic-regression model per column; the result holds one summary row each.
df_results = run_logistic_regression_component_model(
    df,
    columns,
    target_col,
)

# Plain-text table without the index column.
print("Tulemused (põhiskeemid):")
print(df_results.to_string(index=False))

Tulemused (põhiskeemid):
      Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
       iso9          0.42  0.9561  0.8394    0.8940
     dstu_a          0.43  0.9389  0.8108    0.8702
       icao          0.45  0.9769  0.8647    0.9174
     dstu_b          0.42  0.9362  0.8224    0.8756
     gost_b          0.43  0.9653  0.8510    0.9046
        bgn          0.47  0.9792  0.8616    0.9167
rt_translit          0.42  0.9617  0.8448    0.8994
        eki          0.43  0.9672  0.8459    0.9025
   OS_ascii          0.46  0.9749  0.8757    0.9227


In [10]:
# Columns evaluated in this run; each one is compared against "name_ascii".
ascii_columns = [
    "iso9_ascii",
    "dstu_a_ascii",
    "icao_ascii",
    "dstu_b_ascii",
    "gost_b_ascii",
    "bgn_ascii",
    "rt_translit_ascii",
    "eki_ascii",
    "OS_ascii",
]
target_col = "name_ascii"

# One logistic-regression model per column; the result holds one summary row each.
df_results_ascii = run_logistic_regression_component_model(
    df,
    ascii_columns,
    target_col,
)

# Plain-text table without the index column.
print("Tulemused (ASCII skeemid):")
print(df_results_ascii.to_string(index=False))

Tulemused (ASCII skeemid):
            Skeem  Parim lävend  Täpsus  Saagis  F1-skoor
       iso9_ascii          0.41  0.9614  0.8548    0.9050
     dstu_a_ascii          0.46  0.9574  0.8123    0.8789
       icao_ascii          0.44  0.9767  0.8674    0.9188
     dstu_b_ascii          0.43  0.9451  0.8187    0.8774
     gost_b_ascii          0.45  0.9728  0.8474    0.9058
        bgn_ascii          0.46  0.9791  0.8647    0.9183
rt_translit_ascii          0.42  0.9650  0.8498    0.9037
        eki_ascii          0.40  0.9618  0.8575    0.9067
         OS_ascii          0.45  0.9755  0.8806    0.9256


In [None]:
# !jupyter nbconvert --to html regression.ipynb --log-level=ERROR > nul 2>&1