In [None]:
!pip install python-Levenshtein
!pip install textdistance

In [None]:
import textdistance
import pandas as pd
from Levenshtein import distance as lev_distance

In [None]:
# Corpus to evaluate; also used in the printed labels and the output filename.
corpora = "wiki" # "reddit" # "uber"

eval_df = pd.read_csv("../../datasets/automatic_evaluation/multiref.csv")

# Remove rows that are entirely empty, then exact duplicate rows.
# NOTE(review): a 2-D boolean mask passed to .loc repeats rows instead of
# filtering them, so dropna() is used here — confirm "all-NaN" (rather than
# "any-NaN") is the intended rule.
eval_df = eval_df.dropna(how="all").drop_duplicates()

# Restrict to the selected corpus. This has to happen before the column
# selection below, which does not keep the "corpora" column, and the result
# has to be assigned for the filter to take effect.
eval_df = eval_df.loc[eval_df["corpora"] == corpora]

eval_df = eval_df.loc[:, [
    "feature",
    "target",
    "spivavtor_correction",
    "pravopysnyk_correction",
    "language_tool_correction"
]]

# Keep only rows that have the source text, the reference text and a
# pravopysnyk correction; the edit-distance cells need `feature` and `target`
# to be strings.
eval_df = eval_df.dropna(subset=["feature", "target", "pravopysnyk_correction"])

eval_df.head()

In [None]:
# Per-row edit distance between the source sentence (`feature`) and its
# reference (`target`). These are the columns kept when eval_df is built;
# the frame has no `text` / `correction` columns.
eval_df['feature_target_lev_distance'] = [
    lev_distance(feature, target)
    for feature, target in zip(eval_df['feature'], eval_df['target'])
]
# Same pairs, scored with textdistance's Damerau-Levenshtein distance.
eval_df['feature_target_dam_lev_distance'] = [
    textdistance.damerau_levenshtein.distance(feature, target)
    for feature, target in zip(eval_df['feature'], eval_df['target'])
]

In [None]:
# Corpus-level means of the two per-row distance columns, taken in one pass.
evals_lev_dist, evals_dam_lev_dist = eval_df[
    ['feature_target_lev_distance', 'feature_target_dam_lev_distance']
].mean()

# Human-readable summary for the selected corpus.
distance_summary = f"""
Average feature to target edit distance in {corpora}

Levenshtein distance: {evals_lev_dist:.2f}
Damerau–Levenshtein distance (with swaps): {evals_dam_lev_dist:.2f}
"""
print(distance_summary)

In [None]:
# Normalise each row by the length of the longer of the two strings.
feature_len = eval_df['feature'].str.len()
target_len = eval_df['target'].str.len()
eval_df['denom'] = feature_len.where(feature_len >= target_len, target_len)

# Compute edits per 100 chars from the plain Levenshtein distance, so that
# `cer` really holds the plain (non-Damerau) character error rate.
eval_df['edits_per_100'] = eval_df['feature_target_lev_distance'] / eval_df['denom'] * 100

cer = eval_df['edits_per_100'].mean()

print(f"Average edits/100 chars (CER) in {corpora}: {cer:.2f}")

In [None]:
# Normalise each row by the length of the longer of the two strings.
feature_len = eval_df['feature'].str.len()
target_len = eval_df['target'].str.len()
eval_df['denom'] = feature_len.where(feature_len >= target_len, target_len)

# Compute edits per 100 chars from the Damerau-Levenshtein distance, so that
# `dam_cer` really holds the Damerau-Levenshtein character error rate.
# Note: this overwrites the 'edits_per_100' column with the Damerau variant.
eval_df['edits_per_100'] = eval_df['feature_target_dam_lev_distance'] / eval_df['denom'] * 100

dam_cer = eval_df['edits_per_100'].mean()

print(f"Average edits/100 chars (Damerau-Levenshtein CER) in {corpora}: {dam_cer:.2f}")

In [None]:
# Collect the four corpus-level metrics under their output column names.
results = dict([
    ("edit-distance", evals_lev_dist),
    ("edit-distance (Damerau-Levenshtein)", evals_dam_lev_dist),
    ("cer", cer),
    ("dam_cer", dam_cer),
])

# One-row table: each metric becomes a column holding a single value.
results_df = pd.DataFrame({metric: [value] for metric, value in results.items()})

# Write the table next to the evaluation data, one file per corpus.
results_df.to_csv(
    f"../../datasets/automatic_evaluation/{corpora}_edit_distance.csv",
    index=False,
)