In [None]:
import os
import re
import subprocess
import tempfile

import pandas as pd

In [None]:
# Corpus slice to evaluate. Alternatives: "reddit", "uber".
corpora = "wiki"

# Columns that the scoring cells below treat as the reference corrections.
reference_columns = [
    "spivavtor_correction",
    "pravopysnyk_correction",
    "language_tool_correction",
]

eval_raw = pd.read_csv("../../datasets/automatic_evaluation/mutliref.csv")

# The scoring cells read the columns "text" and "correction", while this cell
# used to keep only "feature" and "target". Map the old headers across when
# the new ones are not already present.
# NOTE(review): assumes "feature" is the source text and "target" is the
# correction being scored -- confirm against the CSV schema.
legacy_column_names = {"feature": "text", "target": "correction"}
eval_renamed = eval_raw.rename(columns={
    old_name: new_name
    for old_name, new_name in legacy_column_names.items()
    if old_name in eval_raw.columns and new_name not in eval_raw.columns
})

# Every column a later cell touches must survive the selection: "corpora" is
# needed for the filter below and "language" by the ERRANT cell.
required_columns = ["text", "correction", "corpora", "language", *reference_columns]
missing_columns = [name for name in required_columns if name not in eval_renamed.columns]
if missing_columns:
    raise KeyError(f"mutliref.csv is missing required columns: {missing_columns}")

# Keep only the rows of the selected corpus. The result is assigned: the
# original computed this filter and then discarded it.
eval_df = eval_renamed.loc[eval_renamed["corpora"] == corpora, required_columns]
eval_df.head()

In [None]:
# Multi-reference GLEU: score the "correction" column against every column in
# `reference_columns`, using the external `gleu` command-line tool.
# Rows missing the source, the hypothesis or any reference are dropped first.
filtered_df = eval_df.dropna(subset=['text', 'correction', *reference_columns])

if filtered_df.empty:
    print("No valid rows for evaluation with multiple references.")
else:
    # One input file per role. Insertion order matters: the references are
    # passed to the tool in the order of `reference_columns`.
    gleu_column_by_role = {
        "source": "text",
        "hypothesis": "correction",
        **{
            f"reference_{position}": column
            for position, column in enumerate(reference_columns, start=1)
        },
    }

    # The directory and everything in it is removed when the block exits,
    # including when an exception is raised part-way through.
    with tempfile.TemporaryDirectory() as gleu_dir:
        gleu_paths = {}
        for role, column in gleu_column_by_role.items():
            # Each row must occupy exactly one line of its file, so embedded
            # newlines are replaced with a space.
            one_line_texts = [
                value.replace("\n", " ").strip() for value in filtered_df[column].tolist()
            ]
            gleu_paths[role] = os.path.join(gleu_dir, f"{role}.txt")
            # Explicit UTF-8: the locale default cannot be relied on for
            # non-ASCII text.
            with open(gleu_paths[role], "w", encoding="utf-8") as handle:
                handle.write('\n'.join(one_line_texts) + '\n')

        reference_paths = [
            path for role, path in gleu_paths.items() if role.startswith("reference_")
        ]

        # NOTE(review): the per-flag comments are carried over from the
        # original cell; verify them against `gleu --help`.
        gleu_command = [
            'gleu',
            '-s', gleu_paths["source"],
            '-r', *reference_paths,
            '-o', gleu_paths["hypothesis"],
            '-d', '4',  # Maximum n-gram length.
            '-f',       # Calculate sentence-level GLEU.
            '-n', '4',  # Order of n-grams.
            '-t', 'word'  # Tokenization type.
        ]

        print(f"Running GLEU command: {' '.join(gleu_command)}")
        # Argument list instead of a shell string, so paths with spaces or
        # shell metacharacters are passed through intact.
        try:
            gleu_run = subprocess.run(gleu_command, capture_output=True, text=True)
        except FileNotFoundError:
            # Same best-effort outcome as before: no output means "n/a".
            print("The `gleu` executable was not found on PATH; recording the score as n/a.")
            gleu_output = ""
        else:
            if gleu_run.returncode != 0:
                print(f"gleu exited with status {gleu_run.returncode}:\n{gleu_run.stderr}")
            gleu_output = gleu_run.stdout
        print(gleu_output)

    # The score is taken from the second whitespace-separated token of the output.
    gleu_tokens = gleu_output.split()
    gleu_score = gleu_tokens[1] if len(gleu_tokens) > 1 else "n/a"

    # Store and display the result.
    results = {
        'reference_columns': reference_columns,
        'gleu_score': gleu_score,
        'n_sentences': len(filtered_df)
    }

    results_df = pd.DataFrame([results])
    results_df.to_csv(f"../../datasets/automatic_evaluation/{corpora}_gleu_multi_ref.csv", index=False)
    display(results_df)

In [None]:
# Multi-reference ERRANT: build one M2 file from the references and one from
# the prediction, compare them with `errant_compare`, and store precision,
# recall and F0.5 (as percentages) in a CSV.
filtered_df = eval_df.dropna(subset=['text', 'correction', *reference_columns])

language_to_lang_code_mapping = {
    "ukrainian": "uk",
}
errant_csv_path = f"../../datasets/automatic_evaluation/{corpora}_errant_multi_ref.csv"

# If filtered_df is empty, you won't get any scores
if filtered_df.empty:
    print("No valid rows for multi-reference ERRANT scoring.")
    errant_results_df = pd.DataFrame(columns=["n_sentences", "precision", "recall", "f0.5"])
    errant_results_df.to_csv(errant_csv_path, index=False)
else:
    # The language lookup happens only here: on an empty frame `unique()[0]`
    # raises IndexError, which used to make the branch above unreachable.
    languages = filtered_df["language"].unique()
    if len(languages) > 1:
        print(f"Warning: several languages present {list(languages)}; using the first one.")
    language = languages[0]
    # Raises KeyError for a language without a configured code.
    lang_code = language_to_lang_code_mapping[language]

    # One input file per role; the references keep the order of `reference_columns`.
    errant_column_by_role = {
        "original": "text",
        "prediction": "correction",
        **{
            f"reference_{position}": column
            for position, column in enumerate(reference_columns, start=1)
        },
    }

    # Inputs and M2 files all live in one temporary directory, which is
    # removed on exit even when a command fails.
    with tempfile.TemporaryDirectory() as errant_dir:
        errant_paths = {}
        for role, column in errant_column_by_role.items():
            # Each row must occupy exactly one line of its file.
            one_line_texts = [
                value.replace("\n", " ").strip() for value in filtered_df[column].tolist()
            ]
            errant_paths[role] = os.path.join(errant_dir, f"{role}.txt")
            # Explicit UTF-8: the locale default cannot be relied on for
            # non-ASCII text.
            with open(errant_paths[role], "w", encoding="utf-8") as handle:
                handle.write("\n".join(one_line_texts) + "\n")

        reference_paths = [
            path for role, path in errant_paths.items() if role.startswith("reference_")
        ]
        ref_m2 = os.path.join(errant_dir, "ref.m2")
        hyp_m2 = os.path.join(errant_dir, "pred.m2")

        # 1) Build an M2 for the references (multi-ref):
        #    errant_parallel -orig <orig> -cor <ref1> <ref2> ... -out <ref_m2> -lang <lang_code>
        errant_parallel_refs_cmd = [
            "errant_parallel",
            "-orig", errant_paths["original"],
            "-cor", *reference_paths,
            "-out", ref_m2,
            "-lang", lang_code,
        ]
        print("Building reference M2:\n", " ".join(errant_parallel_refs_cmd))
        # check=True: stop here rather than score against a missing or partial M2.
        subprocess.run(errant_parallel_refs_cmd, check=True)

        # 2) Build an M2 for the prediction:
        #    errant_parallel -orig <orig> -cor <prediction> -out <hyp_m2> -lang <lang_code>
        errant_parallel_pred_cmd = [
            "errant_parallel",
            "-orig", errant_paths["original"],
            "-cor", errant_paths["prediction"],
            "-out", hyp_m2,
            "-lang", lang_code,
        ]
        print("Building prediction M2:\n", " ".join(errant_parallel_pred_cmd))
        subprocess.run(errant_parallel_pred_cmd, check=True)

        # 3) Compare the two M2 files:
        #    errant_compare -hyp <hyp_m2> -ref <ref_m2>
        errant_compare_cmd = ["errant_compare", "-hyp", hyp_m2, "-ref", ref_m2]
        print("Comparing M2 files:\n", " ".join(errant_compare_cmd))
        errant_output = subprocess.run(
            errant_compare_cmd, check=True, capture_output=True, text=True
        ).stdout
        print(errant_output)

    # ----------------------------------------------------------------
    # Parse the precision, recall, and F0.5 from ERRANT's output
    # Example block:
    # =========== Span-Based Correction ============
    # TP      FP      FN      Prec    Rec     F0.5
    # 12      4       6       0.75    0.6667  0.7317
    # ==============================================
    # The pattern matches the first run of three consecutive decimal numbers.
    # ----------------------------------------------------------------
    prf_pattern = re.compile(r"(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")
    match = prf_pattern.search(errant_output)
    if match is None:
        # Writing 0/0/0 here would be indistinguishable from a real zero score.
        raise RuntimeError(
            f"Could not find precision/recall/F0.5 in errant_compare output:\n{errant_output}"
        )
    precision_str, recall_str, f05_str = match.groups()

    # Store the scores as percentages.
    results_dict = {
        "n_sentences": len(filtered_df),
        "precision": float(precision_str) * 100,
        "recall": float(recall_str) * 100,
        "f0.5": float(f05_str) * 100
    }
    errant_results_df = pd.DataFrame([results_dict])
    errant_results_df.to_csv(errant_csv_path, index=False)