In [6]:
import pandas as pd
import stanza

In [7]:
# Options for a Belarusian pipeline that runs only the "tokenize" processor.
# Models are looked up under ../temp/stanza (relative to the working directory).
# NOTE(review): REUSE_RESOURCES presumably avoids re-fetching resources that are
# already present in model_dir -- confirm against the Stanza documentation.
_stanza_pipeline_options = {
    "lang": "be",
    "model_dir": "../temp/stanza",
    "processors": "tokenize",
    "download_method": stanza.DownloadMethod.REUSE_RESOURCES,
}
stanza_tokenizer = stanza.Pipeline(**_stanza_pipeline_options)

2024-05-15 18:50:03 INFO: Loading these models for language: be (Belarusian):
| Processor | Package |
-----------------------
| tokenize  | hse     |

2024-05-15 18:50:03 INFO: Using device: cpu
2024-05-15 18:50:03 INFO: Loading: tokenize
2024-05-15 18:50:03 INFO: Done loading processors!


In [8]:
def get_total_words_with_errors(doc_orig, doc_with_errors) -> int:
    """Count the word positions at which two documents differ.

    Words are paired positionally: the i-th word yielded by
    ``doc_orig.iter_words()`` is compared with the i-th word yielded by
    ``doc_with_errors.iter_words()``. Pairing stops at the end of the shorter
    document, so unmatched trailing words are not counted.

    Args:
        doc_orig: Document built from the original text; must provide
            ``iter_words()`` yielding objects with a ``text`` attribute.
        doc_with_errors: Document built from the text containing errors;
            same requirements as ``doc_orig``.

    Returns:
        The number of paired words whose ``text`` values are not equal
        (``0`` when either document yields no words).
    """
    word_pairs = zip(doc_orig.iter_words(), doc_with_errors.iter_words())
    # Each mismatch contributes True (== 1) to the sum.
    return sum(
        word_orig.text != word_with_errors.text
        for word_orig, word_with_errors in word_pairs
    )

In [9]:
def compare_docs(doc_orig, doc_with_errors, doc_after_correction) -> dict:
    """Classify each word position by what the correction step did to it.

    The three documents are walked in parallel through ``iter_words()``;
    iteration ends with the shortest of them. At every position the ``text``
    of the original word, the word with errors and the corrected word are
    compared, and one of four counters is incremented (positions that were
    error-free and left untouched are not counted).

    Args:
        doc_orig: Document of the original text.
        doc_with_errors: Document of the text containing errors.
        doc_after_correction: Document of the text after correction.

    Returns:
        A dict mapping four category labels to their counts: errors fixed
        correctly, errors fixed incorrectly, errors left as they were, and
        error-free words that were changed.
    """
    fixed_correctly = "Words with errors, fixed correctly"
    fixed_incorrectly = "Words with errors, fixed incorrectly"
    not_fixed = "Words with errors, not fixed"
    mistakenly_fixed = "Words with no errors, mistakenly \"fixed\""

    # Insertion order fixes the row order of any table built from the result.
    tally = {
        fixed_correctly: 0,
        fixed_incorrectly: 0,
        not_fixed: 0,
        mistakenly_fixed: 0,
    }

    aligned_words = zip(
        doc_orig.iter_words(),
        doc_with_errors.iter_words(),
        doc_after_correction.iter_words(),
    )
    for original, damaged, corrected in aligned_words:
        expected_text = original.text
        damaged_text = damaged.text
        corrected_text = corrected.text

        if expected_text == damaged_text:
            # No error was present here; any change is a false correction.
            if expected_text != corrected_text:
                tally[mistakenly_fixed] += 1
            continue

        # An error was present: restored, left unchanged, or replaced by a
        # different wrong word.
        if expected_text == corrected_text:
            category = fixed_correctly
        elif damaged_text == corrected_text:
            category = not_fixed
        else:
            category = fixed_incorrectly
        tally[category] += 1

    return tally

In [10]:
def _read_text(path: str) -> str:
    """Return the entire contents of the UTF-8 encoded text file at ``path``."""
    with open(path, mode="r", encoding="utf-8") as file:
        return file.read()


def _evaluate_corrections(text_id, variants, doc_orig, doc_with_errors) -> pd.DataFrame:
    """Score every corrected variant of one text with ``compare_docs``.

    Args:
        text_id: Number of the text; used for both the directory and the
            file-name prefix under ``texts_after_corrections/``.
        variants: Mapping from result-column label to file-name suffix; the
            file read is
            ``texts_after_corrections/{text_id}/{text_id}_{suffix}.txt``.
        doc_orig: Tokenized original text.
        doc_with_errors: Tokenized text containing errors.

    Returns:
        DataFrame with one column per label (in mapping order) and one row
        per category returned by ``compare_docs``.
    """
    # Build all columns first and construct the frame once, instead of
    # growing an empty DataFrame column by column.
    return pd.DataFrame({
        label: compare_docs(
            doc_orig,
            doc_with_errors,
            stanza_tokenizer(
                _read_text(f"texts_after_corrections/{text_id}/{text_id}_{suffix}.txt")
            ),
        )
        for label, suffix in variants.items()
    })


# Texts are numbered 1..3; each one is evaluated independently.
for text in range(1, 4):
    text_orig = _read_text(f"original_texts/{text}_orig.txt")
    text_with_errors = _read_text(f"../app/public/static/texts/{text}.txt")
    doc_orig = stanza_tokenizer(text_orig)
    doc_with_errors = stanza_tokenizer(text_with_errors)

    print(
        f"----------\n"
        f"Text {text}\n"
        f"Total words: {doc_orig.num_words}\n"
        f"Total words with errors: {get_total_words_with_errors(doc_orig, doc_with_errors)}"
    )

    # Columns are the integer distance values 1..5 used in the file names.
    results_damerau = _evaluate_corrections(
        text,
        {distance: f"damerau_{distance}" for distance in range(1, 6)},
        doc_orig,
        doc_with_errors,
    )
    print("Damerau-Levenshtein:")
    display(results_damerau)

    # File names carry the value as a percentage (95, 90, ..., 75); the
    # column label shows it as a fraction ("0.95", ..., "0.75").
    results_jaro = _evaluate_corrections(
        text,
        {
            f"0.{similarity}": f"jaro_{similarity}"
            for similarity in range(95, 74, -5)
        },
        doc_orig,
        doc_with_errors,
    )
    print("Jaro-Winkler:")
    display(results_jaro)

----------
Text 1
Total words: 1186
Total words with errors: 265
Damerau-Levenshtein:


Unnamed: 0,1,2,3,4,5
"Words with errors, fixed correctly",150,163,164,164,164
"Words with errors, fixed incorrectly",73,89,89,89,89
"Words with errors, not fixed",42,13,12,12,12
"Words with no errors, mistakenly ""fixed""",21,24,29,29,29


Jaro-Winkler:


Unnamed: 0,0.95,0.90,0.85,0.80,0.75
"Words with errors, fixed correctly",136,162,163,163,163
"Words with errors, fixed incorrectly",57,83,90,90,90
"Words with errors, not fixed",72,20,12,12,12
"Words with no errors, mistakenly ""fixed""",21,27,28,29,29


----------
Text 2
Total words: 1217
Total words with errors: 227
Damerau-Levenshtein:


Unnamed: 0,1,2,3,4,5
"Words with errors, fixed correctly",124,141,141,141,141
"Words with errors, fixed incorrectly",63,75,76,76,76
"Words with errors, not fixed",40,11,10,10,10
"Words with no errors, mistakenly ""fixed""",21,29,29,29,29


Jaro-Winkler:


Unnamed: 0,0.95,0.90,0.85,0.80,0.75
"Words with errors, fixed correctly",106,140,142,142,142
"Words with errors, fixed incorrectly",44,72,75,75,75
"Words with errors, not fixed",77,15,10,10,10
"Words with no errors, mistakenly ""fixed""",10,26,27,29,29


----------
Text 3
Total words: 1078
Total words with errors: 214
Damerau-Levenshtein:


Unnamed: 0,1,2,3,4,5
"Words with errors, fixed correctly",121,131,131,131,131
"Words with errors, fixed incorrectly",57,67,67,67,67
"Words with errors, not fixed",36,16,16,16,16
"Words with no errors, mistakenly ""fixed""",10,12,14,15,15


Jaro-Winkler:


Unnamed: 0,0.95,0.90,0.85,0.80,0.75
"Words with errors, fixed correctly",116,137,138,138,138
"Words with errors, fixed incorrectly",46,58,60,60,60
"Words with errors, not fixed",52,19,16,16,16
"Words with no errors, mistakenly ""fixed""",6,13,16,16,16
