In [None]:
!pip install regex pandas syntok

!git clone https://github.com/cainesap/errant
!cd errant
!pip install -e .
!cd ../

!git clone https://github.com/cainesap/spacy_conll
!cd spacy_conll
!pip install -e .
!cd ../

!pip install spacy-udpipe
!pip install evaluate
!mkdir spacy_udpipe_models
!pip install language-tool-python

In [None]:
%cd gleu && pip install -e .

In [None]:
import pandas as pd
import evaluate

from tqdm import tqdm
import language_tool_python

from src.prompts.automatic_evaluation.spivavtor import SpivavtorGECPrompt, spivavtor_gec_verbalizers
import random

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [None]:
# Register `progress_apply` on pandas objects. `tqdm.pandas` has to be *called*;
# a bare attribute reference does nothing, and the later `progress_apply` calls
# would then fail on a fresh kernel.
tqdm.pandas()

# Load the annotated sentences and tag every row with its language; later cells
# read the "text" and "language" columns of this frame.
evaluation_df = pd.read_csv("../../datasets/annotations/anot_1500.csv")
evaluation_df.loc[:, "language"] = "ukrainian"
evaluation_df

In [None]:
# Tokenizer and seq2seq model are loaded from the same checkpoint; the model is
# then moved to the "mps" device.
SPIVAVTOR_CHECKPOINT = "grammarly/spivavtor-large"

tokenizer = AutoTokenizer.from_pretrained(SPIVAVTOR_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(SPIVAVTOR_CHECKPOINT)
model = model.to(device="mps")

def spivavtor_gec(input_text: str) -> str:
    """Return the Spivavtor model's correction of ``input_text``.

    The text is inserted into ``SpivavtorGECPrompt().prompt_template`` together
    with a verbalizer drawn at random from ``spivavtor_gec_verbalizers``, so the
    prompt (and therefore the output) can differ between runs unless the
    ``random`` module is seeded.

    Args:
        input_text: The sentence to correct.

    Returns:
        The first generated sequence, decoded with special tokens removed.
        Generation is called with ``max_length=256``.
    """
    # Named `prompt` rather than `input` so the builtin is not shadowed.
    prompt = SpivavtorGECPrompt().prompt_template.format(
        original_text=input_text,
        verbalizer=random.choice(spivavtor_gec_verbalizers)
    )
    # Put the token ids on whatever device the model lives on instead of
    # repeating a hard-coded device name here.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(input_ids, max_length=256)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Run Spivavtor over every source sentence (with a progress bar) and store the
# result in a new column.
source_texts = evaluation_df.loc[:, "text"]
evaluation_df.loc[:, "spivavtor_correction"] = source_texts.progress_apply(spivavtor_gec)

In [None]:
model_checkpoint = "Pravopysnyk/best-unlp"
# The checkpoint is driven through a "translation" pipeline with the same
# source and target language code.
translator = pipeline("translation", src_lang="uk_UA", tgt_lang="uk_UA", model=model_checkpoint, device="mps")


def pravopysnyk_gec(input_text: str) -> str:
    """Return the Pravopysnyk pipeline's corrected version of ``input_text``.

    The pipeline returns a list with one dict per input; the corrected string
    is under the ``'translation_text'`` key of the first entry.
    """
    return translator(input_text, max_length=400)[0]['translation_text']


# Correct only the rows tagged as Ukrainian. The string is extracted in the
# same pass, so the column never holds raw pipeline output and the frame is
# traversed once rather than twice.
ukrainian_mask = (evaluation_df.loc[:, "language"] == "ukrainian")
evaluation_df.loc[ukrainian_mask, "pravopysnyk_correction"] = (
    evaluation_df.loc[ukrainian_mask, "text"].progress_apply(pravopysnyk_gec)
)

In [None]:
# Maps the dataset's language labels to LanguageTool language codes.
language_mapping = {
    'ukrainian': 'uk-UA'
}


def correct_text(text, lang, tool=None):
    """Apply LanguageTool's suggested corrections to ``text``.

    Args:
        text: The text to check.
        lang: A language label; it is looked up in ``language_mapping``.
        tool: Optional, already-initialised ``language_tool_python.LanguageTool``
            to reuse. When omitted, a tool is created for this call and closed
            before returning. A tool passed in by the caller is never closed
            here.

    Returns:
        The corrected text, or ``text`` unchanged (after printing a notice)
        when ``lang`` has no entry in ``language_mapping``.
    """
    # Map the language to the corresponding LanguageTool code
    lt_lang_code = language_mapping.get(lang)
    if not lt_lang_code:
        # If the language is not supported, return the original text
        print("lang", lang, " not found")

        return text

    owns_tool = tool is None
    if owns_tool:
        tool = language_tool_python.LanguageTool(lt_lang_code)
    try:
        matches = tool.check(text)
        return language_tool_python.utils.correct(text, matches)
    finally:
        # Close a tool created here even if checking raised.
        if owns_tool:
            tool.close()


tqdm.pandas()

# Create one LanguageTool instance per supported language present in the data
# and reuse it for every row, instead of creating and closing one per row.
language_tools = {}
try:
    for dataset_lang in evaluation_df['language'].unique():
        dataset_lang_code = language_mapping.get(dataset_lang)
        if dataset_lang_code:
            language_tools[dataset_lang] = language_tool_python.LanguageTool(dataset_lang_code)

    evaluation_df.loc[:, 'language_tool_correction'] = evaluation_df.progress_apply(
        lambda row: correct_text(
            row['text'],
            row['language'],
            tool=language_tools.get(row['language']),
        ),
        axis=1
    )
finally:
    # Always close the shared tools, whether or not the apply succeeded.
    for open_tool in language_tools.values():
        open_tool.close()

In [None]:
# Write the frame, including the correction columns added above, to CSV
# without the index column.
multiref_csv_path = "../../datasets/automatic_evaluation/multiref.csv"
evaluation_df.to_csv(multiref_csv_path, index=False)