In [62]:
!pip install ansi2html



In [61]:
text = """–î–æ–±—Ä–∏–π –¥–µ–Ω—å —è –≤–∞–º –±—É–≤ –ø–∏—Å–∞–≤ –≤ –º–µ–Ω–µ –±—É–ª–∏ –≤–∏–Ω–µ–∫–ª–∏
–¥–µ—è–∫—ñ –ø—Ä–æ–±–ª–µ–º–∏ —è–∫—â–æ –º–æ–∂–Ω–∞ —è –∑–∞–≤—Ç—Ä–∞ –±—É–¥—É –Ω–∞ —Ä–æ–±–æ—á–æ–º—É –º—ñ—Å—Ç—ñ ?"""

In [63]:
import ast
import os
import re
import nltk
import mlflow
import pandas as pd
import tiktoken
from dotenv import load_dotenv
from omegaconf import OmegaConf
from openai import OpenAI
from flatten_dict import flatten

from src.prompts.reddit_multigec import multi_gec_prompt_per_language
from src.utils.metrics import average_edit_distance
from src.utils.utils import normalize_spaces, generate_original_corrected_texts
from src.utils.comparison_to_html import save_comparison_to_html
from src.prompts.reddit_multigec import gec_aggregation_prompt_per_language

In [64]:
load_dotenv("../../.env")

True

In [65]:
parameters = OmegaConf.load("./parameters.yaml")
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI"))
mlflow.set_experiment(parameters.experiment.experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/875882461670179036', creation_time=1731081700081, experiment_id='875882461670179036', last_update_time=1731081700081, lifecycle_stage='active', name='reddit_ua_exploratory', tags={}>

In [66]:
tokenizer = tiktoken.encoding_for_model(parameters.tokenizer.model_name)

In [67]:
input_text_tokens = len(tokenizer.encode(text))
input_text_tokens

32

In [68]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)

In [70]:


language = "ukrainian"


grammar_correction_prompt = multi_gec_prompt_per_language[language].prompt_template

grammar_correction_prompt_formatted = grammar_correction_prompt.format(
    text=text,
    num_corrections=parameters.multi_gec.num_corrections
)
print(grammar_correction_prompt_formatted)

–í–∏–ø—Ä–∞–≤—Ç–µ –Ω–∞—Å—Ç—É–ø–Ω–∏–π —Ç–µ–∫—Å—Ç, –∑—Ä–æ–±–∏–≤—à–∏ –π–æ–≥–æ –≥—Ä–∞–º–∞—Ç–∏—á–Ω–æ –ø—Ä–∞–≤–∏–ª—å–Ω–∏–º.
–í–∏–ø—Ä–∞–≤—Ç–µ –≤—Å—ñ –æ—Ä—Ñ–æ–≥—Ä–∞—Ñ—ñ—á–Ω—ñ, –ø—É–Ω–∫—Ç—É–∞—Ü—ñ–π–Ω—ñ, —Å—Ç–∏–ª—ñ—Å—Ç–∏—á–Ω—ñ, –≥—Ä–∞–º–∞—Ç–∏—á–Ω—ñ, –ª–µ–∫—Å–∏—á–Ω—ñ —Ç–∞ —Å–∏–Ω—Ç–∞–∫—Å–∏—á–Ω—ñ –ø–æ–º–∏–ª–∫–∏.
–Ø–∫—â–æ –ø–æ–º–∏–ª–æ–∫ –Ω–µ–º–∞—î, –ø–æ–≤—Ç–æ—Ä—ñ—Ç—å –æ—Ä–∏–≥—ñ–Ω–∞–ª—å–Ω–∏–π —Ç–µ–∫—Å—Ç.
–ó–≥–µ–Ω–µ—Ä—É–π—Ç–µ 3 —Ä—ñ–∑–Ω—ñ –≤–∞—Ä—ñ–∞–Ω—Ç–∏ –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç—É –∑ –ø–æ—è—Å–Ω–µ–Ω–Ω—è–º–∏.

–§–æ—Ä–º–∞—Ç –≤—ñ–¥–ø–æ–≤—ñ–¥—ñ —É JSON:
[{
    "correction": "–≤–∏–ø—Ä–∞–≤–ª–µ–Ω–∏–π —Ç–µ–∫—Å—Ç",
    "explanation": "–ø–æ—è—Å–Ω–µ–Ω–Ω—è –¥–æ –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–Ω—è"
}, ...]

–ü—Ä–∏–∫–ª–∞–¥–∏:

1. –í—Ö—ñ–¥–Ω–∏–π —Ç–µ–∫—Å—Ç:
   –û—Å—Ç–∞–Ω–Ω—ñ 3 –º—ñ—Å—è—Ü—ñ –º–æ–≥–æ –∂–∏—Ç—Ç—è –≤–∏–¥–∞–ª–∏—Å—è –∞–∂ –∑–∞–Ω–∞–¥—Ç–æ –Ω–∞—Å–∏—á–µ–Ω–∏–º–∏ –Ω–∞ –ø–æ–¥—ñ—ó —Ç–∞ –µ–º–æ—Ü—ñ—ó, –∞–ª–µ –æ—Å—å –Ω–∞—Ä–µ—à—Ç—ñ —É –º–µ–Ω–µ –∑‚Äô—è–≤–∏–ª–æ—Å—è –¥–µ–∫—ñ–ª—å–∫–∞ –≤—ñ–ª—å–Ω–∏—Ö –≥–æ–¥–∏–Ω —Ç–∞ —Ç—Ä–æ—Ö–∏ 

In [71]:
grammar_correction_prompt_tokens = len(tokenizer.encode(grammar_correction_prompt.template))
grammar_correction_prompt_formatted_tokens = len(tokenizer.encode(grammar_correction_prompt_formatted))
grammar_correction_prompt_tokens, grammar_correction_prompt_formatted_tokens

(3235, 3260)

In [72]:
len(grammar_correction_prompt_formatted), len(grammar_correction_prompt.template)

(9649, 9566)

In [73]:
client = OpenAI(api_key=os.environ.get("OPEN_AI_API_KEY"))
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": grammar_correction_prompt_formatted,
        }
    ],
    temperature=parameters.multi_gec.temperature,
    top_p=parameters.multi_gec.top_p,
    model=parameters.multi_gec.model_name,
)
original_text = text
multi_gec_raw_output = chat_completion.choices[0].message.content
import json
pattern = r'```json\s*\n(?P<json>([\s\S]*?))\n```'
matches = re.finditer(pattern, multi_gec_raw_output, re.MULTILINE)
for match in matches:
    json_content = match.group('json')
    try:
        multi_gec_output = json.loads(json_content)
        print(multi_gec_output)
    except json.JSONDecodeError as e:
        print("Invalid JSON:", e)
else:
    multi_gec_output = ast.literal_eval(multi_gec_raw_output.strip("```json"))

multi_gec_output_tokens = len(tokenizer.encode(multi_gec_raw_output))
multi_gec_correction_output_tokens = len(tokenizer.encode(str([output["correction"] for output in multi_gec_output])))
multi_gec_explanation_output_tokens = len(tokenizer.encode(str([output["explanation"] for output in multi_gec_output])))

multi_gec_output_tokens, multi_gec_correction_output_tokens, multi_gec_explanation_output_tokens

[{'correction': '–î–æ–±—Ä–∏–π –¥–µ–Ω—å! –Ø –≤–∞–º –ø–∏—Å–∞–≤, —É –º–µ–Ω–µ –≤–∏–Ω–∏–∫–ª–∏ –¥–µ—è–∫—ñ –ø—Ä–æ–±–ª–µ–º–∏. –Ø–∫—â–æ –º–æ–∂–Ω–∞, —è –∑–∞–≤—Ç—Ä–∞ –±—É–¥—É –Ω–∞ —Ä–æ–±–æ—á–æ–º—É –º—ñ—Å—Ü—ñ?', 'explanation': "–î–æ–¥–∞–Ω–æ –∑–Ω–∞–∫ –æ–∫–ª–∏–∫—É –ø—ñ—Å–ª—è '–î–æ–±—Ä–∏–π –¥–µ–Ω—å' –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–Ω—è –ø—Ä–∏–≤—ñ—Ç–∞–Ω–Ω—è; –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–æ '–≤–∏–Ω–µ–∫–ª–∏' –Ω–∞ '–≤–∏–Ω–∏–∫–ª–∏' –¥–ª—è –ø—Ä–∞–≤–∏–ª—å–Ω–æ–≥–æ –Ω–∞–ø–∏—Å–∞–Ω–Ω—è; –¥–æ–¥–∞–Ω–æ –∫–æ–º–∏ –¥–ª—è —Ä–æ–∑–¥—ñ–ª–µ–Ω–Ω—è —á–∞—Å—Ç–∏–Ω —Ä–µ—á–µ–Ω–Ω—è; –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–æ '–º—ñ—Å—Ç—ñ' –Ω–∞ '–º—ñ—Å—Ü—ñ' –¥–ª—è –ø—Ä–∞–≤–∏–ª—å–Ω–æ–≥–æ –≤–∂–∏–≤–∞–Ω–Ω—è."}, {'correction': '–î–æ–±—Ä–∏–π –¥–µ–Ω—å! –Ø –ø–∏—Å–∞–≤ –≤–∞–º, —É –º–µ–Ω–µ –≤–∏–Ω–∏–∫–ª–∏ –¥–µ—è–∫—ñ –ø—Ä–æ–±–ª–µ–º–∏. –Ø–∫—â–æ –º–æ–∂–ª–∏–≤–æ, —è –∑–∞–≤—Ç—Ä–∞ –±—É–¥—É –Ω–∞ —Ä–æ–±–æ—á–æ–º—É –º—ñ—Å—Ü—ñ.', 'explanation': "–ó–º—ñ–Ω–µ–Ω–æ –ø–æ—Ä—è–¥–æ–∫ —Å–ª—ñ–≤ –¥–ª—è –ø–æ–∫—Ä–∞—â–µ–Ω–Ω—è —Å—Ç–∏–ª—é; –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–æ '–≤–∏–Ω–µ–∫–ª–∏' –Ω–∞ '–≤–∏–Ω–∏–∫–ª–∏'; –∑–∞–º—ñ–Ω–µ–

(345, 105, 194)

In [74]:
multi_gec_correction_comparison_text: str = ""

for i, correction_reasoning in enumerate(multi_gec_output):
    correction = correction_reasoning["correction"]
    reasoning = correction_reasoning["explanation"]

    text1 = normalize_spaces(original_text)
    text2 = normalize_spaces(correction)

    original_corrected_text = generate_original_corrected_texts(
        original_text=original_text,
        corrected_text=text2)

    multi_gec_correction_comparison_text += f"""
Correction: {i}

Original Text:
{original_corrected_text[0]}

Corrected Text:
{original_corrected_text[1]}

Reasoning:
{reasoning}
    """

print(multi_gec_correction_comparison_text)


Correction: 0

Original Text:
–î–æ–±—Ä–∏–π [91m[1m–¥–µ–Ω—å[0m —è –≤–∞–º [91m[1m–±—É–≤[0m [91m[1m–ø–∏—Å–∞–≤[0m [91m[1m–≤[0m –º–µ–Ω–µ [91m[1m–±—É–ª–∏[0m [91m[1m–≤–∏–Ω–µ–∫–ª–∏[0m –¥–µ—è–∫—ñ [91m[1m–ø—Ä–æ–±–ª–µ–º–∏[0m [91m[1m—è–∫—â–æ[0m [91m[1m–º–æ–∂–Ω–∞[0m —è –∑–∞–≤—Ç—Ä–∞ –±—É–¥—É –Ω–∞ —Ä–æ–±–æ—á–æ–º—É [91m[1m–º—ñ—Å—Ç—ñ[0m [91m[1m?[0m

Corrected Text:
–î–æ–±—Ä–∏–π [92m[1m–¥–µ–Ω—å![0m [92m[1m–Ø[0m –≤–∞–º [92m[1m–ø–∏—Å–∞–≤,[0m [92m[1m—É[0m –º–µ–Ω–µ [92m[1m–≤–∏–Ω–∏–∫–ª–∏[0m –¥–µ—è–∫—ñ [92m[1m–ø—Ä–æ–±–ª–µ–º–∏.[0m [92m[1m–Ø–∫—â–æ[0m [92m[1m–º–æ–∂–Ω–∞,[0m —è –∑–∞–≤—Ç—Ä–∞ –±—É–¥—É –Ω–∞ —Ä–æ–±–æ—á–æ–º—É [92m[1m–º—ñ—Å—Ü—ñ?[0m

Reasoning:
–î–æ–¥–∞–Ω–æ –∑–Ω–∞–∫ –æ–∫–ª–∏–∫—É –ø—ñ—Å–ª—è '–î–æ–±—Ä–∏–π –¥–µ–Ω—å' –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–Ω—è –ø—Ä–∏–≤—ñ—Ç–∞–Ω–Ω—è; –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–æ '–≤–∏–Ω–µ–∫–ª–∏' –Ω–∞ '–≤–∏–Ω–∏–∫–ª–∏' –¥–ª—è –ø—Ä–∞–≤–∏–ª—å–Ω–æ–≥–æ –Ω–∞–ø–∏—Å–∞–Ω–Ω—è; –¥–æ–¥–∞–Ω–æ –∫–æ–º–∏ –¥–ª—è —Ä–æ–∑–¥—ñ–ª–µ–Ω–Ω—è —á–∞—Å—Ç–∏–Ω —Ä–µ—á–µ–Ω–Ω

In [75]:


correction_aggregation_prompt = gec_aggregation_prompt_per_language[language].prompt_template

correction_aggregation_prompt_formatted = correction_aggregation_prompt.format(
    text=text,
    possible_corrections=str([output["correction"] for output in multi_gec_output]),
    num_corrections=parameters.multi_gec.num_corrections
)
correction_aggregation_prompt_formatted

'–ê–≥—Ä–µ–≥—É–π—Ç–µ –∑–∞–ø—Ä–æ–ø–æ–≤–∞–Ω—ñ –≥—Ä–∞–º–∞—Ç–∏—á–Ω—ñ –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–Ω—è —Ç–µ–∫—Å—Ç—É —É —Ñ—ñ–Ω–∞–ª—å–Ω–∏–π –≥—Ä–∞–º–∞—Ç–∏—á–Ω–æ –ø—Ä–∞–≤–∏–ª—å–Ω–∏–π —Ç–µ–∫—Å—Ç.\n–ó –æ—Ä–∏–≥—ñ–Ω–∞–ª—å–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç—É —Ç–∞ —Å–ø–∏—Å–∫—É –∑ 3 –≤–∞—Ä—ñ–∞–Ω—Ç—ñ–≤ –π–æ–≥–æ –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–Ω—è, –æ–±\'—î–¥–Ω–∞–π –≤—Å—ñ –∫–æ—Ä–∏—Å–Ω—ñ –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–Ω—è –∑ —Ü–∏—Ö –≤–∞—Ä—ñ–∞–Ω—Ç—ñ–≤ —Ç–∞ —Å—Ç–≤–æ—Ä–∏ —Ñ—ñ–Ω–∞–ª—å–Ω–∏–π –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–∏–π —Å–∏–Ω—Ç–∞–∫—Å–∏—á–Ω–æ –∫–æ—Ä–µ–∫—Ç–Ω–∏–π —Ç–µ–∫—Å—Ç.\n–£—Å—É–Ω—å –æ—Ä—Ñ–æ–≥—Ä–∞—Ñ—ñ—á–Ω—ñ, –ø—É–Ω–∫—Ç—É–∞—Ü—ñ–π–Ω—ñ, —Å—Ç–∏–ª—ñ—Å—Ç–∏—á–Ω—ñ, –≥—Ä–∞–º–∞—Ç–∏—á–Ω—ñ, –ª–µ–∫—Å–∏—á–Ω—ñ —Ç–∞ —Å–∏–Ω—Ç–∞–∫—Å–∏—á–Ω—ñ –ø–æ–º–∏–ª–∫–∏.\n–î–æ–±–∞–≤ –ø–æ—è—Å–Ω–µ–Ω–Ω—è –¥–æ –∞–≥—Ä–µ–≥–æ–≤–∞–Ω–∏—Ö –≤–∏–ø—Ä–∞–≤–ª–µ–Ω—å —Ç–∞ –≥—Ä–∞–º–∞—Ç–∏—á–Ω–æ—ó –∫–æ—Ä–µ–∫—Ü—ñ—ó —Ç–µ–∫—Å—Ç—É.\n–°—Ñ–æ–∫–æ—Å—É–π—Å—è –Ω–∞ –∑–∞–ø—Ä–æ–ø–æ–Ω–æ–≤–∞–Ω–∏—Ö –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–Ω—è—Ö, —Ç–∞ –Ω–µ —Å—Ç–∞—Ä–∞–π—Å—è –≤–∏–ø—Ä–∞–≤–∏—Ç–∏ –æ—Ä–∏–≥—ñ–Ω–∞–ª—å–Ω–∏–π —Ç–µ–∫—Å—Ç —Å–∞–º–æ—Ç—É–∂–∫–∏.\n–Ø–∫—

In [76]:
gec_aggregation_prompt_tokens = len(tokenizer.encode(correction_aggregation_prompt.template))
gec_aggregation_prompt_formatted_tokens = len(tokenizer.encode(correction_aggregation_prompt_formatted))
gec_aggregation_prompt_tokens, gec_aggregation_prompt_formatted_tokens

(316, 439)

In [77]:
client = OpenAI(api_key=os.environ.get("OPEN_AI_API_KEY"))
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": correction_aggregation_prompt_formatted,
        }
    ],
    temperature=parameters.gec_aggregation.temperature,
    top_p=parameters.gec_aggregation.top_p,
    model=parameters.gec_aggregation.model_name,
)
original_text = text
gec_aggregation_raw_output = chat_completion.choices[0].message.content
import json
pattern = r'```json\s*\n(?P<json>([\s\S]*?))\n```'
matches = re.finditer(pattern, gec_aggregation_raw_output, re.MULTILINE)
for match in matches:
    json_content = match.group('json')
    try:
        gec_aggregation_output = json.loads(json_content)
        print(gec_aggregation_output)
    except json.JSONDecodeError as e:
        print("Invalid JSON:", e)
else:
    gec_aggregation_output = ast.literal_eval(gec_aggregation_raw_output.strip("```json"))

gec_aggregation_output_tokens = len(tokenizer.encode(gec_aggregation_raw_output))
gec_aggregation_correction_output_tokens = len(tokenizer.encode(str(gec_aggregation_output["correction"])))
gec_aggregation_explanation_output_tokens = len(tokenizer.encode(str(gec_aggregation_output["explanation"])))


In [78]:
gec_aggregation_output["correction"]

'–î–æ–±—Ä–∏–π –¥–µ–Ω—å! –Ø –ø–∏—Å–∞–≤ –≤–∞–º, —É –º–µ–Ω–µ –≤–∏–Ω–∏–∫–ª–∏ –¥–µ—è–∫—ñ –ø—Ä–æ–±–ª–µ–º–∏. –Ø–∫—â–æ –º–æ–∂–ª–∏–≤–æ, —è –∑–∞–≤—Ç—Ä–∞ –±—É–¥—É –Ω–∞ —Ä–æ–±–æ—á–æ–º—É –º—ñ—Å—Ü—ñ.'

In [79]:
correction = gec_aggregation_output["correction"]
reasoning = gec_aggregation_output["explanation"]

original_text = text

print("\nSentence #", i)
text1 = normalize_spaces(original_text)
text2 = normalize_spaces(correction)

original_corrected_text = generate_original_corrected_texts(
        original_text=original_text,
        corrected_text=text2)

print("Original Text:")
print(original_corrected_text[0])
print()

print("Corrected Text:")
print(original_corrected_text[1])
print()

print("Reasoning:")
print(reasoning)
print()



Sentence # 2
Original Text:
–î–æ–±—Ä–∏–π [91m[1m–¥–µ–Ω—å[0m —è [91m[1m–≤–∞–º[0m [91m[1m–±—É–≤[0m –ø–∏—Å–∞–≤ [91m[1m–≤[0m –º–µ–Ω–µ [91m[1m–±—É–ª–∏[0m [91m[1m–≤–∏–Ω–µ–∫–ª–∏[0m –¥–µ—è–∫—ñ [91m[1m–ø—Ä–æ–±–ª–µ–º–∏[0m [91m[1m—è–∫—â–æ[0m [91m[1m–º–æ–∂–Ω–∞[0m —è –∑–∞–≤—Ç—Ä–∞ –±—É–¥—É –Ω–∞ —Ä–æ–±–æ—á–æ–º—É [91m[1m–º—ñ—Å—Ç—ñ[0m [91m[1m?[0m

Corrected Text:
–î–æ–±—Ä–∏–π [92m[1m–¥–µ–Ω—å![0m [92m[1m–Ø[0m –ø–∏—Å–∞–≤ [92m[1m–≤–∞–º,[0m [92m[1m—É[0m –º–µ–Ω–µ [92m[1m–≤–∏–Ω–∏–∫–ª–∏[0m –¥–µ—è–∫—ñ [92m[1m–ø—Ä–æ–±–ª–µ–º–∏.[0m [92m[1m–Ø–∫—â–æ[0m [92m[1m–º–æ–∂–ª–∏–≤–æ,[0m —è –∑–∞–≤—Ç—Ä–∞ –±—É–¥—É –Ω–∞ —Ä–æ–±–æ—á–æ–º—É [92m[1m–º—ñ—Å—Ü—ñ.[0m

Reasoning:
–í–∏–ø—Ä–∞–≤–ª–µ–Ω–Ω—è –≤–∫–ª—é—á–∞—î –∫–æ—Ä–µ–∫—Ç–Ω–µ –≤–∏–∫–æ—Ä–∏—Å—Ç–∞–Ω–Ω—è —Ä–æ–∑–¥—ñ–ª–æ–≤–∏—Ö –∑–Ω–∞–∫—ñ–≤, –∑–æ–∫—Ä–µ–º–∞, –¥–æ–¥–∞–≤–∞–Ω–Ω—è –∑–Ω–∞–∫–∞ –æ–∫–ª–∏–∫—É –ø—ñ—Å–ª—è '–î–æ–±—Ä–∏–π –¥–µ–Ω—å' —Ç–∞ –∫–æ–º –ø—ñ—Å–ª—è '—è–∫—â–æ –º–æ–∂–ª–∏–≤–æ'. –°–ª–æ–≤–æ '–≤–∏–Ω–µ–∫–ª–∏' –≤–∏–ø—Ä–∞–≤–ª–µ

In [80]:
clip_text_in_run_name_in_chars: int = 30

run_name = f"{parameters.experiment.run_name} {text[:clip_text_in_run_name_in_chars] + '...' if len(text) > clip_text_in_run_name_in_chars else text}"

with mlflow.start_run(run_name=run_name) as run:
    # region Log Outputs and Parameters
    mlflow.log_params(flatten(parameters, reducer="dot"))
    gec_aggregation_html_comparison_file_name = save_comparison_to_html(
        original_text,
        correction,
        reasoning,
        "gec_aggregation.output_comparison.html",
    )
    mlflow.log_artifact(gec_aggregation_html_comparison_file_name)

    mlflow.log_artifact(multi_gec_correction_comparison_file_name)
    # endregion

    # region Log texts and results
    mlflow.log_text(text1, artifact_file="original_text.txt")
    mlflow.log_text(text2, artifact_file="corrected_text.txt")
    mlflow.log_text(reasoning, artifact_file="reasoning.txt")
    # endregion

    # region Log prompts
    mlflow.log_text(
        grammar_correction_prompt_formatted,
        artifact_file="multi_gec.prompt_formatted.txt")
    mlflow.log_text(
        grammar_correction_prompt.template,
        artifact_file="multi_gec.prompt.txt")
    mlflow.log_text(
        correction_aggregation_prompt_formatted,
        artifact_file="gec_aggregation.prompt_formatted.txt"
    )
    mlflow.log_text(
        correction_aggregation_prompt.template,
        artifact_file="gec_aggregation.prompt.txt"
    )
    # endregion

    # region Log prompt tokens
    mlflow.log_metric(
        key="multi_gec.prompt_tokens",
        value=grammar_correction_prompt_tokens
    )
    mlflow.log_metric(
        key="multi_gec.prompt_formatted_tokens",
        value=grammar_correction_prompt_formatted_tokens
    )
    mlflow.log_metric(
        key="gec_aggregation.prompt_tokens",
        value=grammar_correction_prompt_tokens
    )
    mlflow.log_metric(
        key="gec_aggregation.prompt_formatted_tokens",
        value=grammar_correction_prompt_formatted_tokens
    )
    # endregion

    # region Log metrics
    edit_distance = nltk.edit_distance(
        original_text,
        correction
    )
    avg_edit_distance = average_edit_distance(
        original_text,
        correction,
    )
    mlflow.log_metric(
        key="gec_aggregation.edit_distance",
        value=edit_distance,
    )
    mlflow.log_metric(
        key="gec_aggregation.avg_edit_distance",
        value=avg_edit_distance,
    )
    for i, output in enumerate(multi_gec_output):
        multi_gec_correction = output["correction"]

        edit_distance = nltk.edit_distance(
            original_text,
            multi_gec_correction
        )
        avg_edit_distance = average_edit_distance(
            original_text,
            multi_gec_correction,
        )
        mlflow.log_metric(
            key=f"multi_gec_correction.{i}.edit_distance",
            value=edit_distance,
        )
        mlflow.log_metric(
            key=f"multi_gec_correction.{i}.avg_edit_distance",
            value=avg_edit_distance,
        )

    mlflow.log_metric(
        key="multi_gec.input_text_tokens",
        value=input_text_tokens
    )
    mlflow.log_metric(
        key="multi_gec.output_tokens",
        value=multi_gec_output_tokens
    )
    mlflow.log_metric(
        key="multi_gec.correction_output_tokens",
        value=multi_gec_correction_output_tokens
    )
    mlflow.log_metric(
        key="multi_gec.explanation_output_tokens",
        value=multi_gec_explanation_output_tokens
    )
    mlflow.log_metric(
        key="gec_aggregation.output_tokens",
        value=gec_aggregation_output_tokens
    )
    mlflow.log_metric(
        key="gec_aggregation.correction_output_tokens",
        value=gec_aggregation_correction_output_tokens
    )
    mlflow.log_metric(
        key="gec_aggregation.explanation_output_tokens",
        value=gec_aggregation_explanation_output_tokens
    )
    # endregion


HTML file with colored output saved as gec_aggregation.output_comparison.html
HTML file with colored output saved as multi_gec.output_comparison.html


2024/11/11 14:31:59 INFO mlflow.tracking._tracking_service.client: üèÉ View run experiment –î–æ–±—Ä–∏–π –¥–µ–Ω—å —è –≤–∞–º –±—É–≤ –ø–∏—Å–∞–≤ –≤ ... at: http://127.0.0.1:5000/#/experiments/875882461670179036/runs/fd0ffef943cb46e394a9c4aa548e50d2.
2024/11/11 14:31:59 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/875882461670179036.
