In [1]:
import ast
import os
import re
import json
import nltk
import mlflow
import pandas as pd
import tiktoken
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from omegaconf import OmegaConf
from openai import OpenAI
from flatten_dict import flatten

from src.prompts.ubertext import (
    MultiGrammarErrorCorrectionGenerationPrompt,
    GrammarErrorCorrectionAggregationPrompt
)
from src.utils.openai_batch_utils import retrieve_openai_batch
from src.utils.openai_batch_utils import submit_openai_batch
from src.utils.utils import get_multi_gec_correction_comparison_text
from src.utils.metrics import average_edit_distance
from src.utils.utils import normalize_spaces, generate_original_corrected_texts
from src.utils.comparison_to_html import save_comparison_to_html, save_ansi_to_html

In [2]:
# Notebook display settings: no truncation of cell text, no hidden columns,
# and at most 30 rows rendered per frame.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.options.display.max_rows = 30

In [3]:
# Load environment variables from the .env file located three directories up
# (relative to the working directory); the cell output is the call's return value.
load_dotenv(dotenv_path="../../../.env")

True

In [5]:
# Read the raw UberText GEC table, using the CSV's first column as the index,
# and show the frame as the cell output.
RAW_UBERTEXT_CSV = "../../../datasets/ubertext_gec/raw_ubertext_gec.csv"
ubertext_df = pd.read_csv(RAW_UBERTEXT_CSV, index_col=0)
ubertext_df

Unnamed: 0,tokens,characters,words,sentences,sentence
0,512,1382,181,17,"üî¥–û–ø–µ—Ä–∞—Ç–∏–≤–Ω–µ –∑–≤–µ–¥–µ–Ω–Ω—è –ì–µ–Ω—à—Ç–∞–±—É —Å—Ç–∞–Ω–æ–º –Ω–∞ 00:00 18 –±–µ—Ä–µ–∑–Ω—è. \n–ì–æ–ª–æ–≤–Ω–µ: üîª–û–∫—É–ø–∞–Ω—Ç–∏ —Ç–∏–º—á–∞—Å–æ–≤–æ –ø–æ–∑–±–∞–≤–∏–ª–∏ –£–∫—Ä–∞—ó–Ω—É –≤–∏—Ö–æ–¥—É –¥–æ –ê–∑–æ–≤—Å—å–∫–æ–≥–æ –º–æ—Ä—è. \n–ù–∞ –ø—ñ–¥—Å—Ç—É–ø–∞—Ö –¥–æ –ú–∏–∫–æ–ª–∞—î–≤–∞ –≤–æ—Ä–æ–≥–∞ –∑—É–ø–∏–Ω–∏–ª–∏ —Å–ø—ñ–ª—å–Ω–∏–º–∏ –¥—ñ—è–º–∏ –ø—ñ–¥—Ä–æ–∑–¥—ñ–ª—ñ–≤ –°–∏–ª –æ–±–æ—Ä–æ–Ω–∏. \nüîª–ß–µ—Ä–µ–∑ –ø—Ä–æ–≤–∞–ª —É –ø—Ä–æ—Å—É–≤–∞–Ω–Ω—ñ –≤—ñ–π—Å—å–∫ –æ–∫—É–ø–∞–Ω—Ç–∏ –∑–∞–≤–¥–∞—é—Ç—å —Ä–∞–∫–µ—Ç–Ω–æ-–±–æ–º–±–æ–≤–∏—Ö —É–¥–∞—Ä—ñ–≤ –ø–æ –º–∏—Ä–Ω–æ–º—É –Ω–∞—Å–µ–ª–µ–Ω–Ω—é. \n–¢—ñ–ª—å–∫–∏ –ø—Ä–æ—Ç—è–≥–æ–º –¥–æ–±–∏ 18 –±–µ—Ä–µ–∑–Ω—è –æ–∫—É–ø–∞–Ω—Ç–∏ –Ω–∞–Ω–µ—Å–ª–∏ 4 —Ä–∞–∫–µ—Ç–Ω–∏—Ö —É–¥–∞—Ä–∏ (14 —Ä–∞–∫–µ—Ç) —Ç–∞ –∑–¥—ñ–π—Å–Ω–∏–ª–∏ –ø–æ–Ω–∞–¥ 40 –∞–≤—ñ–∞–Ω–∞–ª—å–æ—Ç—ñ–≤. \nüîª–ó–∞–≥–∞—Ä–±–Ω–∏–∫–∏ –Ω–µ –ø–æ–ª–∏—à–∞—é—Ç—å —Å–ø—Ä–æ–± –≤—ñ–¥–Ω–æ–≤–∏—Ç–∏ –ø—Ä–æ—Å—É–≤–∞–Ω–Ω—è –≤ –±—ñ–∫ –ö–∏—î–≤–∞. \n–í–µ–¥—É—Ç—å –ø–æ–≤—ñ—Ç—Ä—è–Ω—É —Ä–æ–∑–≤—ñ–¥–∫—É –¥—Ä–æ–Ω–∞–º–∏. \nüîª–ü—Ä–æ–¥–æ–≤–∂—É—é—Ç—å—Å—è —Å–ø—Ä–æ–±–∏ –±–ª–æ–∫—É–≤–∞–Ω–Ω—è –º—ñ—Å—Ç –°—É–º–∏ —Ç–∞ –•–∞—Ä–∫–æ–≤–∞, –∞ —Ç–∞–∫–æ–∂ –Ω–∞–º–∞–≥–∞–Ω–Ω—è –≤—ñ–¥–Ω–æ–≤–∏—Ç–∏ –Ω–∞—Å—Ç—É–ø —É –Ω–∞–ø—Ä—è–º–∫—É –ü–æ–∫—Ä–æ–≤—Å—å–∫–∞. \n–û–∫—É–ø–∞–Ω—Ç–∏ –ø–æ—Å–∏–ª–∏–ª–∏ —É–≥—Ä—É–ø–æ–≤–∞–Ω–Ω—è –∑–∞ —Ä–∞—Ö—É–Ω–æ–∫ –≤–≤–µ–¥–µ–Ω–Ω—è –∑ —Ç–µ—Ä–∏—Ç–æ—Ä—ñ—ó —Ä—Ñ –¥–≤–æ—Ö –ø—ñ–¥—Ä–æ–∑–¥—ñ–ª—ñ–≤ –∑—ñ —Å–∫–ª–∞–¥—É 6-—ó –∑–∞–≥–∞–ª—å–Ω–æ–≤—ñ–π—Å—å–∫–æ–≤–æ—ó –∞—Ä–º—ñ—ó —Ç–∞ 1-—ó —Ç–∞–Ω–∫–æ–≤–æ—ó –∞—Ä–º—ñ—ó –ó–∞—Ö—ñ–¥–Ω–æ–≥–æ –≤—ñ–π—Å—å–∫–æ–≤–æ–≥–æ –æ–∫—Ä—É–≥—É. \nüîª–ù–∞ –õ—É–≥–∞–Ω—Å—å–∫–æ–º—É –Ω–∞–ø—Ä—è–º–∫—É –∑–∞–≥–∞—Ä–±–Ω–∏–∫–∏ –Ω–∞–º–∞–≥–∞—é—Ç—å—Å—è –≤—Å—Ç–∞–Ω–æ–≤–∏—Ç–∏ –∫–æ–Ω—Ç—Ä–æ–ª—å –Ω–∞–¥ –Ω–∞—Å–µ–ª–µ–Ω–∏–º–∏ –ø—É–Ω–∫—Ç–∞–º–∏ –†—É–±—ñ–∂–Ω–µ —Ç–∞ –ü–æ–ø–∞—Å–Ω–∞, –∞ —Ç–∞–∫–æ–∂ –±–ª–æ–∫—É–≤–∞—Ç–∏ –°—î–≤–µ—Ä–æ–¥–æ–Ω–µ—Ü—å–∫. 
\nüîª–í —Ä–∞–π–æ–Ω—ñ –†—É–±—ñ–∂–Ω–æ–≥–æ –≤–æ—Ä–æ–≥ –∑–∞–∫—Ä—ñ–ø–∏–≤—Å—è –Ω–∞ –∑–∞—Ö—ñ–¥–Ω–∏—Ö —Ç–∞ –ø—ñ–≤–Ω—ñ—á–Ω–æ-–∑–∞—Ö—ñ–¥–Ω–∏—Ö –æ–∫–æ–ª–∏—Ü—è—Ö –º—ñ—Å—Ç–∞, —Ä–æ–±–∏—Ç—å –Ω–µ–≤–¥–∞–ª—ñ —Å–ø—Ä–æ–±–∏ –∑–¥—ñ–π—Å–Ω–∏—Ç–∏ –≤–∏—Ö—ñ–¥ –¥–æ –ø—ñ–≤–¥–µ–Ω–Ω–æ—ó —á–∞—Å—Ç–∏–Ω–∏ –º—ñ—Å—Ç–∞. \n–¢—Ä–∏–≤–∞—é—Ç—å –±–æ—ó –∑–∞ –º—ñ—Å—Ç–æ –ü–æ–ø–∞—Å–Ω–∞. \nüîª–ù–∞ –î–æ–Ω–µ—Ü—å–∫–æ–º—É –Ω–∞–ø—Ä—è–º–∫—É –æ–∫—É–ø–∞–Ω—Ç–∏ –≤–µ–¥—É—Ç—å –±–æ–π–æ–≤—ñ –¥—ñ—ó –≤ —Ä–∞–π–æ–Ω–∞—Ö –í–µ—Ä—Ö–Ω—å–æ—Ç–æ—Ä–µ—Ü—å–∫–æ–≥–æ, –ö—Ä–∏–º—Å—å–∫–æ–≥–æ, –ê–≤–¥—ñ—ó–≤–∫–∏, –¢–∞—Ä–∞–º—á—É–∫–∞. \n–¢—Ä–∏–≤–∞—î –±–ª–æ–∫–∞–¥–∞ —Ç–∞ —Å–ø—Ä–æ–±–∏ —à—Ç—É—Ä–º—É –ú–∞—Ä—ñ—É–ø–æ–ª—è. \n–ó–±–µ—Ä—ñ–≥–∞—î–º–æ —Å–ø–æ–∫—ñ–π! \n–†–∞–∑–æ–º –ø–µ—Ä–µ–º–æ–∂–µ–º–æ! \n–°–ª–∞–≤–∞ –£–∫—Ä–∞—ó–Ω—ñ! \n#stoprussia"
1,75,154,22,0,"üí™üèª –ó–±—Ä–æ–π–Ω—ñ –°–∏–ª–∏ –£–∫—Ä–∞—ó–Ω–∏ –≤–ø—Ä–æ–¥–æ–≤–∂ –º–∏–Ω—É–ª–æ—ó –¥–æ–±–∏, 18 –±–µ—Ä–µ–∑–Ω—è, –∑–Ω–∏—â–∏–ª–∏ 12 –≤–æ—Ä–æ–∂–∏—Ö –ø–æ–≤—ñ—Ç—Ä—è–Ω–∏—Ö —Ü—ñ–ª–µ–π : ‚ñ´Ô∏è2 –ª—ñ—Ç–∞–∫–∞ ‚ñ´Ô∏è3 –≤–µ—Ä—Ç–æ–ª—å–æ—Ç–∏ ‚ñ´Ô∏è3 –ë–ø–õ–ê ‚ñ´Ô∏è4 –∫—Ä–∏–ª–∞—Ç—ñ —Ä–∞–∫–µ—Ç–∏"
2,291,775,111,7,"–ù–∞—Ü–≥–≤–∞—Ä–¥—ñ–π—Ü—ñ –∑–∞—Ö–æ–ø–∏–ª–∏ —Ç–µ—Ö–Ω—ñ–∫—É –∑–∞–≥–∞—Ä–±–Ω–∏–∫—ñ–≤ –Ω–∞ –õ—É–≥–∞–Ω—â–∏–Ω—ñ ""–ì–∞—Ä–Ω—ñ –Ω–æ–≤–∏–Ω–∏ –Ω–∞ –Ω—ñ—á. \n–ì–≤–∞—Ä–¥—ñ–π—Ü—ñ –∑–∞—Ö–æ–ø–∏–ª–∏ —Ç–µ—Ö–Ω—ñ–∫—É –æ–∫—É–ø–∞–Ω—Ç—ñ–≤ –Ω–∞ –õ—É–≥–∞–Ω—â–∏–Ω—ñ. \n–†–æ—Å—ñ–π—Å—å–∫—ñ –æ–∫—É–ø–∞–Ω—Ç–∏ –∑–∞–∑–Ω–∞–ª–∏ –≤—Ç—Ä–∞—Ç –ø—ñ–¥ –ö—Ä–µ–º—ñ–Ω–Ω–æ—é. \n–ù–∞—Ü–≥–≤–∞—Ä–¥—ñ—è –≤–∏—è–≤–∏–ª–∞ –¥–≤—ñ –±–æ–π–æ–≤—ñ –º–∞—à–∏–Ω–∏ –ø—ñ—Ö–æ—Ç–∏ —Ä–æ—Å—ñ–π—Å—å–∫–∏—Ö –∑–∞–≥–∞—Ä–±–Ω–∏–∫—ñ–≤ —Ç–∞ –∑–Ω–∏—â–∏–ª–∞ –∑ –≥—Ä–∞–Ω–∞—Ç–æ–º–µ—Ç–∞ –æ–¥–Ω—É –ë–ú–ü-1"", ‚Äî –ø–æ–≤—ñ–¥–æ–º–∏–≤ –∫–µ—Ä—ñ–≤–Ω–∏–∫ –õ—É–≥–∞–Ω—Å—å–∫–æ—ó –û–í–ê –°–µ—Ä–≥—ñ–π –ì–∞–π–¥–∞–π. \n–í—ñ–Ω —Ä–æ–∑–ø–æ–≤—ñ–≤, —â–æ –ø—ñ–¥ —á–∞—Å –±–æ–π–æ–≤–æ–≥–æ –∑—ñ—Ç–∫–Ω–µ–Ω–Ω—è –µ–∫—ñ–ø–∞–∂ —ñ–Ω—à–æ—ó –≤–æ—Ä–æ–∂–æ—ó –ë–ú–ü-1 –Ω–µ –≤–∏—Ç—Ä–∏–º–∞–≤ –≤–æ–≥–Ω–µ–≤–æ–≥–æ –∫–æ–Ω—Ç–∞–∫—Ç—É —Ç–∞ –≤—Ç—ñ–∫, –ø–æ–∫–∏–Ω—É–≤—à–∏ –∑–±—Ä–æ—é –π —Ç–µ—Ö–Ω—ñ–∫—É. \n–ù–∞—à–∏–º –±—ñ–π—Ü—è–º –¥—ñ—Å—Ç–∞–ª–∞—Å—è —Ç—Ä–æ—Ñ–µ–π–Ω–∞ –±–æ–π–æ–≤–∞ –º–∞—à–∏–Ω–∞ –ø—ñ—Ö–æ—Ç–∏ –ë–ú–ü-1 —É —Ä–æ–±–æ—á–æ–º—É —Å—Ç–∞–Ω—ñ –∑ –ø–æ–≤–Ω–∏–º –ë–ö, –ø—Ä–∏–¥–∞—Ç–Ω–∞ –¥–ª—è –≤–∏–∫–æ–Ω–∞–Ω–Ω—è –±–æ–π–æ–≤–∏—Ö –∑–∞–≤–¥–∞–Ω—å —ñ –ª—ñ–∫–≤—ñ–¥–∞—Ü—ñ—ó —ó—ó –∫–æ–ª–∏—à–Ω—ñ—Ö –≥–æ—Å–ø–æ–¥–∞—Ä—ñ–≤. \n–ü—ñ–¥ —á–∞—Å –æ–≥–ª—è–¥—É –∑–∞–ª–∏—à–µ–Ω–æ—ó –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ü—ñ—ó –≤ –ë–ú–ü-1 —Å—Ç–∞–ª–æ –≤—ñ–¥–æ–º–æ, —â–æ –≤ –Ω—å–æ–º—É –ø–µ—Ä–µ–±—É–≤–∞–≤ –∫–æ–º–∞–Ω–¥–∏—Ä –æ–¥–Ω–æ–≥–æ –∑ –ø—ñ–¥—Ä–æ–∑–¥—ñ–ª—ñ–≤ —Ç–∞–∫ –∑–≤–∞–Ω–æ—ó ""–õ–ù–†""."
3,27,78,10,0,"üòÑ –í—ñ—Ä–º–µ–Ω—ñ—è, –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω —Ç–∞ –ö–∏—Ä–≥–∏–∑—Å—Ç–∞–Ω –≤–∏–º–∞–≥–∞—é—Ç—å –≤—ñ–¥ —Ä—Ñ –ø–ª–∞—Ç–∏—Ç–∏–º–∏—Ç–æ –≤ –¥–æ–ª–∞—Ä–∞—Ö"
4,39,102,14,0,–ú—ñ–Ω—ñ—Å—Ç—Ä–∏ –∫—É–ª—å—Ç—É—Ä–∏ –Ñ–°–ø—Ä–æ—Å—è—Ç—å –Æ–ù–ï–°–ö–û –ø–µ—Ä–µ–Ω–µ—Å—Ç–∏ 45 —Å–µ—Å—ñ—é –ö–æ–º—ñ—Ç–µ—Ç—É –≤—Å–µ—Å–≤—ñ—Ç–Ω—å–æ—ó —Å–ø–∞–¥—â–∏–Ω–∏ –∑ —Ä—Ñ –¥–æ –õ—å–≤–æ–≤–∞
...,...,...,...,...,...
719634,299,704,83,2,"üñïüá∑üá∫–ê–≤—ñ–∞—Ü—ñ—è —Ä—Ñ –∫—Ä—É–∂–ª—è—î –Ω–∞–¥ –ë–µ—Ä–¥—è–Ω—Å—å–∫–æ–º üá∫üá¶ üê• @proberdiansk_b ot ¬´–ë–∞–≤–æ–≤–Ω–∞¬ª—É–ú–µ–ª—ñ—Ç–æ–ø–æ–ª—ñ—Ç–∞ –¢–æ–∫–º–∞–∫—É –ø–æ–∂–≤–∞–≤–∏–ª–∞ —Ä–æ—Å—ñ–π—Å—å–∫–∏—Ö —Ç–µ—Ä–æ—Ä–∏—Å—Ç—ñ–≤ –ø–æ –≤—Å—ñ–π —Ç–∏–º—á–∞—Å–æ–≤–æ –æ–∫—É–ø–æ–≤–∞–Ω–æ–Ω—ñ–π —Ç–µ—Ä–∏—Ç–æ—Ä—ñ—ó –£–∫—Ä–∞—ó–Ω–∏ 8 —Å—ñ—á–Ω—è 2023 —Ä. —Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–Ω—ñ —Ç–∞ –≤—ñ–π—Å—å–∫–æ–≤—ñ –≥–µ–ª—ñ–∫–æ–ø—Ç–µ—Ä–∏ –≤–µ—Å—å –¥–µ–Ω—å –ª–µ—Ç—è—Ç—å —É –±—ñ–∫ –ë–µ—Ä–¥—è–Ω—Å—å–∫–æ—ó –∫–æ—Å–∏, –∫—É–¥–∏ —Å–∫–æ—Ä—ñ—à –∑–∞ –≤—Å–µ, –≤ –æ—Ä–≥–∞–Ω—ñ–∑–æ–≤–∞–Ω—ñ –∑–∞–∑–¥–∞–ª–µ–≥—ñ–¥—å —à–ø–∏—Ç–∞–ª—ñ, –∑–≤–æ–∑—è—Ç—å –¥–≤—É—Ö—Å–æ—Ç–∏—Ö. \n‚ùóÔ∏è–ë–µ—Ä–¥—è–Ω—Ü—ñ –æ—á—ñ–∫—É—é—Ç—å —ñ –Ω–∞ –ø–æ—Å–∏–ª–µ–Ω–Ω—è —Ä–µ–ø—Ä–µ—Å—ñ–π –∑ –±–æ–∫—É —Ä–æ—Å—ñ–π—Å—å–∫–∏—Ö —Ç–µ—Ä–æ—Ä–∏—Å—Ç—ñ–≤ –í—Ç—ñ–º, —Ä–∞—à–∏—Å—Ç–∏ –∑ –ø–µ—Ä—à–∏—Ö –¥–Ω—ñ–≤ –æ–∫—É–ø–∞—Ü—ñ—ó —ñ –¥–æ —Å—å–æ–≥–æ–¥–Ω—ñ –Ω–µ –ø—Ä–∏–ø–∏–Ω—è–ª–∏ —Ä—É—Å–∏—Ñ—ñ–∫–∞—Ü—ñ—é, —Ç–∏—Å–∫ —ñ –∑–∞–ª—è–∫—É–≤–∞–Ω–Ω—è –ª—é–¥–µ–π –ë–µ—Ä–¥—è–Ω—Å–¨–∫–∞ üê•@proberdiansk_bot #–±–µ—Ä–¥—è–Ω—Å–∫ [ –±–µ—Ä–¥—è–Ω—Å—å–∫]( #–±–µ—Ä–¥—è–Ω—Å—å–∫) #–±–µ—Ä–¥—è–Ω—Å—å–∫—Ü–µ—É–∫—Ä–∞—ó–Ω–∞ [ –º–µ–ª—ñ—Ç–æ–ø–æ–ª—å]( #–º–µ–ª—ñ—Ç–æ–ø–æ–ª—å) #–º–µ–ª–∏—Ç–æ–ø–æ–ª—å [ #–∑–∞–ø–æ—Ä—ñ–∂–∂—è]( –∑–∞–ø–æ—Ä—ñ–∂–∂—è) #–æ–∫—É–ø–∞—Ü—ñ—è #–æ–∫–∫—É–ø–∞—Ü–∏—è"
719635,109,292,45,4,"üî•–ú—ñ—Å—Ü–µ–≤—ñ –ó–ú–Ü –ú–µ–ª—ñ—Ç–æ–ø–æ–ª—è –ø–æ–≤—ñ–¥–æ–º–ª—è—é—Ç—å —â–æ –≤–∂–µ –≤–¥—Ä—É–≥–µ –∑–∞ –æ—Å—Ç–∞–Ω–Ω—ñ –ø—ñ–≤ –≥–æ–¥–∏–Ω–∏ –º—ñ—Å—Ç–æ —á—É—î –≥—É—á–Ω—ñ –≤–∏–±—É—Ö–∏. \n–¢–∞–∫–æ–∂ —É –Ω–µ–±–æ –ø—ñ–¥–Ω—è–ª–∞—Å—å –∞–≤—ñ–∞—Ü—ñ—è —Ä–∞—à–∏—Å—Ç—ñ–≤. \n–í—Å–µ —Ü–µ –ø—ñ—Å–ª—è —Ç–æ–≥–æ, —è–∫ –≤–Ω–æ—á—ñ –¥–æ–≤–≥–æ –≥–æ—Ä—ñ–≤ —Ç–∞ –¥–µ—Ç–æ–Ω—É–≤–∞–≤ —Å–∫–ª–∞–¥ –ë–ö —Ä–∞—à–∏—Å—Ç—ñ–≤, —è–∫–∏–π –≤–æ–Ω–∏ –≤–ª–∞—à—Ç—É–≤–∞–ª–∏ –Ω–∞ –∑–∞–≤–æ–¥—ñ –ì—ñ–¥—Ä–æ–º–∞—à. \n–§–æ—Ç–æ: –Ω—ñ—á–Ω–æ—ó –±–∞–≤–æ–≤–Ω–∏ —É –ú–µ–ª—ñ—Ç–æ–ø–æ–ª—ñ."
719636,111,250,30,0,"üá∫üá¶¬´–†–æ–º–µ–æ¬ª ‚Äì –ø—ñ–¥–ø—Ä–∏—î–º–µ—Ü—å, –ª—é–±–∏—Ç–µ–ª—å –º–æ—Ä—è —ñ —Ä–∏–±–∞–ª–∫–∏, —Ç–µ–ø–µ—Ä ‚Äì —Å–∞–ø–µ—Ä –ë–µ—Ä–¥—è–Ω—Å—å–∫–æ–≥–æ –±–∞—Ç–∞–ª—å–π–æ–Ω—É —Ç–µ—Ä–∏—Ç–æ—Ä—ñ–∞–ª—å–Ω–æ—ó –æ–±–æ—Ä–æ–Ω–∏, –≤–∑—è–≤ –∑–±—Ä–æ—é –∞–±–∏ –∑–∞—Ö–∏—Å—Ç–∏—Ç–∏ —Å–≤—ñ–π –∫—Ä–∞–π, —Ç–∞ –ø–æ–≤–µ—Ä–Ω—É—Ç–∏ –∂–æ–≤—Ç–æ-–±–ª–∞–∫–∏—Ç–Ω–∏–π –ø—Ä–∞–ø–æ—Ä –≤ —Ä—ñ–¥–Ω–µ –º—ñ—Å—Ç–æüá∫üá¶ üöÄ –ù–ê–ü–ò–°–ê–¢–ò |–ü–Ü–î–ü–ò–°–ê–¢–ò–°–¨ | –ü–Ü–î–¢–†–ò–ú–ê–¢–ò |–†–ï–ö–õ–ê–ú–ê"
719637,0,2,0,0,


In [6]:
# Instantiate the multi-correction GEC prompt and keep its template; the
# template is later passed to prepare_jsonl_batches as `prompt_template`.
multi_gec_prompt = MultiGrammarErrorCorrectionGenerationPrompt()
grammar_correction_prompt = multi_gec_prompt.prompt_template

print(grammar_correction_prompt)

input_variables=['num_corrections', 'original_text'] template='–í–∏–ø—Ä–∞–≤—Ç–µ –Ω–∞—Å—Ç—É–ø–Ω–∏–π —Ç–µ–∫—Å—Ç, –∑—Ä–æ–±–∏–≤—à–∏ –π–æ–≥–æ –≥—Ä–∞–º–∞—Ç–∏—á–Ω–æ –ø—Ä–∞–≤–∏–ª—å–Ω–∏–º.\n–í–∏–ø—Ä–∞–≤—Ç–µ –≤—Å—ñ –æ—Ä—Ñ–æ–≥—Ä–∞—Ñ—ñ—á–Ω—ñ, –ø—É–Ω–∫—Ç—É–∞—Ü—ñ–π–Ω—ñ, —Å—Ç–∏–ª—ñ—Å—Ç–∏—á–Ω—ñ, –≥—Ä–∞–º–∞—Ç–∏—á–Ω—ñ, –ª–µ–∫—Å–∏—á–Ω—ñ —Ç–∞ —Å–∏–Ω—Ç–∞–∫—Å–∏—á–Ω—ñ –ø–æ–º–∏–ª–∫–∏.\n–Ø–∫—â–æ –ø–æ–º–∏–ª–æ–∫ –Ω–µ–º–∞—î, –ø–æ–≤—Ç–æ—Ä—ñ—Ç—å –æ—Ä–∏–≥—ñ–Ω–∞–ª—å–Ω–∏–π —Ç–µ–∫—Å—Ç.\n–ó–≥–µ–Ω–µ—Ä—É–π—Ç–µ {num_corrections} —Ä—ñ–∑–Ω—ñ –≤–∞—Ä—ñ–∞–Ω—Ç–∏ –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç—É –∑ –ø–æ—è—Å–Ω–µ–Ω–Ω—è–º–∏.\n–ï–∫—Ä–∞–Ω—É–π –≤—Å—ñ —Å–∏–º–≤–æ–ª–∏ –ø–æ–¥–≤—ñ–π–Ω–∏—Ö –ª–∞–ø–æ–∫ —É –≤—ñ–¥–ø–æ–≤—ñ–¥—ñ –∑–≤–æ—Ä–æ—Ç–Ω–æ—é –∫–æ—Å–æ—é —Ä–∏—Å–∫–æ—é\n\n–§–æ—Ä–º–∞—Ç –≤—ñ–¥–ø–æ–≤—ñ–¥—ñ —É JSON:\n[{{\n    "correction": "–≤–∏–ø—Ä–∞–≤–ª–µ–Ω–∏–π —Ç–µ–∫—Å—Ç",\n    "explanation": "–ø–æ—è—Å–Ω–µ–Ω–Ω—è –¥–æ –≤–∏–ø—Ä–∞–≤–ª–µ–Ω–Ω—è"\n}}, ...]\n\n–ü—Ä–∏–∫–ª–∞–¥–∏:\n\n1. –í—Ö—ñ–¥–Ω–∏–π —Ç–µ–∫—Å—Ç:\n   –û—Å—Ç–∞–Ω–Ω—ñ 3 –º—ñ—Å—è—Ü—ñ –º–æ–

In [7]:
import json


def prepare_jsonl_batches(
        df: pd.DataFrame,
        text_column: str,
        prompt_template: PromptTemplate,
        start_at: int,
        end_at: int,
        output_dir: str
) -> str:
    """
    Write one chat-completion batch request per row of ``df`` to a JSONL file.

    For every index label ``i`` in ``start_at..end_at`` (both inclusive) a
    request with ``custom_id == str(i)`` is written as a single JSON line to
    ``{output_dir}/ubertext_batched_multigec_input_{start_at}:{end_at}.jsonl``.
    Any existing file with that name is overwritten.

    The model name, number of corrections, temperature and top_p are read from
    the notebook-level ``parameters.multi_gec`` config, which must be defined
    before this function is called.

    Args:
        df: Frame whose index contains every label in ``start_at..end_at``.
        text_column: Name of the column holding the text to correct.
        prompt_template: Template formatted with ``original_text`` and
            ``num_corrections`` to produce the user message.
        start_at: First index label to include.
        end_at: Last index label to include (inclusive).
        output_dir: Directory the JSONL file is written to; it must already exist.

    Returns:
        Path of the written JSONL file.

    Raises:
        KeyError: If a label in ``start_at..end_at`` is missing from ``df.index``.
    """
    filename = f"{output_dir}/ubertext_batched_multigec_input_{start_at}:{end_at}.jsonl"

    # Loop-invariant generation settings, looked up once instead of per row.
    multi_gec_params = parameters.multi_gec

    # Open the file once for the whole batch. Mode "w" truncates any file left
    # by a previous run, replacing the earlier remove-then-append-per-row logic.
    with open(filename, "w", encoding="utf-8") as f:
        for i in range(start_at, end_at + 1):
            row = df.loc[i]
            # custom_id is sent as a string, so later results can be matched
            # back to the row by its index label.
            sample_id: str = str(i)
            # NOTE(review): the dataset preview shows rows with an empty text
            # cell; such a value is formatted into the prompt as-is — confirm
            # whether those rows should be filtered out upstream.
            sample_input_text: str = row[text_column]
            sample_request = {
                "custom_id": sample_id,
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": multi_gec_params.model_name,
                    "messages": [
                        {
                            "role": "user",
                            "content": prompt_template.format(
                                original_text=sample_input_text,
                                num_corrections=multi_gec_params.num_corrections
                            ),
                        }
                    ],
                    "temperature": multi_gec_params.temperature,
                    "top_p": multi_gec_params.top_p,
                }
            }

            # One request per line (JSONL).
            f.write(json.dumps(sample_request) + "\n")

    print(f"Input request saved to: {filename}")

    return filename


# Build and submit one batch input file per (start, end) range listed under
# `parameters.processing.batches.multi_gec`.
# NOTE(review): `parameters`, `client` and `output_dir` are not defined in the
# cells visible here — confirm the cell that defines them runs before this one.
# The mapping key (`batch_id`) is not used inside this loop.
for batch_id, (s_at, e_at) in parameters.processing.batches.multi_gec.items():
    # Label-based slice: `.loc[s_at:e_at]` includes both endpoints, matching the
    # inclusive range that prepare_jsonl_batches iterates over.
    ubertext_df_subset = ubertext_df.loc[s_at:e_at]
    batch_filename = prepare_jsonl_batches(
        df=ubertext_df_subset,
        text_column="sentence",
        prompt_template=grammar_correction_prompt,
        start_at=s_at,
        end_at=e_at,
        output_dir=output_dir
    )
    # Submit the JSONL file that was just written for this range.
    batch_metadata = submit_openai_batch(
        client=client,
        batch_filename=batch_filename,
        start_at=s_at,
        end_at=e_at,
        description_prefix="ubertext-batched-multigec"
    )
    # `{s_at, e_at}` formats a tuple, so this prints "<id>: [(start, end)]".
    print(f"{batch_metadata.id}: [{s_at, e_at}]")

Input request saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_input_170001:175000.jsonl
Batch submitted for processing: batch_673f5fb5f7e481908d819ed9d2ff1a6e
batch_673f5fb5f7e481908d819ed9d2ff1a6e: [(170001, 175000)]
Input request saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_input_175001:180000.jsonl
Batch submitted for processing: batch_673f5fc8d1048190bf32cea6ce34e4e0
batch_673f5fc8d1048190bf32cea6ce34e4e0: [(175001, 180000)]
Input request saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_input_180001:185000.jsonl
Batch submitted for processing: batch_673f5fdb43b48190b3e99109b3edec54
batch_673f5fdb43b48190b3e99109b3edec54: [(180001, 185000)]
Input request saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_input_185001:190000.jsonl
Batch submitted for processing: batch_673f5fed9f2c8190a214f63f3fa77eac
batch_673f5fed9f2c8190a214f63f3fa77eac: [(185001, 190000)]
Input re

In [6]:
# Fetch the output of every batch listed under
# `parameters.processing.batches.multi_gec` and accumulate a
# "batch_id: [start, end]" summary that is printed after the loop.
# NOTE(review): summary lines are tab-indented; if this text is meant to be
# pasted into a YAML config, tabs are not valid YAML indentation — confirm.
multi_gec_batches: str = "multi_gec:\n"
for batch_id, batch_bounds in parameters.processing.batches.multi_gec.items():
    s_at, e_at = batch_bounds
    output_file = retrieve_openai_batch(
        client=client,
        batch_id=batch_id,
        output_dir=output_dir,
        start_at=s_at,
        end_at=e_at,
        description_prefix="ubertext_batched_multigec",
    )
    multi_gec_batches = f"{multi_gec_batches}\t{batch_id}: [{s_at}, {e_at}]\n"

    print(f"Batch {batch_id}: {(s_at, e_at)} processing output saved to: {output_file}")

# Blank separator line followed by the accumulated summary.
print(f"\n{multi_gec_batches}")

Batch processing output saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_output_50001:55000.jsonl
Batch batch_673f2c1f07188190b80d2cdf18471535: (50001, 55000) processing output saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_output_50001:55000.jsonl
Batch processing output saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_output_55001:60000.jsonl
Batch batch_673f2d684cf48190bd0bf1f8e68f3752: (55001, 60000) processing output saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_output_55001:60000.jsonl
Batch processing output saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_output_60001:65000.jsonl
Batch batch_673f2f432d508190829c54ca7ec5fd2d: (60001, 65000) processing output saved to: ../../../datasets/ubertext/batched.nosync/ubertext_batched_multigec_output_60001:65000.jsonl
Batch processing output saved to: ../../../datasets/ubertext/batched.nosy