In [21]:
import json
import os
import random
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm

In [22]:
# params
# Name of the Azure OpenAI deployment; passed to AzureChatOpenAI as
# `azure_deployment` when the client is created.
DEPLOYMENT_NAME = "emi-gpt-4o-mini"

In [23]:
# Load environment variables from the `.env` file in the user's home directory.
# expanduser("~") resolves the home directory even when the HOME variable is not
# set (e.g. on Windows), where os.environ['HOME'] would raise KeyError.
load_dotenv(os.path.expanduser("~/.env"))

True

In [24]:
# Shared chat-model client used by every translation call in this notebook.
# No endpoint or credentials are passed here — presumably they are picked up
# from the environment populated by load_dotenv; confirm against the .env file.
llm = AzureChatOpenAI(
    azure_deployment=DEPLOYMENT_NAME,
    temperature=0.0,  # together with the fixed seed, presumably chosen for repeatable output — TODO confirm
    max_tokens=800,  # NOTE(review): looks like a cap on reply length; long answers may come back truncated — confirm
    seed=42,
)

In [25]:
# Two-message chat prompt: a fixed system instruction followed by a user
# message whose `{string}` placeholder is filled with the text to translate.
translate_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", "Translate the following from English to Lithuanian"),
        ("user", "{string}"),
    ]
)


def translate_string(string):
    """Translate one string with a single call to the chat model.

    Fills `translate_prompt_template` with `string`, sends the resulting prompt
    to the module-level `llm`, and returns the reply's `content`.

    Args:
        string: Text to place in the prompt's `{string}` slot.

    Returns:
        The `content` attribute of the model response.

    Raises:
        Exception: Any error raised while invoking the model or reading the
            reply is propagated unchanged, after the input has been printed.
    """
    prompt = translate_prompt_template.invoke({"string": string})
    try:
        response = llm.invoke(prompt)
        return response.content
    except Exception:
        # Print the input that failed so it can be identified in the output of
        # a parallel run, then re-raise. A bare `raise` keeps the original
        # traceback without adding an extra re-raise frame.
        print(string)
        raise


@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=6))
@lru_cache(maxsize=None)
def translate_to_lithuanian(string):
    """Memoised, retrying front end for `translate_string`.

    The unbounded `lru_cache` sits directly on this function, so a string that
    has already been translated successfully is answered from the cache. The
    outer `retry` wrapper re-invokes the cached function on failure, stopping
    after 5 attempts and waiting with exponential back-off (min=3, max=6)
    between them.

    Args:
        string: Text to translate; must be hashable because it is the cache key.

    Returns:
        Whatever `translate_string` returns for `string`.
    """
    return translate_string(string)

In [46]:
# Load the tuning samples. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection, and
# the explicit encoding avoids depending on the platform's locale default.
with open("../datasets/train_samples_tuning.json", encoding="utf-8") as samples_file:
    samples = json.load(samples_file)

In [60]:
# Keep every 100th sample (about 1% of the data). The slice creates a new list
# but shares the sample objects with `samples`.
# NOTE(review): this cell's execution count is higher than the translation
# cell's, and the recorded outputs show 225 translated items versus 2249 items
# here — confirm the subset used for translation matches this one.
samples_1p = samples[::100]

In [53]:
# Collect the text of the second conversation turn from every sample in the
# subset (presumably the model's answer — TODO confirm against the dataset).
answers = [entry["conversations"][1]["value"] for entry in samples_1p]

In [54]:
# Translate all answers on a pool of 20 worker threads. executor.map yields the
# translations in the same order as `answers`; tqdm reports progress as the
# list is built from that iterator.
with ThreadPoolExecutor(max_workers=20) as executor:
    translation_iter = executor.map(translate_to_lithuanian, answers)
    results = list(tqdm(translation_iter, total=len(answers)))

100%|██████████| 225/225 [00:29<00:00,  7.52it/s]


In [55]:
# Overwrite the second conversation turn of each sample with its translation.
# zip() stops silently at the shorter input, so a subset that was re-sliced
# after translating would be written back only partially; fail loudly instead.
if len(samples_1p) != len(results):
    raise ValueError(
        f"Expected one translation per sample, got {len(results)} results "
        f"for {len(samples_1p)} samples"
    )
# The sample dicts are modified in place; because samples_1p is a slice of
# `samples`, the same objects inside `samples` change as well.
for sample, answer in zip(samples_1p, results):
    sample["conversations"][1]["value"] = answer

In [61]:
# Sanity check: number of samples in the subset.
# NOTE(review): should equal the number of translated answers — confirm after a
# fresh Restart & Run All.
len(samples_1p)

2249

In [56]:
# Write the subset to disk. The context manager guarantees the file is flushed
# and closed when the block exits; an unclosed handle may leave the output
# incomplete. ensure_ascii is left at its default, so the output bytes are
# unchanged regardless of the encoding.
with open("../datasets/train_samples_tuning_1p.json", "w", encoding="utf-8") as output_file:
    json.dump(samples_1p, output_file, indent=2)