In [28]:
from pathlib import Path
from pprint import pprint
from datasets import load_dataset

import json
import os
from openai import OpenAI
from typing import List, Dict

In [26]:
# OpenAI client; the API key is read from the OPENAI_API_KEY environment
# variable (a KeyError is raised here if it is not set).
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

In [18]:
# The shared dataset root under the user's home directory must already exist.
datasets_dir = Path.home() / "Datasets"
assert datasets_dir.exists(), "Datasets directory not found"

# The project-specific MS MARCO folder below it is created on demand.
datasets_dir = datasets_dir.joinpath("SRBedding_datasets", "ms_marco_v1")
print(datasets_dir)
datasets_dir.mkdir(parents=True, exist_ok=True)

/Users/studeni/Datasets/SRBedding_datasets/ms_marco_v1


In [19]:
# Load MS MARCO v1.1, passing the project folder as the cache directory.
data = load_dataset(
    path="microsoft/ms_marco",
    name="v1.1",
    cache_dir=datasets_dir,
)

# Only the test split is used; keep just the columns needed for translation.
data_test = data["test"]
ms_marco = data_test.select_columns(
    column_names=["passages", "query", "query_id"],
)
ms_marco

Dataset({
    features: ['passages', 'query', 'query_id'],
    num_rows: 9650
})

In [22]:
# Number of rows taken from the test split to try the pipeline on.
N_TEST_SAMPLES = 3

# Index the row first and the column second: `ms_marco[i]` reads a single
# row, whereas `ms_marco["passages"][i]` loads the entire column before
# picking one element from it.
test_data = []
for i in range(N_TEST_SAMPLES):
    row = ms_marco[i]
    test_data.append(
        {
            "query_id": row["query_id"],
            "query": row["query"],
            # Keep only the passage texts; the other fields stored under
            # "passages" are not needed for translation.
            "passage_text": row["passages"]["passage_text"],
        }
    )

pprint(test_data[0])

{'passage_text': ['We have been feeding our back yard squirrels for the fall '
                  'and winter and we noticed that a few of them have missing '
                  'fur. One has a patch missing down his back and under both '
                  'arms. Also another has some missing on his whole chest. '
                  'They are all eating and seem to have a good appetite.',
                  'Critters cannot stand the smell of human hair, so '
                  'sprinkling a barrier of hair clippings around your garden, '
                  'or lightly working it into the soil when you plant bulbs, '
                  'apparently does have some merit. The whole thing kind of '
                  'makes me laugh. It never occurred to me that we are the '
                  'ones that stink.',
                  'Spread some human hair around your vegetable and flower '
                  'gardens. This will scare the squirrels away because humans '
                  "are predator

In [29]:
# System prompt for the English -> Serbian translation requests.
# The prompt requires output that `json.loads()` can parse, so the formatting
# rules and the output example must themselves describe strictly valid JSON:
# double-quoted keys/strings and an array (not braces) for the passage list.
SYSTEM_PROMPT = """
***TRANSLATION FROM ENGLISH TO SERBIAN***

**GOALS**

You are a professional translator fluent in English and Serbian.
Your primary goal is to produce a high-quality, natural-sounding translation from English to Serbian.
You are translating texts and questions pertaining to the texts. The translation is intended for dataset creation. Look at the example below:

***TRANSLATION EXAMPLE***
***ENGLISH***
query: 'What is a unicorn?'
passage_text: 'The unicorn is a legendary creature that has been described since antiquity as a beast with a single large, pointed, spiraling horn projecting from its forehead.'

***SERBIAN TRANSLATION***
query: 'Šta je jednorog?'
passage_text: 'Jednorog je mitsko stvorenje koje se od davnina opisuje kao zver sa jednim velikim, šiljastim, spiralnim rogom koji mu viri iz čela.'
***END OF TRANSLATION EXAMPLE***

To translate, follow the steps below:
   **TRANSLATION INSTRUCTIONS**
   1. Read and understand the sentence in English.
   2. When you understand the English sentence, start to translate.
   3. Pay close attention to both left and right context when you are making translation decisions.
   4. Convey the original context, tone and meaning in the Serbian translation.
   5. Avoid literal translations and ensure the output reads naturally in Serbian.
   6. The translation must be contextually accurate, fluent, and adhere to the grammatical rules and lexicon of the Serbian language.
   7. The declension of nouns, adjectives, and pronouns must be correct.
   8. Make sure to proofread the translated text in Serbian and revise any mistakes. If no revisions are needed, provide the translations as they are.

   **FORMATTING INSTRUCTIONS**
   1. Enclose every key and every string value in double quotation marks ("").
   2. Escape any double quotes inside a string with a backslash (\\").

   **OUTPUT FORMATTING**
   1. The output must be a valid JSON object.
   2. The output must be parsable by Python's 'json.loads()' function.
   3. Ensure consistent JSON formatting as illustrated in the example below:

      **EXAMPLE**
      {"translations": [{"query": "This is a query",
                         "passage_text": ["This is one passage. With another sentence.",
                                          "This is yet another passage. With yet another sentence."]}]
      }
      **END OF EXAMPLE**

   - Strictly follow the structure provided in the example when generating the output.
   - Keep the translated passages in the same order as in the input.
   - Make sure to translate text under both "query" and "passage_text" keys.

"""

In [35]:
def save_jsonl(jobs: List[Dict], filename: str | Path) -> None:
    with open(filename, "w", encoding="utf-8") as f:
        for job in jobs:
            json_string = json.dumps(job, ensure_ascii=False)
            f.write(json_string + "\n")


def make_jobs(
    model: str, prompt: str, filename: str | Path, dataset: List[Dict[str, str]]
) -> None:
    jobs = [
        {
            "model": model,
            "response_format": {"type": "json_object"},
            "temperature": 0,
            "metadata": {"id": sample["query_id"]},
            "messages": [
                {"role": "system", "content": prompt},  # Can be done in one line
                {"role": "user", "content": json.dumps(sample)},
            ],
        }
        for sample in dataset
    ]

    save_jsonl(jobs, filename)

In [36]:
# Write the request file for the sample rows collected in `test_data`.
model = "gpt-3.5-turbo-0125"
make_jobs(
    model=model,
    prompt=SYSTEM_PROMPT,
    filename="test.jsonl",
    dataset=test_data,
)

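## Terminal command

Run the command below from the directory that contains `translation-pipeline-test/`. It reads the requests from `test.jsonl` and writes the responses to `test_results.jsonl`.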

python translation-pipeline-test/api_request_parallel_processor.py \
  --requests_filepath translation-pipeline-test/test.jsonl \
  --save_filepath translation-pipeline-test/test_results.jsonl \
  --request_url https://api.openai.com/v1/chat/completions \
  --max_requests_per_minute 4000 \
  --max_tokens_per_minute 1_250_000 \
  --token_encoding_name cl100k_base \
  --max_attempts 5 \
  --logging_level 20

In [None]:
# How to set the API key as an environment variable in the terminal:
# export OPENAI_API_KEY="<your-api-key>"