In [37]:
import os

import backoff
import openai
from datasets import Dataset, load_dataset
from openai import OpenAI
from tqdm import tqdm

# Module-level OpenAI client shared by `llm` below. It is built with no
# arguments, so credentials/config come from the library's defaults
# (presumably the OPENAI_API_KEY environment variable — confirm for your setup).
client = OpenAI()


In [38]:
def language_convert(text, language="hindi"):
    """Translate ``text`` into ``language`` using the ``llm`` helper.

    Args:
        text: A string to translate, or a list of such values. Lists are
            translated element by element (recursively), so the result keeps
            the same length and nesting as the input.
        language: Name of the target language; interpolated into the prompt.

    Returns:
        The model's response string for a string input, or a list of
        responses (one per element) for a list input. ``None``, ``""`` and
        ``[]`` are returned unchanged without calling the model.
    """
    # Nothing to translate: avoid spending an API call on an empty prompt
    # (the model's reply to "convert following into hindi:" is arbitrary text).
    if text is None or (isinstance(text, (str, list)) and len(text) == 0):
        return text

    # Translate list elements individually so element boundaries survive.
    # Joining them into one string and wrapping the reply in a single-item
    # list would collapse an N-item column value into one blob.
    if isinstance(text, list):
        return [language_convert(item, language) for item in text]

    return llm(f"convert following into {language}:{text}")



In [39]:
@backoff.on_exception(
    backoff.expo,
    # Retry only failures that can succeed on a later attempt. Catching the
    # base `openai.APIError` would also retry authentication and bad-request
    # errors, which never recover and would loop indefinitely.
    (openai.RateLimitError, openai.APIConnectionError, openai.InternalServerError),
    # Bound the retries so a persistent outage surfaces as an exception
    # instead of hanging the notebook cell.
    max_tries=8,
)
def llm(prompt, **kwargs):
    """Send ``prompt`` to the chat completions API and return the reply text.

    The prompt is sent as a single message with role ``"system"``.

    Args:
        prompt: Full prompt text.
        **kwargs: Optional overrides for the request. Recognised keys and
            their defaults: ``model`` ("gpt-4"), ``temperature`` (0),
            ``top_p`` (1), ``frequency_penalty`` (0.0),
            ``presence_penalty`` (0.0), ``max_tokens`` (4000), ``n`` (1).
            Any other keys are ignored.

    Returns:
        The message content of the first returned choice.

    Raises:
        openai.APIError: Non-transient API errors are raised immediately;
            transient ones (rate limit, connection, server error) are
            re-raised once the retry budget is exhausted.
    """
    response = client.chat.completions.create(
        model=kwargs.get("model", "gpt-4"),
        messages=[{"role": "system", "content": prompt}],
        temperature=kwargs.get("temperature", 0),
        top_p=kwargs.get("top_p", 1),
        frequency_penalty=kwargs.get("frequency_penalty", 0.0),
        presence_penalty=kwargs.get("presence_penalty", 0.0),
        max_tokens=kwargs.get("max_tokens", 4000),
        n=kwargs.get("n", 1),
    )
    # Only the first choice is returned, even when n > 1 was requested.
    return response.choices[0].message.content

In [40]:
def convert_dataset(dataset, lang="hindi"):
    """Return ``dataset`` with every column passed through ``language_convert``.

    Columns are processed one at a time: for each feature name, the dataset
    is re-mapped so that column's value in every example is replaced by its
    conversion into ``lang``. A tqdm bar tracks progress over the columns.

    Args:
        dataset: Object exposing ``features`` (a mapping keyed by column
            name) and ``map`` — e.g. the dataset loaded in this notebook.
        lang: Target language forwarded to ``language_convert``.

    Returns:
        The dataset produced by the final ``map`` call.
    """
    column_names = dataset.features.keys()

    for column in tqdm(column_names):

        def translate_column(example):
            # Emit only the processed column; `map` merges it back into the row.
            return {column: language_convert(example[column], lang)}

        dataset = dataset.map(translate_column)

    return dataset

## Load dataset

In [16]:
# Load the "train" split of the explodinggradients/amnesty_qa dataset.
# No config name is passed; per the loader's log output below, this falls
# back to the "english" config.
english_dataset = load_dataset("explodinggradients/amnesty_qa")["train"]

No config specified, defaulting to: amnesty_qa/english
Found cached dataset amnesty_qa (/Users/shahules/.cache/huggingface/datasets/explodinggradients___amnesty_qa/english/1.0.0/061b5017d6cc681b4e33100596f256dc332ec9d306b7e6f05c48f1f0ecaa7aa0)
100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 216.30it/s]


In [None]:
# Translate every column of the English dataset using convert_dataset's
# default target language ("hindi"). This is slow: the recorded run below
# shows roughly 5-10 seconds per example for a single column.
hindi_dataset=convert_dataset(english_dataset)

  0%|                                                             | 0/4 [00:00<?, ?it/s]
Map:   0%|                                                | 0/20 [00:00<?, ? examples/s][A
Map:   5%|██                                      | 1/20 [00:04<01:27,  4.62s/ examples][A
Map:  10%|████                                    | 2/20 [00:17<02:54,  9.67s/ examples][A
Map:  15%|██████                                  | 3/20 [00:27<02:48,  9.88s/ examples][A
Map:  20%|████████                                | 4/20 [00:37<02:38,  9.91s/ examples][A
Map:  25%|██████████                              | 5/20 [00:43<02:07,  8.48s/ examples][A
Map:  30%|████████████                            | 6/20 [00:53<02:04,  8.89s/ examples][A
Map:  35%|██████████████                          | 7/20 [01:07<02:15, 10.42s/ examples][A
Map:  40%|████████████████                        | 8/20 [01:19<02:11, 10.97s/ examples][A
Map:  45%|██████████████████                      | 9/20 [01:26<01:47,  9.81s/ exam

In [None]:
# Export the translated dataset as CSV into the current user's Downloads
# folder. "~" is expanded at runtime so the notebook is not tied to one
# machine's absolute home-directory path.
hindi_dataset.to_csv(os.path.expanduser("~/Downloads/amnesty_qa_hindi.csv"))