In [1]:
import os
import time
import json
from tqdm.notebook import tqdm

from openai import OpenAI
from pydantic import BaseModel, ValidationError
from datasets import load_dataset
from huggingface_hub import HfApi, login

from concurrent.futures import ThreadPoolExecutor, as_completed
from tenacity import retry, stop_after_attempt, \
                     wait_random_exponential

In [2]:
os.environ['OPENAI_API_KEY'] = ''

In [3]:
# set login credentials for Huggingface
login('')

api = HfApi()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/faiq0913/.cache/huggingface/token
Login successful


### 1.0 Load, explore & clean Boolq dataset

In [2]:
ds = load_dataset('google/boolq', split=('train', 'validation'))
ds

(Dataset({
     features: ['question', 'answer', 'passage'],
     num_rows: 9427
 }),
 Dataset({
     features: ['question', 'answer', 'passage'],
     num_rows: 3270
 }))

In [3]:
df_train = ds[0].to_pandas()
df_val = ds[1].to_pandas()

df_train['answer'] = df_train['answer'].apply(lambda x: 0 if x == False else 1)
df_val['answer'] = df_val['answer'].apply(lambda x: 0 if x == False else 1)

df_train['language'] = 'English'
df_val['language'] = 'English'

df_train['split'] = 'train'
df_val['split'] = 'validation'

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9427 entries, 0 to 9426
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  9427 non-null   object
 1   answer    9427 non-null   int64 
 2   passage   9427 non-null   object
 3   language  9427 non-null   object
 4   split     9427 non-null   object
dtypes: int64(1), object(4)
memory usage: 368.4+ KB


In [5]:
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3270 entries, 0 to 3269
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  3270 non-null   object
 1   answer    3270 non-null   int64 
 2   passage   3270 non-null   object
 3   language  3270 non-null   object
 4   split     3270 non-null   object
dtypes: int64(1), object(4)
memory usage: 127.9+ KB


In [6]:
df_val.to_json('../datasets/boolq-english-val.jsonl', orient='records', lines=True)
df_train.to_json('../datasets/boolq-english-train.jsonl', orient='records', lines=True)

### 2.0 Generate Malay translations

In [4]:
class NewTranslatedOutput(BaseModel):
    passage_my: str
    question_my: str

class OldTranslatedOutput(BaseModel):
    question_my: str

In [5]:
client = OpenAI()

In [8]:
passage_done = {}

# Define a function to handle when all retries fail
def after_all_retries(retry_state):
    print(retry_state.outcome.exception())
    print("All retries failed. Moving on...\n")

@retry(
        stop=stop_after_attempt(3),
        wait=wait_random_exponential(min=1, max=60),
        retry_error_callback=after_all_retries,
)
def generate_reasoning(sample, split: str):
    """
    A function that calls the OpenAI API to generate Malay translations and returns a dictionary.
    
    Here, we handle an instance where the 'passage' column (which is very long!) has been translated
    already. If it hasn't we'll use the NewTranslatedOutput pydantic class as the output, else
    we'll use the OldTranslatedOutput class. Gotta save those OpenAI credits!
    """

    if sample['passage'] not in passage_done.keys():
        prompt_new = f"""You are tasked with providing translations from English to
            Bahasa Malaysia/Malay.

            Passage (Input):
            ```
            {sample['passage']}
            ```

            Statement (Input):
            ```
            {sample['question']}
            ```

            Provide the output in JSON schema e.g., {{'passage_my': '', 'question_my': ''}}.
            """
        try:
            completion = client.chat.completions.create(
            model="gpt-4o-mini",
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "NewTranslatedOutput",
                    "strict": True,
                    "schema": {
                        "type": 'object',
                        "properties": {
                            'passage_my': {"type": "string"},
                            'question_my': {"type": "string"},
                        },
                        "required": ["question_my", "passage_my"],
                        "additionalProperties": False
                    }

                }
            },
            temperature=0.2,
            messages=[
                {"role": "system", "content": prompt_new},
            ]
            )

            # add the translated output as a key in the passage_done dictionary.
            passage_done[sample['passage']] = NewTranslatedOutput.model_validate_json(completion.choices[0].message.content).passage_my

            time.sleep(10)
        
            return {
                'passage': NewTranslatedOutput.model_validate_json(completion.choices[0].message.content).passage_my,
                'summary': NewTranslatedOutput.model_validate_json(completion.choices[0].message.content).question_my,
                'answer': sample['answer'],
                'language': 'Malay',
                'split': split,
            }
        except ValidationError as e:
            raise ValidationError(f"Pydantic ValidationError: {e} - summary: {sample['summary']}")
            
    else:
        prompt_old = f"""You are tasked with providing translations from English to
            Bahasa Malaysia/Malay.

            Statement (Input):
            ```
            {sample['question']}
            ```

            Provide the output in JSON schema e.g., {{'question_my': ''}}.
            """
        try:
            completion = client.chat.completions.create(
            model="gpt-4o-mini",
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "OldTranslatedOutput",
                    "strict": True,
                    "schema": {
                        "type": 'object',
                        "properties": {
                            'question_my': {"type": "string"},
                        },
                        "required": ["question_my"],
                        "additionalProperties": False
                    }

                }
            },
            temperature=0.2,
            messages=[
                {"role": "system", "content": prompt_old},
            ]
            )

            time.sleep(10)

            return {
                'passage': passage_done[sample['passage']],
                'summary': OldTranslatedOutput.model_validate_json(completion.choices[0].message.content).question_my,
                'answer': sample['answer'],
                'language': 'Malay',
                'split': split,
            }
        
        except ValidationError as e:
            raise ValidationError(f"Pydantic ValidationError: {e} - summary: {sample['summary']}")

In [9]:
data_val = []

with open('../datasets/boolq-english-val.jsonl') as fopen:
    for d in tqdm(fopen):
        d = json.loads(d)
        data_val.append(d)

0it [00:00, ?it/s]

In [10]:
print(len(data_val))

3270


In [11]:
data_val[99]

{'question': 'did the girl in the lost world die',
 'answer': 0,
 'passage': "On Isla Sorna, an island off the Pacific coast of Costa Rica, a young girl named Cathy Bowman wanders around during a family vacation, and survives an attack by a swarm of Compsognathus. Her parents file a lawsuit against the genetics company InGen, now headed by John Hammond's nephew, Peter Ludlow, who plans to use Isla Sorna to alleviate the financial losses imposed by the incident that occurred at Jurassic Park four years earlier. Mathematician Dr. Ian Malcolm meets Hammond at his mansion. Hammond explains that Isla Sorna, abandoned years earlier during a hurricane, is where InGen created their dinosaurs before moving them to Jurassic Park on Isla Nublar. Hammond hopes to stop InGen by sending a team to Isla Sorna to document the dinosaurs, thus causing public support against human interference on the island. Ian, who survived the Jurassic Park disaster, is reluctant. After learning that his girlfriend, pa

In [13]:
# test on one sample
generate_reasoning(data_val[99], 'validation')

{'passage': 'Di Isla Sorna, sebuah pulau di pantai Pasifik Costa Rica, seorang gadis muda bernama Cathy Bowman berkeliaran semasa percutian keluarga, dan selamat daripada serangan sekumpulan Compsognathus. Ibu bapanya memfailkan saman terhadap syarikat genetik InGen, yang kini diketuai oleh anak saudara John Hammond, Peter Ludlow, yang merancang untuk menggunakan Isla Sorna untuk mengurangkan kerugian kewangan yang ditanggung akibat insiden yang berlaku di Jurassic Park empat tahun sebelumnya. Ahli matematik Dr. Ian Malcolm bertemu Hammond di rumah agamnya. Hammond menerangkan bahawa Isla Sorna, yang ditinggalkan bertahun-tahun lalu semasa ribut taufan, adalah tempat di mana InGen mencipta dinosaur mereka sebelum memindahkan mereka ke Jurassic Park di Isla Nublar. Hammond berharap untuk menghentikan InGen dengan menghantar satu pasukan ke Isla Sorna untuk mendokumentasikan dinosaur, dengan itu menyebabkan sokongan awam menentang campur tangan manusia di pulau itu. Ian, yang selamat dar

In [16]:
# verify
data_val[99]['passage'] in passage_done.keys()

True

In [17]:
# verify
passage_done[data_val[99]['passage']]

'Di Isla Sorna, sebuah pulau di pantai Pasifik Costa Rica, seorang gadis muda bernama Cathy Bowman berkeliaran semasa percutian keluarga, dan selamat daripada serangan sekumpulan Compsognathus. Ibu bapanya memfailkan saman terhadap syarikat genetik InGen, yang kini diketuai oleh anak saudara John Hammond, Peter Ludlow, yang merancang untuk menggunakan Isla Sorna untuk mengurangkan kerugian kewangan yang ditanggung akibat insiden yang berlaku di Jurassic Park empat tahun sebelumnya. Ahli matematik Dr. Ian Malcolm bertemu Hammond di rumah agamnya. Hammond menerangkan bahawa Isla Sorna, yang ditinggalkan bertahun-tahun lalu semasa ribut taufan, adalah tempat di mana InGen mencipta dinosaur mereka sebelum memindahkan mereka ke Jurassic Park di Isla Nublar. Hammond berharap untuk menghentikan InGen dengan menghantar satu pasukan ke Isla Sorna untuk mendokumentasikan dinosaur, dengan itu menyebabkan sokongan awam menentang campur tangan manusia di pulau itu. Ian, yang selamat daripada bencan

In [8]:
max_worker = 50
passage_done = {}

for i in tqdm(range(0, len(data_val), max_worker)):
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(generate_reasoning, t, 'validation'): t for t in data_val[i: i + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                with open('../datasets/openai-generated/boolq-malay-val.jsonl', 'a') as final:
                    json.dump(result, final)
                    final.write('\n')

In [14]:
data_train = []

with open('../datasets/boolq-english-train.jsonl') as fopen:
    for d in tqdm(fopen):
        d = json.loads(d)
        data_train.append(d)

0it [00:00, ?it/s]

In [15]:
max_worker = 50
passage_done = {}

for i in tqdm(range(0, len(data_train), max_worker)):
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(generate_reasoning, t, 'train'): t for t in data_train[i: i + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                with open('../datasets/openai-generated/boolq-malay-train.jsonl', 'a') as final:
                    json.dump(result, final)
                    final.write('\n')

  0%|          | 0/189 [00:00<?, ?it/s]

'summary'
All retries failed. Moving on...

'summary'
All retries failed. Moving on...

'summary'
All retries failed. Moving on...

