In [38]:
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from getpass import getpass

from huggingface_hub import HfApi, create_repo, login
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm.notebook import tqdm

In [11]:
# Never commit the key with the notebook: reuse one already exported in the
# environment, otherwise prompt for it without echoing. The original
# unconditional assignment also clobbered any key that was already set.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [15]:
# No api_key is passed here; presumably the client picks up OPENAI_API_KEY
# from the environment — confirm against the SDK version in use.
client = OpenAI()

In [39]:
# Set login credentials for Hugging Face.
# The token is taken from HF_TOKEN when present, otherwise prompted for with
# a blocking, non-echoing prompt, so it is never stored in the notebook.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face token: ')
login(hf_token)

api = HfApi()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/faiq0913/.cache/huggingface/token
Login successful


### 1.0 Load BoolQ and generate chain-of-thought reasoning

#### 1.1 Load files

In [6]:
# Input JSONL files, grouped by language; the order (English first, then
# Malay) determines the order of records once they are loaded.
english_files = [
    '../datasets/boolq-english-train.jsonl',
    '../datasets/boolq-english-val.jsonl',
]
malay_files = [
    '../datasets/openai-generated/boolq-malay-train-fixed.jsonl',
    '../datasets/openai-generated/boolq-malay-val-fixed.jsonl',
]
files = english_files + malay_files

In [18]:
# Read every JSONL file in `files` into one flat list of dicts.
data_all = []

for path in files:
    # The records contain non-ASCII text (IPA, Persian script, Malay), so pin
    # UTF-8 instead of relying on the platform's default encoding.
    with open(path, encoding='utf-8') as fopen:
        for line in tqdm(fopen):
            if not line.strip():
                # Tolerate blank or trailing empty lines instead of crashing
                # in json.loads.
                continue
            data_all.append(json.loads(line))

print(f'Length of dataset: {len(data_all)}')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Length of dataset: 25391


In [33]:
# Display the first loaded record to check which fields each sample carries.
data_all[0]

{'question': 'do iran and afghanistan speak the same language',
 'answer': 1,
 'passage': 'Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.',
 'language': 'English',
 'split': 'train'}

In [17]:
# Define a function to handle when all retries fail
def after_all_retries(retry_state):
    """Report a call whose retries have all been used up.

    Prints the exception stored on ``retry_state.outcome``, then the retry
    state itself, then a closing notice. Nothing is returned explicitly, so
    the result is ``None``.
    """
    failure = retry_state.outcome.exception()
    notice = "All retries failed. Moving on...\n"
    print(failure, retry_state, notice, sep="\n")

In [23]:
@retry(
    stop=stop_after_attempt(3),
    wait=wait_random_exponential(min=1, max=60),
    retry_error_callback=after_all_retries,
)
def generate_reasoning(sample):
    """Ask gpt-4o-mini for a step-by-step explanation of a sample's label.

    Args:
        sample: Mapping with the keys 'passage', 'question', 'answer',
            'language' and 'split'.

    Returns:
        A new dict holding those five fields plus 'reasoning', the text of
        the first completion choice. If every attempt fails, the decorator
        hands control to `after_all_retries` and the caller receives that
        callback's return value instead.

    Any exception raised in the body (API errors, a missing key, or the
    empty-completion check below) triggers another attempt, up to three in
    total, with randomised exponential back-off between them.
    """
    # NOTE: the continuation lines of this literal are indented, and that
    # indentation is sent to the model as part of the prompt.
    prompt = f"""You were initially tasked with determining whether a particular
        statement/question is factually/logically consistent (1) or not (0) based on a
        given passage.

        Passage (Input):
        ```
        {sample['passage']}
        ```

        Statement (Input):
        ```
        {sample['question']}
        ```

        Answer (Output):
        ```
        {sample['answer']}
        ```

        Provide step-by-step explanation for the output based on the Context/Statement given.
        Please give your answer in {sample['language']}.
        """

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        max_completion_tokens=1024,
        temperature=0.2,
        messages=[
            {"role": "system", "content": "You are an expert in detecting factual inconsistencies and hallucinations."},
            {"role": "user", "content": prompt},
        ],
    )

    reasoning = completion.choices[0].message.content
    # The message content can come back null or blank; treat that as a failed
    # attempt so it is retried rather than saved as a record with no reasoning.
    if not reasoning or not reasoning.strip():
        raise ValueError("OpenAI returned an empty completion")

    record = {
        field: sample[field]
        for field in ('passage', 'question', 'answer', 'language', 'split')
    }
    record['reasoning'] = reasoning
    return record

In [26]:
# Try a single sample before the full run; in the recorded output below,
# index 66 is an English training sample.
generate_reasoning(data_all[66])

{'passage': 'In the canonical case, lawyers seeking admission must earn a Juris Doctor degree from a law school approved by the jurisdiction, and then pass a bar exam administered by it. Typically, there is also a character and fitness evaluation, which includes a background check. However, there are exceptions to each of these requirements. A lawyer who is admitted in one state is not automatically allowed to practice in any other. Some states have reciprocal agreements that allow attorneys from other states to practice without sitting for another full bar exam; such agreements differ significantly among the states.',
 'question': 'do you have to have a college degree to take the bar exam',
 'answer': 1,
 'language': 'English',
 'split': 'train',
 'reasoning': 'To determine whether the statement "do you have to have a college degree to take the bar exam" is factually/logically consistent with the provided passage, we can analyze the information step-by-step:\n\n1. **Understanding the 

In [34]:
# Second single-sample check; in the recorded output below, index 14500 is a
# Malay training sample and the reasoning comes back in Malay.
generate_reasoning(data_all[14500])

{'passage': "A Wrinkle in Time adalah novel fantasi sains yang ditulis oleh penulis Amerika Madeleine L'Engle, pertama kali diterbitkan pada tahun 1962. Buku ini memenangi Anugerah Newbery, Anugerah Buku Sequoyah, dan Anugerah Rak Lewis Carroll, dan menjadi naib juara untuk Anugerah Hans Christian Andersen. Ia adalah buku pertama dalam Quintet Waktu L'Engle, yang mengikuti kisah Murrys dan Calvin O'Keefe.",
 'question': 'adakah a wrinkle in time sebuah cerita benar',
 'answer': 0,
 'language': 'Malay',
 'split': 'train',
 'reasoning': 'Untuk menentukan konsistensi fakta dari pernyataan "adakah A Wrinkle in Time sebuah cerita benar" berdasarkan petikan yang diberikan, kita perlu mengikuti langkah-langkah berikut:\n\n1. **Memahami Petikan**: Petikan menyatakan bahawa "A Wrinkle in Time" adalah novel fantasi sains yang ditulis oleh Madeleine L\'Engle dan diterbitkan pada tahun 1962. Ia juga menyebut bahawa buku ini memenangi beberapa anugerah dan merupakan buku pertama dalam Quintet Waktu

#### 1.2 Use multithreading and call OpenAI API

In [36]:
max_worker = 50
output_path = '../datasets/openai-generated/boolq-with-reasoning.jsonl'

# One thread pool and one append-mode file handle serve the whole run; the
# original rebuilt the pool for every batch and reopened the file for every
# result. Samples are still submitted in batches of `max_worker`, so the
# progress bar advances once per completed batch.
with ThreadPoolExecutor(max_workers=max_worker) as executor, \
        open(output_path, 'a') as final:
    for i in tqdm(range(0, len(data_all), max_worker)):
        futures = [
            executor.submit(generate_reasoning, t)
            for t in data_all[i: i + max_worker]
        ]

        # All writes happen here in the main thread, so lines never interleave.
        for future in as_completed(futures):
            result = future.result()
            if result:  # falsy when every retry for that sample failed
                final.write(json.dumps(result) + '\n')
                # Flush per record so an interrupted run keeps what it has
                # already generated.
                final.flush()

  0%|          | 0/508 [00:00<?, ?it/s]

In [40]:
# exist_ok=True makes this cell safe to re-run: without it the call raises
# once the dataset repo has already been created.
create_repo(
    "wanadzhar913/boolq-malay-with-chain-of-thought",
    repo_type="dataset",
    exist_ok=True,
)

RepoUrl('https://huggingface.co/datasets/wanadzhar913/boolq-malay-with-chain-of-thought', endpoint='https://huggingface.co', repo_type='dataset', repo_id='wanadzhar913/boolq-malay-with-chain-of-thought')

In [41]:
# Upload the generated JSONL file to the dataset repo on the Hugging Face Hub.
reasoning_file = '../datasets/openai-generated/boolq-with-reasoning.jsonl'
dataset_repo = "wanadzhar913/boolq-malay-with-chain-of-thought"

api.upload_file(
    repo_id=dataset_repo,
    repo_type="dataset",
    path_in_repo='boolq-with-reasoning.jsonl',
    path_or_fileobj=reasoning_file,
)

boolq-with-reasoning.jsonl:   0%|          | 0.00/55.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/wanadzhar913/boolq-malay-with-chain-of-thought/commit/d10139839e765a339d46c35f184c6e34fca1b695', commit_message='Upload boolq-with-reasoning.jsonl with huggingface_hub', commit_description='', oid='d10139839e765a339d46c35f184c6e34fca1b695', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/wanadzhar913/boolq-malay-with-chain-of-thought', endpoint='https://huggingface.co', repo_type='dataset', repo_id='wanadzhar913/boolq-malay-with-chain-of-thought'), pr_revision=None, pr_num=None)