In [16]:
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from getpass import getpass

from huggingface_hub import HfApi, create_repo, login
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm.notebook import tqdm

In [2]:
# Never hard-code the API key in the notebook. Keep a key that is already
# exported in the environment; only when none is set, prompt for it without
# echoing it to the screen (so it is not stored in the saved notebook).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [3]:
# Shared API client; generate_reasoning sends every chat-completion request through it.
# NOTE(review): no api_key argument is passed, so OpenAI() presumably reads
# OPENAI_API_KEY from the environment — confirm against the installed SDK.
client = OpenAI()

In [17]:
# Authenticate with the Hugging Face Hub without hard-coding the token.
# Use HF_TOKEN from the environment when it is set; otherwise pass token=None,
# which lets login() ask for the token interactively.
login(token=os.environ.get('HF_TOKEN') or None)

# Hub API handle used further down to upload the generated dataset file.
api = HfApi()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/faiq0913/.cache/huggingface/token
Login successful


### 1.0 Load and generate chain of thought for FIB

#### 1.1 Load files

In [5]:
# JSONL inputs, one JSON record per line; paths are relative to the notebook's
# working directory. Going by the file names these are the English FIB set and
# an OpenAI-generated Malay counterpart — presumably a translation; confirm.
files = [
    '../datasets/fib-english.jsonl',
    '../datasets/openai-generated/fib-malay-openai.jsonl',
]

In [6]:
# Read every JSONL input into one flat list of records. The list is rebuilt
# from scratch each time, so re-running this cell never duplicates samples.
data_all = []

for f in files:
    # The data contains non-ASCII text, so pin the encoding instead of relying
    # on the platform default.
    with open(f, encoding='utf-8') as fopen:
        for line in tqdm(fopen, desc=f):
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            if not line.strip():
                continue
            d = json.loads(line)
            data_all.append(d)

print(f'Length of dataset: {len(data_all)}')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Length of dataset: 7610


In [7]:
# Peek at the first record to see which fields each sample carries.
data_all[0]

{'passage': 'Vehicles and pedestrians will now embark and disembark the Cowes ferry separately following Maritime and Coastguard Agency (MCA) guidance.\nIsle of Wight Council said its new procedures were in response to a resident\'s complaint.\nCouncillor Shirley Smart said it would "initially result in a slower service".\nOriginally passengers and vehicles boarded or disembarked the so called "floating bridge" at the same time.\nMs Smart, who is the executive member for economy and tourism, said the council already had measures in place to control how passengers and vehicles left or embarked the chain ferry "in a safe manner".\nHowever, it was "responding" to the MCA\'s recommendations "following this complaint".\nShe added: "This may initially result in a slower service while the measures are introduced and our customers get used to the changes."\nThe service has been in operation since 1859.',
 'summary': "A new service on the Isle of Wight's chain ferry has been launched following 

In [9]:
def after_all_retries(retry_state):
    """Report a call that still failed after its last retry attempt.

    Registered as the ``retry_error_callback`` of ``generate_reasoning``.
    Prints the final exception, then the retry state itself, then a notice.
    Nothing is returned explicitly, so the decorated call evaluates to None
    instead of raising.
    """
    last_error = retry_state.outcome.exception()
    for item in (last_error, retry_state):
        print(item)
    print("All retries failed. Moving on...\n")

In [10]:
@retry(
    stop=stop_after_attempt(3),
    wait=wait_random_exponential(min=1, max=60),
    retry_error_callback=after_all_retries,
)
def generate_reasoning(sample):
    """Ask the chat model to explain a sample's already-known consistency label.

    Args:
        sample: Mapping that provides the 'passage', 'summary', 'answer' and
            'language' keys.

    Returns:
        A dict holding those four fields copied from ``sample`` plus a
        'reasoning' key with the text of the model's first choice. The call is
        attempted up to three times; if every attempt raises, the value
        returned by ``after_all_retries`` (None) is handed back instead.
    """
    passage = sample['passage']
    summary = sample['summary']
    answer = sample['answer']
    language = sample['language']

    # The prompt text (including its indentation) is sent to the model verbatim.
    prompt = f"""You were initially tasked with determining whether a particular
        statement/question is factually/logically consistent (1) or not (0) based on a
        given passage.

        Passage (Input):
        ```
        {passage}
        ```

        Statement (Input):
        ```
        {summary}
        ```

        Answer (Output):
        ```
        {answer}
        ```

        Provide step-by-step explanation for the output based on the Context/Statement given.
        Please give your answer in {language}.
        """

    system_message = {"role": "system", "content": "You are an expert in detecting factual inconsistencies and hallucinations."}
    user_message = {"role": "user", "content": prompt}

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        max_completion_tokens=1024,
        temperature=0.2,
        messages=[system_message, user_message],
    )
    reasoning = response.choices[0].message.content

    return {
        'passage': passage,
        'summary': summary,
        'answer': answer,
        'language': language,
        'reasoning': reasoning,
    }

In [11]:
# Smoke-test the prompt on a single sample before the full run
# (index 66; the passage shown in the output is in English).
generate_reasoning(data_all[66])

{'passage': 'The 24-year-old tight-head is among the starting XV who are given a chance to redeem themselves after the 29-13 defeat by Scotland.\nAsked if he was relieved to be retained, the Exeter Chiefs player replied: "Oh yeah.\n"Obviously you want to keep your place and after a loss everyone was on edge."\nFrancis continued: "It\'s good we\'ve got a good squad and a good depth especially in my position.\n"There\'s Samson [Lee] there\'s me and there\'s Rhods [Rhodri Jones] and there\'s Scott Andrews.\n"If you don\'t have a good game there\'s someone knocking ready to do anything to take your place."\nMedia playback is not supported on this device\nWales go into Friday night\'s match in Cardiff off the back of defeats by Scotland and England.\nAnother defeat will mean it would be Wales\' worst championship since 2010 and Francis accepts the players have something to prove at the Principality Stadium.\nFrancis says he has managed to avoid the worst of the criticism aimed at the team i

In [13]:
# Second single-sample check (index 5600; the passage shown in the output is in Malay).
generate_reasoning(data_all[5600])

{'passage': 'Burung yang dikenali secara rasmi sebagai LF15, dan pasangannya LM12, telah membiakkan tiga anak burung pada tahun 2015 dan 2016 di rizab Dunkeld. Lassie tiba kembali di Scotland pada 23 Mac, enam hari selepas pasangannya. Penonton dari seluruh dunia telah menyaksikan kemajuan pasangan itu di webcam langsung Loch of the Lowes. Charlotte Fleming, penjaga Perthshire untuk Scottish Wildlife Trust, berkata: "LF15 telah menetap untuk memulakan pengeraman, dan kami menjangkakan telur itu akan menetas dalam masa kira-kira enam minggu. "Kini jelas bahawa usaha pasangan itu untuk membiakkan telah berjaya dan mereka berada di landasan untuk mengulangi kejayaan tiga anak burung yang menetas pada tahun 2015 dan 2016.",',
 'summary': 'Pasangan itu diperkenalkan ke kawasan simpanan pada tahun 2013 oleh Scottish Wildlife Trust dan pasangan itu telah diperkenalkan semula ke kawasan simpanan oleh Scottish Raptor Study Group. Loch of the Lowes adalah kawasan hutan yang luas.',
 'answer': 0,

#### 1.2 Use multithreading and call OpenAI API

In [14]:
# Number of samples per batch, and also the thread-pool size for each batch.
max_worker = 50
output_path = '../datasets/openai-generated/fib-with-reasoning.jsonl'

# Open the output once instead of re-opening it for every record. It stays in
# append mode, so existing content is kept.
# NOTE: because of append mode, re-running this cell adds the records again;
# remove or rename the file first for a clean run.
with open(output_path, 'a', encoding='utf-8') as final:
    for i in tqdm(range(0, len(data_all), max_worker)):
        batch = data_all[i: i + max_worker]
        with ThreadPoolExecutor(max_workers=max_worker) as executor:
            futures = [executor.submit(generate_reasoning, t) for t in batch]

            # Results are written in completion order, not input order.
            for future in as_completed(futures):
                result = future.result()
                # generate_reasoning yields None once all retries have failed;
                # such samples are skipped.
                if result:
                    json.dump(result, final)
                    final.write('\n')
                    # Flush per record so progress survives an interrupted run.
                    final.flush()

  0%|          | 0/153 [00:00<?, ?it/s]

In [18]:
# Create the target dataset repo on the Hub. exist_ok=True keeps this cell
# re-runnable: without it the call raises once the repo already exists.
create_repo("wanadzhar913/fib-malay-with-chain-of-thought", repo_type="dataset", exist_ok=True)

RepoUrl('https://huggingface.co/datasets/wanadzhar913/fib-malay-with-chain-of-thought', endpoint='https://huggingface.co', repo_type='dataset', repo_id='wanadzhar913/fib-malay-with-chain-of-thought')

In [20]:
# Upload the generated JSONL file to the dataset repo, keeping the local
# file name as its path inside the repo.
api.upload_file(
    path_or_fileobj='../datasets/openai-generated/fib-with-reasoning.jsonl',
    path_in_repo='fib-with-reasoning.jsonl',
    repo_id="wanadzhar913/fib-malay-with-chain-of-thought",
    repo_type="dataset",
)

fib-with-reasoning.jsonl:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/wanadzhar913/fib-malay-with-chain-of-thought/commit/9517ddbe153f9c359025c20e2f7ae3eb2c97d3e8', commit_message='Upload fib-with-reasoning.jsonl with huggingface_hub', commit_description='', oid='9517ddbe153f9c359025c20e2f7ae3eb2c97d3e8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/wanadzhar913/fib-malay-with-chain-of-thought', endpoint='https://huggingface.co', repo_type='dataset', repo_id='wanadzhar913/fib-malay-with-chain-of-thought'), pr_revision=None, pr_num=None)