In [1]:
import os
import time
import json
from tqdm.notebook import tqdm

from openai import OpenAI
from pydantic import BaseModel, ValidationError
from datasets import load_dataset
from huggingface_hub import HfApi, login

from concurrent.futures import ThreadPoolExecutor, as_completed
from tenacity import retry, stop_after_attempt, \
                     wait_random_exponential

In [2]:
os.environ['OPENAI_API_KEY'] = ''

In [3]:
# set login credentials for Huggingface
login('')

api = HfApi()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/faiq0913/.cache/huggingface/token
Login successful


### 1.0 Load, explore & clean FIB dataset

In [4]:
ds = load_dataset('r-three/fib', split='test')
df = ds.to_pandas()

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
df.head(3)

Unnamed: 0,id,input,correct_choice,list_choices,lbl,distractor_model,dataset
0,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,[ A new service on the Isle of Wight's chain f...,1,bart-base,xsum
1,29610109,If you leave your mobile phone somewhere do yo...,"Do you ever feel lonely, stressed or jealous w...","[ You may be worried about your health, but wh...",1,bart-base,xsum
2,38018439,"Speaking on TV, Maria Zakharova said Jews had ...",A spokeswoman on Russian TV has said Jewish pe...,[ The Russian foreign minister has said she ha...,1,bart-base,xsum


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3579 entries, 0 to 3578
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                3579 non-null   object
 1   input             3579 non-null   object
 2   correct_choice    3579 non-null   object
 3   list_choices      3579 non-null   object
 4   lbl               3579 non-null   int64 
 5   distractor_model  3579 non-null   object
 6   dataset           3579 non-null   object
dtypes: int64(1), object(6)
memory usage: 195.9+ KB


In [7]:
df.dataset.value_counts(normalize=True)

dataset
xsum      0.872311
cnn_dm    0.127689
Name: proportion, dtype: float64

In [8]:
df.iloc[0].input

'Vehicles and pedestrians will now embark and disembark the Cowes ferry separately following Maritime and Coastguard Agency (MCA) guidance.\nIsle of Wight Council said its new procedures were in response to a resident\'s complaint.\nCouncillor Shirley Smart said it would "initially result in a slower service".\nOriginally passengers and vehicles boarded or disembarked the so called "floating bridge" at the same time.\nMs Smart, who is the executive member for economy and tourism, said the council already had measures in place to control how passengers and vehicles left or embarked the chain ferry "in a safe manner".\nHowever, it was "responding" to the MCA\'s recommendations "following this complaint".\nShe added: "This may initially result in a slower service while the measures are introduced and our customers get used to the changes."\nThe service has been in operation since 1859.'

In [9]:
df.iloc[0].list_choices

array([" A new service on the Isle of Wight's chain ferry has been launched following a complaint from a resident.",
       'Passengers using a chain ferry have been warned crossing times will be longer because of new safety measures.'],
      dtype=object)

In [10]:
# Each list choice contains a positive and negative summary; we'll explode & clean
df_ = df.explode('list_choices')
df_['list_choices'] = df_['list_choices'].apply(lambda x: x.strip())

There seems to be similar input texts in the *input* column. A quick check yields that the correct choices get duplicated many times in the *list_choices* column. We'll need to handle this.

In [11]:
# https://stackoverflow.com/questions/57297077/use-variable-in-pandas-query
df_.query('input == @df_.iloc[0].input')

Unnamed: 0,id,input,correct_choice,list_choices,lbl,distractor_model,dataset
0,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,A new service on the Isle of Wight's chain fer...,1,bart-base,xsum
0,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...,1,bart-base,xsum
463,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...,0,bart-large,xsum
463,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,"The Isle of Wight's ""floating bridge"" is to be...",0,bart-large,xsum
877,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,ferry services are run by the Cowes Ferry Comp...,1,bloom-560m,xsum
877,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...,1,bloom-560m,xsum
2639,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,A chain of chain ferry services in the Isle of...,1,t5-large,xsum
2639,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...,1,t5-large,xsum


In [12]:
print(f'Num. rows before drop duplicates: {len(df_)}')
df_ = df_.drop_duplicates(subset=['input', 'list_choices'])
print(f'Num. rows after drop duplicates: {len(df_)}')

df_[['input', 'list_choices', 'correct_choice']].head(5)

Num. rows before drop duplicates: 7158
Num. rows after drop duplicates: 4081


Unnamed: 0,input,list_choices,correct_choice
0,Vehicles and pedestrians will now embark and d...,A new service on the Isle of Wight's chain fer...,Passengers using a chain ferry have been warne...
0,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...
1,If you leave your mobile phone somewhere do yo...,"You may be worried about your health, but what...","Do you ever feel lonely, stressed or jealous w..."
1,If you leave your mobile phone somewhere do yo...,"Do you ever feel lonely, stressed or jealous w...","Do you ever feel lonely, stressed or jealous w..."
2,"Speaking on TV, Maria Zakharova said Jews had ...",The Russian foreign minister has said she has ...,A spokeswoman on Russian TV has said Jewish pe...


In [13]:
# Create labels where factually consistent (1) and factually inconsistent/contradicting (0)
df_.loc[df_['correct_choice'] == df_['list_choices'], 'target'] = 1
df_.loc[df_['correct_choice'] != df_['list_choices'], 'target'] = 0
df_['target'] = df_['target'].astype(int)

df_[['input', 'list_choices', 'target']].head(3)

Unnamed: 0,input,list_choices,target
0,Vehicles and pedestrians will now embark and d...,A new service on the Isle of Wight's chain fer...,0
0,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,1
1,If you leave your mobile phone somewhere do yo...,"You may be worried about your health, but what...",0


In [14]:
df_clean = df_[['input', 'list_choices', 'target']]
df_clean['language'] = 'English'

df_clean.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['language'] = 'English'


Unnamed: 0,input,list_choices,target,language
0,Vehicles and pedestrians will now embark and d...,A new service on the Isle of Wight's chain fer...,0,English
0,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,1,English
1,If you leave your mobile phone somewhere do yo...,"You may be worried about your health, but what...",0,English


In [15]:
df_clean.rename({'input':'passage', 'list_choices': 'summary', 'target':'answer'}, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.rename({'input':'passage', 'list_choices': 'summary', 'target':'answer'}, axis=1, inplace=True)


In [16]:
df_clean.columns

Index(['passage', 'summary', 'answer', 'language'], dtype='object')

In [17]:
df_clean.to_json('../datasets/fib-english-openai.jsonl', orient='records', lines=True)

### 2.0 Generate Malay translations

In [4]:
class NewTranslatedOutput(BaseModel):
    passage_my: str
    summary_my: str

class OldTranslatedOutput(BaseModel):
    summary_my: str

In [5]:
client = OpenAI()

In [8]:
passage_done = {}

# Define a function to handle when all retries fail
def after_all_retries(retry_state):
    print(retry_state.outcome.exception())
    print("All retries failed. Moving on...\n")

@retry(
        stop=stop_after_attempt(3),
        wait=wait_random_exponential(min=1, max=60),
        retry_error_callback=after_all_retries,
)
def generate_reasoning(sample):
    """
    A function that calls the OpenAI API to generate Malay translations and returns a dictionary.
    
    Here, we handle an instance where the 'passage' column (which is very long!) has been translated
    already. If it hasn't we'll use the NewTranslatedOutput pydantic class as the output, else
    we'll use the OldTranslatedOutput class. Gotta save those OpenAI credits!
    """

    if sample['passage'] not in passage_done.keys():
        prompt_new = f"""You are tasked with providing translations from English to
            Bahasa Malaysia/Malay.

            Passage (Input):
            ```
            {sample['passage']}
            ```

            Statement (Input):
            ```
            {sample['summary']}
            ```

            Provide the output in JSON schema e.g., {{'passage_my': '', 'summary_my': ''}}.
            """
        try:
            completion = client.chat.completions.create(
            model="gpt-4o-mini",
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "NewTranslatedOutput",
                    "strict": True,
                    "schema": {
                        "type": 'object',
                        "properties": {
                            'passage_my': {"type": "string"},
                            'summary_my': {"type": "string"},
                        },
                        "required": ["summary_my", "passage_my"],
                        "additionalProperties": False
                    }

                }
            },
            temperature=0.2,
            messages=[
                {"role": "system", "content": prompt_new},
            ]
            )

            # add the translated output as a key in the passage_done dictionary.
            passage_done[sample['passage']] = NewTranslatedOutput.model_validate_json(completion.choices[0].message.content).passage_my

            time.sleep(10)
            
            return {
                'passage': NewTranslatedOutput.model_validate_json(completion.choices[0].message.content).passage_my,
                'summary': NewTranslatedOutput.model_validate_json(completion.choices[0].message.content).summary_my,
                'answer': sample['answer'],
                'language': 'Malay',
            }
        
        except ValidationError as e:
            raise ValidationError(f"Pydantic ValidationError: {e} - summary: {sample['summary']}")
            
    else:
        prompt_old = f"""You are tasked with providing translations from English to
            Bahasa Malaysia/Malay.

            Statement (Input):
            ```
            {sample['summary']}
            ```

            Provide the output in JSON schema e.g., {{'summary_my': ''}}.
            """
        try:
            completion = client.chat.completions.create(
            model="gpt-4o-mini",
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "OldTranslatedOutput",
                    "strict": True,
                    "schema": {
                        "type": 'object',
                        "properties": {
                            'summary_my': {"type": "string"},
                        },
                        "required": ["summary_my"],
                        "additionalProperties": False
                    }

                }
            },
            temperature=0.2,
            messages=[
                {"role": "system", "content": prompt_old},
            ]
            )

            time.sleep(10)

            return {
                'passage': passage_done[sample['passage']],
                'summary': OldTranslatedOutput.model_validate_json(completion.choices[0].message.content).summary_my,
                'answer': sample['answer'],
                'language': 'Malay',
            }
        
        except:
            raise ValidationError(f"Pydantic ValidationError: {e} - summary: {sample['summary']}")

In [9]:
data = []

with open('../datasets/fib-english-openai.jsonl') as fopen:
    for d in tqdm(fopen):
        d = json.loads(d)
        data.append(d)

0it [00:00, ?it/s]

In [10]:
print(len(data))

4081


In [11]:
data[99]

{'passage': 'It comes after Ruskin College\'s governing body agreed to cut two of its six BA programmes and one of its three MA courses to save money.\nCollege principal Chris Wilkes said the decision was related to "low student numbers rather than any other reason".\nHe said it would continue to focus on adult education.\nFamous Ruskin College alumni include former deputy prime minister John Prescott and Dennis Skinner, MP for Bolsover in Derbyshire.\nBA English Studies, BA History with Social Sciences and MA Women\'s Studies are being discontinued.\nAnne Hughes, 63, from Thame in Oxfordshire, has been studying BA English Studies part-time at the college for two years.\nShe said she was "absolutely distraught" it was being cut.\nShe is physically disabled and has depression, and said she chose to study at the college because it worked with "disadvantaged, disabled and mature students".\nThe college also offers bespoke timetables, unlike some other places of higher education, she added

In [12]:
# test on one sample
generate_reasoning(data[99])

{'passage': 'Ia datang selepas badan pengelola Kolej Ruskin bersetuju untuk memotong dua daripada enam program BA dan satu daripada tiga kursus MA untuk menjimatkan wang. Pengetua kolej, Chris Wilkes, berkata keputusan itu berkaitan dengan "bilangan pelajar yang rendah dan bukannya sebab lain". Beliau berkata kolej akan terus memberi tumpuan kepada pendidikan dewasa. Alumni terkenal Kolej Ruskin termasuk bekas timbalan perdana menteri John Prescott dan Dennis Skinner, MP untuk Bolsover di Derbyshire. BA English Studies, BA History with Social Sciences dan MA Women\'s Studies sedang dihentikan. Anne Hughes, 63, dari Thame di Oxfordshire, telah belajar BA English Studies secara separuh masa di kolej selama dua tahun. Beliau berkata beliau "sangat kecewa" ia dihentikan. Beliau mempunyai kecacatan fizikal dan mengalami kemurungan, dan berkata beliau memilih untuk belajar di kolej kerana ia bekerja dengan "pelajar yang kurang bernasib baik, cacat dan dewasa". Kolej juga menawarkan jadual wa

In [13]:
# verify
data[99]['passage'] in passage_done.keys()

True

In [14]:
max_worker = 50
passage_done = {}

for i in tqdm(range(0, len(data), max_worker)):
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(generate_reasoning, t): t for t in data[i: i + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                with open('../datasets/openai-generated/fib-malay-openai.jsonl', 'a') as final:
                    json.dump(result, final)
                    final.write('\n')

  0%|          | 0/82 [00:00<?, ?it/s]

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retries failed. Moving on...

No constructor defined
All retr