In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
from huggingface_hub import HfApi, login
from sklearn.model_selection import train_test_split

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# set login credentials for Huggingface
login('')

api = HfApi()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/faiq0913/.cache/huggingface/token
Login successful


In [3]:
# load model for translations
tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-base-standard-bahasa-cased')
model = T5ForConditionalGeneration.from_pretrained('mesolitica/t5-base-standard-bahasa-cased').eval().cuda()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
!nvidia-smi

Sat Oct 12 18:27:07 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.52.01              Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   52C    P3              9W /   39W |    1017MiB /   6141MiB |      7%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### 1.0 Load, explore & clean FIB dataset

In [5]:
ds = load_dataset('r-three/fib', split='test')
df = ds.to_pandas()

Repo card metadata block was not found. Setting CardData to empty.


In [6]:
df.head(3)

Unnamed: 0,id,input,correct_choice,list_choices,lbl,distractor_model,dataset
0,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,[ A new service on the Isle of Wight's chain f...,1,bart-base,xsum
1,29610109,If you leave your mobile phone somewhere do yo...,"Do you ever feel lonely, stressed or jealous w...","[ You may be worried about your health, but wh...",1,bart-base,xsum
2,38018439,"Speaking on TV, Maria Zakharova said Jews had ...",A spokeswoman on Russian TV has said Jewish pe...,[ The Russian foreign minister has said she ha...,1,bart-base,xsum


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3579 entries, 0 to 3578
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                3579 non-null   object
 1   input             3579 non-null   object
 2   correct_choice    3579 non-null   object
 3   list_choices      3579 non-null   object
 4   lbl               3579 non-null   int64 
 5   distractor_model  3579 non-null   object
 6   dataset           3579 non-null   object
dtypes: int64(1), object(6)
memory usage: 195.9+ KB


In [8]:
df.dataset.value_counts(normalize=True)

dataset
xsum      0.872311
cnn_dm    0.127689
Name: proportion, dtype: float64

In [9]:
df.iloc[0].input

'Vehicles and pedestrians will now embark and disembark the Cowes ferry separately following Maritime and Coastguard Agency (MCA) guidance.\nIsle of Wight Council said its new procedures were in response to a resident\'s complaint.\nCouncillor Shirley Smart said it would "initially result in a slower service".\nOriginally passengers and vehicles boarded or disembarked the so called "floating bridge" at the same time.\nMs Smart, who is the executive member for economy and tourism, said the council already had measures in place to control how passengers and vehicles left or embarked the chain ferry "in a safe manner".\nHowever, it was "responding" to the MCA\'s recommendations "following this complaint".\nShe added: "This may initially result in a slower service while the measures are introduced and our customers get used to the changes."\nThe service has been in operation since 1859.'

In [10]:
df.iloc[0].list_choices

array([" A new service on the Isle of Wight's chain ferry has been launched following a complaint from a resident.",
       'Passengers using a chain ferry have been warned crossing times will be longer because of new safety measures.'],
      dtype=object)

In [11]:
# Each list choice contains a positive and negative summary; we'll explode & clean
df_ = df.explode('list_choices')
df_['list_choices'] = df_['list_choices'].apply(lambda x: x.strip())

There seems to be similar input texts in the *input* column. A quick check yields that the correct choices get duplicated many times in the *list_choices* column. We'll need to handle this.

In [12]:
# https://stackoverflow.com/questions/57297077/use-variable-in-pandas-query
df_.query('input == @df_.iloc[0].input')

Unnamed: 0,id,input,correct_choice,list_choices,lbl,distractor_model,dataset
0,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,A new service on the Isle of Wight's chain fer...,1,bart-base,xsum
0,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...,1,bart-base,xsum
463,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...,0,bart-large,xsum
463,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,"The Isle of Wight's ""floating bridge"" is to be...",0,bart-large,xsum
877,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,ferry services are run by the Cowes Ferry Comp...,1,bloom-560m,xsum
877,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...,1,bloom-560m,xsum
2639,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,A chain of chain ferry services in the Isle of...,1,t5-large,xsum
2639,32168497,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...,1,t5-large,xsum


In [13]:
print(f'Num. rows before drop duplicates: {len(df_)}')
df_ = df_.drop_duplicates(subset=['input', 'list_choices'])
print(f'Num. rows after drop duplicates: {len(df_)}')

df_[['input', 'list_choices', 'correct_choice']].head(5)

Num. rows before drop duplicates: 7158
Num. rows after drop duplicates: 4081


Unnamed: 0,input,list_choices,correct_choice
0,Vehicles and pedestrians will now embark and d...,A new service on the Isle of Wight's chain fer...,Passengers using a chain ferry have been warne...
0,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,Passengers using a chain ferry have been warne...
1,If you leave your mobile phone somewhere do yo...,"You may be worried about your health, but what...","Do you ever feel lonely, stressed or jealous w..."
1,If you leave your mobile phone somewhere do yo...,"Do you ever feel lonely, stressed or jealous w...","Do you ever feel lonely, stressed or jealous w..."
2,"Speaking on TV, Maria Zakharova said Jews had ...",The Russian foreign minister has said she has ...,A spokeswoman on Russian TV has said Jewish pe...


In [14]:
# Create labels where factually consistent (1) and factually inconsistent/contradicting (0)
df_.loc[df_['correct_choice'] == df_['list_choices'], 'target'] = 1
df_.loc[df_['correct_choice'] != df_['list_choices'], 'target'] = 0
df_['target'] = df_['target'].astype(int)

df_[['input', 'list_choices', 'target']].head(3)

Unnamed: 0,input,list_choices,target
0,Vehicles and pedestrians will now embark and d...,A new service on the Isle of Wight's chain fer...,0
0,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,1
1,If you leave your mobile phone somewhere do yo...,"You may be worried about your health, but what...",0


In [15]:
df_clean = df_[['input', 'list_choices', 'target']]
df_clean['language'] = 'English'

df_clean.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['language'] = 'English'


Unnamed: 0,input,list_choices,target,language
0,Vehicles and pedestrians will now embark and d...,A new service on the Isle of Wight's chain fer...,0,English
0,Vehicles and pedestrians will now embark and d...,Passengers using a chain ferry have been warne...,1,English
1,If you leave your mobile phone somewhere do yo...,"You may be worried about your health, but what...",0,English


### 2.0 Generate Malay translated rows for dataset

In [16]:
print(df_clean.input.loc[3541])

( cnn ) five years after the deepwater horizon rig exploded and unleashed the largest marine oil spill in the nation 's history , we are still experiencing -- yet only beginning to truly understand -- its profound environmental and economic repercussions . the immediate aftermath of the oil spill has been well documented , with declines in tourism and the seafood industry , as well as the significant destruction of wildlife in the region . since then , the amount of oil in the area has dissipated and communities have started to show signs of recovery . in fact , reports indicate that the gulf of mexico 's seafood industry , which supplies the united states with roughly 40 % of its seafood , is finally starting to rebound . however , profound challenges remain , in part because so many questions about the long-term consequences remain unanswered . to this day , it 's still unclear where all of the oil went , exactly how much remains or whether the reappearance of wildlife is a result of

In [17]:
# Let's translate one sample and compare with the English translation above.
input_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {df_clean.input.loc[3541]}', return_tensors = 'pt').cuda()
outputs = model.generate(input_ids, max_length = 4096)
all_special_ids = [0, 1, 2]
outputs = [i for i in outputs[0] if i not in all_special_ids]
tokenizer.decode(outputs, spaces_between_special_tokens = False)

'(cnn) lima tahun setelah pelantar cakrawala air dalam meletup dan melepaskan tumpahan minyak laut terbesar dalam sejarah negara, kita masih mengalami - namun baru mula benar-benar memahami - kesan persekitaran dan ekonominya yang mendalam. sejurus selepas tumpahan minyak telah didokumentasikan dengan baik, dengan penurunan pelancongan dan industri makanan laut, serta kemusnahan hidupan liar yang ketara di rantau ini. sejak itu, jumlah minyak di kawasan itu hilang dan masyarakat telah mula menunjukkan tanda-tanda pemulihan. Sebenarnya, laporan menunjukkan bahawa jurang industri makanan laut mexico, yang membekalkan kira-kira 40% makanan lautnya, akhirnya mulai pulih. Namun, cabaran mendalam tetap ada, sebahagiannya kerana begitu banyak persoalan mengenai akibat jangka panjang tetap belum dijawab. hingga hari ini, masih belum jelas ke mana semua minyak pergi, berapa banyak yang tersisa atau apakah kemunculan semula hidupan liar adalah hasil penyesuaian atau isyarat bahawa krisis benar-b

In [18]:
# to take advantage of batch processing on GPU's, we save everything in lists first
choices_en = df_clean['list_choices'].to_list()
input_en = df_clean['input'].to_list()
target = df_clean['target'].to_list()

In [19]:
input_my = []
batch_size = 5

with torch.no_grad():
    
    for i in tqdm(range(0, len(input_en), batch_size)):
        batch = input_en[i:i+batch_size]
        encodeds = [f'terjemah Inggeris ke Melayu: {row}' for row in batch]
        
        model_inputs = tokenizer(
            encodeds,
            padding=True,
            truncation=True, # https://huggingface.co/docs/transformers/en/pad_truncation
            max_length=1024, # https://huggingface.co/mesolitica/translation-t5-base-standard-bahasa-cased/blob/main/config.json
            return_tensors="pt",
        ).to('cuda')

        generated_ids = model.generate(**model_inputs, max_length = 4096)
        decoded = tokenizer.batch_decode(
            generated_ids,
            spaces_between_special_tokens=True,
            skip_special_tokens=True
        )

        for output in decoded:
            input_my.append(output)

  0%|          | 0/817 [00:00<?, ?it/s]

In [20]:
choices_my = []
batch_size = 20

with torch.no_grad():
    
    for i in tqdm(range(0, len(choices_en), batch_size)):
        batch = choices_en[i:i+batch_size]
        encodeds = [f'terjemah Inggeris ke Melayu: {row}' for row in batch]
        
        model_inputs = tokenizer(
            encodeds,
            padding=True,
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        ).to('cuda')

        generated_ids = model.generate(**model_inputs, max_length = 4096)
        decoded = tokenizer.batch_decode(
            generated_ids,
            spaces_between_special_tokens=True,
            skip_special_tokens=True
        )

        for output in decoded:
            choices_my.append(output)

  0%|          | 0/205 [00:00<?, ?it/s]

In [21]:
df_malay = pd.DataFrame({
    'input': input_my,
    'list_choices': choices_my,
    'target': target,
    'language': 'Malay'
})

df_malay.head(3)

Unnamed: 0,input,list_choices,target,language
0,Kenderaan dan pejalan kaki kini akan memulakan...,Perkhidmatan baru di feri rantai Isle of Wight...,0,Malay
1,Kenderaan dan pejalan kaki kini akan memulakan...,Penumpang yang menggunakan feri berantai telah...,1,Malay
2,Sekiranya anda meninggalkan telefon bimbit and...,"Anda mungkin bimbang tentang kesihatan anda, t...",0,Malay


In [22]:
df_final = pd.concat([df_malay, df_clean], axis=0)

print(f'Num. rows in final dataset: {len(df_final)}')
df_final.head(3)

Num. rows in final dataset: 8162


Unnamed: 0,input,list_choices,target,language
0,Kenderaan dan pejalan kaki kini akan memulakan...,Perkhidmatan baru di feri rantai Isle of Wight...,0,Malay
1,Kenderaan dan pejalan kaki kini akan memulakan...,Penumpang yang menggunakan feri berantai telah...,1,Malay
2,Sekiranya anda meninggalkan telefon bimbit and...,"Anda mungkin bimbang tentang kesihatan anda, t...",0,Malay


In [23]:
df_final.to_json('../datasets/fib-all.jsonl', orient='records', lines=True)

### 3.0 Train-test Split & Upload to Huggingface

The FIB datasets only has a *test* set. Hence, for our purposes, we generate a train/test split by ourselves.

In [25]:
df_final.columns

Index(['input', 'list_choices', 'target', 'language'], dtype='object')

In [28]:
# Split into train and val, ensuring that the same source doc doesn't appear across train and val
source_grouped = (
    df_final.groupby(['language', 'input'])
            .agg({'target': 'count'})
            .reset_index()
)

source_grouped

Unnamed: 0,language,input,target
0,English,"""I'm 64,"" he said of his fascination with the ...",8
1,English,( cnn ) # upordown ? that 's the trending ques...,14
2,English,( cnn ) '' real housewives of beverly hills ''...,2
3,English,( cnn ) '' success kid '' is likely the intern...,8
4,English,( cnn ) a 32-year-old massachusetts man is fac...,6
...,...,...,...
1187,Malay,"sao paulo, polis brazilian (cnn) telah menangk...",3
1188,Malay,"sao paulo, supermodel brazilian (cnn) brazilia...",2
1189,Malay,sportscotland mengatakan ia belum memutuskan d...,5
1190,Malay,washington (cnn) anggota yang tidak bertugas d...,3


In [29]:
input_train, input_val = train_test_split(source_grouped,
                                          test_size=0.18,
                                          stratify=source_grouped['target'],
                                          random_state=42)

In [30]:
df_train = df_final[df_final['input'].isin(input_train['input'])]
df_test = df_final[df_final['input'].isin(input_val['input'])]

df_train['split'] = 'train'
df_test['split'] = 'test'

print(f'Num. rows in train: {len(df_train)}')
print(f'Num. rows in val: {len(df_test)}')

Num. rows in train: 6695
Num. rows in val: 1467


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['split'] = 'train'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['split'] = 'test'


In [31]:
df_train.rename(columns={'input': 'passage', 'list_choices': 'summary', 'target': 'answer'}, inplace=True)
df_test.rename(columns={'input': 'passage', 'list_choices': 'summary', 'target': 'answer'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.rename(columns={'input': 'passage', 'list_choices': 'summary', 'target': 'answer'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.rename(columns={'input': 'passage', 'list_choices': 'summary', 'target': 'answer'}, inplace=True)


In [34]:
df_train.to_json('../datasets/fib-train.jsonl', orient='records', lines=True)
df_test.to_json('../datasets/fib-test.jsonl', orient='records', lines=True)

In [38]:
for f in ['../datasets/fib-train.jsonl', '../datasets/fib-test.jsonl']:
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f.split("/")[2],
        repo_id="wanadzhar913/fib-malay",
        repo_type="dataset",
    )

fib-train.jsonl:   0%|          | 0.00/11.9M [00:00<?, ?B/s]