In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
from huggingface_hub import HfApi, login

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# set login credentials for Huggingface
login('')

api = HfApi()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/faiq0913/.cache/huggingface/token
Login successful


In [3]:
# load model for translations
tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-base-standard-bahasa-cased')
model = T5ForConditionalGeneration.from_pretrained('mesolitica/t5-base-standard-bahasa-cased').cuda()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
!nvidia-smi

Sun Oct 13 16:34:33 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.52.01              Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   48C    P3              8W /   45W |    1017MiB /   6141MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### 1.0 Load, explore & clean BoolQ dataset

In [5]:
# we'll be taking the validation set only
ds = load_dataset('google/boolq', split='validation')
df = ds.to_pandas()

In [6]:
df.head(3)

Unnamed: 0,question,answer,passage
0,does ethanol take more energy make that produces,False,All biomass goes through at least some of thes...
1,is house tax and property tax are same,True,Property tax or 'house tax' is a local tax on ...
2,is pain experienced in a missing body part or ...,True,Phantom pain sensations are described as perce...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3270 entries, 0 to 3269
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  3270 non-null   object
 1   answer    3270 non-null   bool  
 2   passage   3270 non-null   object
dtypes: bool(1), object(2)
memory usage: 54.4+ KB


In [8]:
df.answer.value_counts(normalize=True)

answer
True     0.621713
False    0.378287
Name: proportion, dtype: float64

In [10]:
df.iloc[0].question

'does ethanol take more energy make that produces'

In [9]:
df.iloc[0].passage

"All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropical clima

Before we proceed any further, it's important we ensure the *passage* column has at most 1024 tokens (due to our model only being able to take a maximum of 1024 tokens at a time). Hence, we create a function below that counts the number of tokens for every row in the *passage* column.

In [49]:
def count_tokens(text: str):
    """Function to calculate the amount of tokens after encoding with the tokenizer."""
    return (len(tokenizer.encode(f'terjemah Inggeris ke Melayu: {text}', return_tensors = 'pt')[0]))

In [50]:
df_ = df.copy()

df_['passage_sequence_length'] = df_['passage'].apply(lambda x: count_tokens(x))
df_['answer'] = df_['answer'].apply(lambda x: 0 if x == False else 1)
df_['language'] = 'English'

df_.head(3)

Unnamed: 0,question,answer,passage,passage_sequence_length,language
0,does ethanol take more energy make that produces,0,All biomass goes through at least some of thes...,280,English
1,is house tax and property tax are same,1,Property tax or 'house tax' is a local tax on ...,299,English
2,is pain experienced in a missing body part or ...,1,Phantom pain sensations are described as perce...,84,English


In [51]:
df_.passage_sequence_length.describe()

count    3270.000000
mean      138.120183
std        77.831390
min        16.000000
25%        86.000000
50%       123.000000
75%       172.000000
max      1114.000000
Name: passage_sequence_length, dtype: float64

In [52]:
# due to our context lenght limitation, we'll be dropping this row.
df_.query('passage_sequence_length > 1024')

Unnamed: 0,question,answer,passage,passage_sequence_length,language
3154,do you get overseas service bars for korea,1,19-28. Overseas service bars a. Authorized wea...,1114,English


In [53]:
df_final = df_.query('passage_sequence_length <= 1024')[['passage', 'question', 'answer', 'language']]

df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   index     3269 non-null   int64 
 1   passage   3269 non-null   object
 2   question  3269 non-null   object
 3   answer    3269 non-null   int64 
 4   language  3269 non-null   object
dtypes: int64(2), object(3)
memory usage: 127.8+ KB


### 2.0 Generate Malay translated rows for dataset

In [54]:
df_final.passage.loc[0]

"All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropical clima

In [36]:
# Let's translate one sample and compare with the English translation above.
input_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {df_final.passage.loc[0]}', return_tensors = 'pt').cuda()
outputs = model.generate(input_ids, max_length = 4096)
all_special_ids = [0, 1, 2]
outputs = [i for i in outputs[0] if i not in all_special_ids]
tokenizer.decode(outputs, spaces_between_special_tokens = False)

'Semua biomas melalui sekurang-kurangnya beberapa langkah ini: ia perlu ditanam, dikumpulkan, dikeringkan, ditapai, disuling, dan dibakar. Semua langkah ini memerlukan sumber dan infrastruktur. Jumlah input tenaga ke dalam proses berbanding tenaga yang dikeluarkan dengan membakar bahan bakar etanol yang dihasilkan dikenali sebagai keseimbangan tenaga (atau "tenaga dikembalikan pada tenaga yang dilaburkan"). Angka yang disusun dalam laporan 2007 oleh National Geographic Magazine menunjukkan hasil sederhana untuk etanol jagung yang dihasilkan di AS: satu unit tenaga bahan bakar fosil diperlukan untuk membuat 1.3 unit tenaga dari etanol yang dihasilkan. Baki tenaga untuk etanol tebu yang dihasilkan di Brazil lebih baik, dengan satu unit tenaga bahan bakar fosil diperlukan untuk menghasilkan 8 dari etanol. Anggaran keseimbangan tenaga tidak dihasilkan dengan mudah, sehingga banyak laporan seperti itu dihasilkan yang bertentangan. Sebagai contoh, tinjauan berasingan melaporkan bahawa pengel

In [56]:
# to take advantage of batch processing on GPU's, we save everything in lists first
question_en = df_final['question'].to_list()
passage_en = df_final['passage'].to_list()
target = df_final['answer'].to_list()

In [57]:
passage_my = []
batch_size = 5

with torch.no_grad():
    
    for i in tqdm(range(0, len(passage_en), batch_size)):
        batch = passage_en[i:i+batch_size]
        encodeds = [f'terjemah Inggeris ke Melayu: {row}' for row in batch]
        
        model_inputs = tokenizer(
            encodeds,
            padding=True,
            truncation=True, # https://huggingface.co/docs/transformers/en/pad_truncation
            max_length=1024, # https://huggingface.co/mesolitica/translation-t5-base-standard-bahasa-cased/blob/main/config.json
            return_tensors="pt",
        ).to('cuda')

        generated_ids = model.generate(**model_inputs, max_length = 4096)
        decoded = tokenizer.batch_decode(
            generated_ids,
            spaces_between_special_tokens=True,
            skip_special_tokens=True
        )

        for output in decoded:
            passage_my.append(output)

  0%|          | 0/654 [00:00<?, ?it/s]

In [58]:
question_my = []
batch_size = 20

with torch.no_grad():
    
    for i in tqdm(range(0, len(question_en), batch_size)):
        batch = question_en[i:i+batch_size]
        encodeds = [f'terjemah Inggeris ke Melayu: {row}' for row in batch]
        
        model_inputs = tokenizer(
            encodeds,
            padding=True,
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        ).to('cuda')

        generated_ids = model.generate(**model_inputs, max_length = 4096)
        decoded = tokenizer.batch_decode(
            generated_ids,
            spaces_between_special_tokens=True,
            skip_special_tokens=True
        )

        for output in decoded:
            question_my.append(output)

  0%|          | 0/164 [00:00<?, ?it/s]

In [59]:
df_malay = pd.DataFrame({
    'passage': passage_my,
    'question': question_my,
    'answer': target,
    'language': 'Malay',
})

df_malay.head(3)

Unnamed: 0,passage,question,answer,language
0,Semua biomas melalui sekurang-kurangnya bebera...,adakah etanol mengambil lebih banyak tenaga ya...,0,Malay
1,Cukai harta tanah atau 'cukai rumah' adalah cu...,adalah cukai rumah dan cukai harta tanah sama,1,Malay
2,Sensasi kesakitan hantu digambarkan sebagai pe...,adalah kesakitan yang dialami di bahagian bada...,1,Malay


In [65]:
df_both = pd.concat([df_malay, df_final[['passage', 'question', 'answer', 'language']]], axis=0)

print(f'Num. rows in final dataset: {len(df_both)}')
df_both.head(3)

Num. rows in final dataset: 6538


Unnamed: 0,passage,question,answer,language
0,Semua biomas melalui sekurang-kurangnya bebera...,adakah etanol mengambil lebih banyak tenaga ya...,0,Malay
1,Cukai harta tanah atau 'cukai rumah' adalah cu...,adalah cukai rumah dan cukai harta tanah sama,1,Malay
2,Sensasi kesakitan hantu digambarkan sebagai pe...,adalah kesakitan yang dialami di bahagian bada...,1,Malay


In [66]:
df_both['split'] = 'validation'

df_both.to_json('../datasets/boolq-validation.jsonl', orient='records', lines=True)

### 3.0 Upload to Huggingface

In [62]:
df_both.columns

Index(['passage', 'question', 'answer', 'language', 'index'], dtype='object')

In [None]:
file_name = '../datasets/boolq-validation.jsonl'

In [67]:
api.upload_file(
    path_or_fileobj=file_name,
    path_in_repo=file_name.split("/")[2],
    repo_id="wanadzhar913/boolq-malay",
    repo_type="dataset",
)

CommitInfo(commit_url='https://huggingface.co/datasets/wanadzhar913/boolq-malay/commit/b5b6b9b58b477273978e8ed2f57304d069432f87', commit_message='Upload boolq-validation.jsonl with huggingface_hub', commit_description='', oid='b5b6b9b58b477273978e8ed2f57304d069432f87', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/wanadzhar913/boolq-malay', endpoint='https://huggingface.co', repo_type='dataset', repo_id='wanadzhar913/boolq-malay'), pr_revision=None, pr_num=None)