In [1]:
import os
import csv
import ast
import time
import pandas as pd

from datasets import load_dataset
from google.cloud import translate
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### CSV Functions

In [2]:
def load_csv_data(file_path, bool_params):
    """Read a CSV file into a list of row dictionaries.

    The file is decoded as UTF-8, the same encoding ``save_data`` uses when
    writing. Relying on the platform default instead garbles non-ASCII text
    on systems whose default is not UTF-8 (e.g. "é" is read back as "Ã©").

    Args:
        file_path: Path of the CSV file to read. It must have a header row
            and a "choices" column holding a Python literal.
        bool_params: Names of columns whose "true"/"false" text
            (case-insensitive) is converted to ``bool``.

    Returns:
        A list with one dict per CSV row, in file order. "choices" is
        replaced by its parsed literal value; every other column stays a
        string unless it is listed in ``bool_params``.

    Raises:
        TypeError: If a ``bool_params`` column contains anything other than
            "true" or "false".
    """
    data_list = []

    # newline='' is what the csv module expects; the explicit encoding keeps
    # reads independent of the machine's locale.
    with open(file_path, newline='', encoding="utf-8") as csvfile:
        for row in csv.DictReader(csvfile):
            # "choices" is stored as the repr of a Python object; literal_eval
            # parses it without executing arbitrary code.
            row["choices"] = ast.literal_eval(row["choices"])

            for param in bool_params:
                value = row[param].lower()
                if value == "true":
                    row[param] = True
                elif value == "false":
                    row[param] = False
                else:
                    raise TypeError(f"{param} data cannot be recognized")

            data_list.append(row)

    return data_list

def load_all_rephrase_data(split, dir_path, file_name):
    """Load one CSV per split name and return the rows keyed by split.

    Args:
        split: Iterable of split names (each becomes a key of the result).
        dir_path: Directory containing the CSV files.
        file_name: Suffix appended to the split name to form the file name,
            so each file is read from ``{dir_path}/{split name}{file_name}``.

    Returns:
        A dict mapping every split name to the list of rows produced by
        ``load_csv_data``. No columns are converted to booleans (an empty
        ``bool_params`` list is passed).
    """
    return {
        split_name: load_csv_data(f"{dir_path}/{split_name}{file_name}", [])
        for split_name in split
    }

def save_data(samples, file_path):
    """Write a list of dictionaries to a UTF-8 encoded CSV file.

    The column order is taken from the keys of the first dictionary. The
    parent directory of ``file_path`` is created when it does not exist, so
    results from a long run are not lost to a missing output folder.

    Args:
        samples: Non-empty list of dicts, one per output row. The first
            dict's keys define the header.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        IndexError: If ``samples`` is empty (there is no row to take the
            header from).
    """
    # The header comes from the first row.
    header = list(samples[0].keys())

    # Make sure the destination folder exists before opening the file;
    # a bare file name has an empty dirname and needs no directory.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        writer.writeheader()
        writer.writerows(samples)

    print(f'CSV file "{file_path}" has been created with the data.')

### Google Translate Functions

In [11]:
# Lazily created Translation client, shared by every translate_texts call so
# that a new client is not built for each request.
_translation_client = None


def _get_translation_client():
    """Return the shared TranslationServiceClient, creating it on first use."""
    global _translation_client
    if _translation_client is None:
        _translation_client = translate.TranslationServiceClient()
    return _translation_client


def translate_texts(texts, project_id="radiant-math-403602", src_lang="en", tgt_lang="id"):
    """Translate a batch of plain-text strings with Google Cloud Translation.

    Args:
        texts: List of strings to translate in a single request.
        project_id: Google Cloud project the request is issued under.
        src_lang: Language code of the input texts.
        tgt_lang: Language code to translate into.

    Returns:
        A list with the ``translated_text`` of each translation in the
        response. An empty ``texts`` returns ``[]`` without calling the API.
    """
    # Nothing to translate: skip the round trip to the service.
    if not texts:
        return []

    client = _get_translation_client()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    response = client.translate_text(
        request={
            "parent": parent,
            "contents": texts,
            "mime_type": "text/plain",
            "source_language_code": src_lang,
            "target_language_code": tgt_lang,
        }
    )

    return [t.translated_text for t in response.translations]

def translate_data(data, src_lang="en", tgt_lang="id", max_retries=1, retry_wait=60):
    """Translate the text fields of every item in a dataset split.

    For each item, the question, the question concept and all choice texts
    are sent to ``translate_texts`` as one batch. Ids, choice labels and the
    answer key are copied over unchanged.

    Args:
        data: Iterable of dicts with the keys 'id', 'question',
            'question_concept', 'choices' (a dict with 'label' and 'text'
            lists) and 'answerKey'.
        src_lang: Language code of the input texts.
        tgt_lang: Language code to translate into.
        max_retries: How many times a failed request is retried before the
            exception is re-raised. The default of 1 matches the previous
            behavior (one retry).
        retry_wait: Seconds to sleep before each retry.

    Returns:
        A list of dicts with the same structure as the input items, holding
        the translated texts.
    """
    results = []
    for item in tqdm(data):
        choices = item['choices']
        # Order matters: positions 0 and 1 are unpacked below, the rest are
        # the choice texts.
        trans_items = [item['question'], item['question_concept']] + list(choices['text'])

        attempt = 0
        while True:
            try:
                trans_texts = translate_texts(trans_items, src_lang=src_lang, tgt_lang=tgt_lang)
                break
            except Exception as exc:
                # Give up once the retry budget is spent so a permanent
                # failure surfaces instead of looping forever.
                if attempt >= max_retries:
                    raise
                attempt += 1
                # Report the cause; otherwise the log gives no hint of what
                # went wrong.
                print(f'Caught exception ({exc!r}), wait for {retry_wait} s...')
                time.sleep(retry_wait)

        results.append({
            'id': item['id'],
            'question': trans_texts[0],
            'question_concept': trans_texts[1],
            'choices': {
                'label': choices['label'],
                'text': trans_texts[2:]
            },
            'answerKey': item['answerKey']
        })

    return results

### Run Translation

In [15]:
# Names of the dataset splits; each one is read from "<split>.csv" in the
# directories below.
split = ["validation", "test", "train"]
# en_data = load_all_rephrase_data(split, "92123", "_rephrased_name_92123.csv")
# Per-split rows from translated_data/id and translated_data/su
# (presumably the Indonesian and Sundanese translations — confirm).
id_data = load_all_rephrase_data(split, "translated_data/id", ".csv")
su_data = load_all_rephrase_data(split, "translated_data/su", ".csv")

In [16]:
# Spot-check one loaded record from the "su" test split.
su_data["test"][5]

{'id': '6917399ea434e6c484459f895c72ef90',
 'question': 'Jenis sumur naon waé anu tiasa nyababkeun kontrovérsi?',
 'question_concept': 'Sehat',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['panasbumi',
   'cai taneuh',
   'minyak jeung gas',
   'tatanén',
   'artesian']},
 'answerKey': 'C'}

In [17]:
# For every split, translate the loaded data in three directions and save each
# result as "<split>.csv" under ./backtranslation/<direction>/.
# Each translate_data call issues one API request per item, so this cell is
# slow (the recorded run took minutes to hours per call).
for s in split:
    print(f"Translating data split {s}")
    # "id" data -> "en"
    print(f"Backtranslation for ID -> EN")
    id_en = translate_data(id_data[s], src_lang="id", tgt_lang="en")
    save_data(id_en, f"./backtranslation/id_en/{s}.csv")

    # "su" data -> "en"
    print(f"Backtranslation for SU -> EN")
    su_en = translate_data(su_data[s], src_lang="su", tgt_lang="en")
    save_data(su_en, f"./backtranslation/su_en/{s}.csv")

    # "su" data -> "id"
    print(f"Backtranslation for SU -> ID")
    su_id = translate_data(su_data[s], src_lang="su", tgt_lang="id")
    save_data(su_id, f"./backtranslation/su_id/{s}.csv")

Translating data split validation
Backtranslation for ID -> EN


100%|██████████| 274/274 [07:35<00:00,  1.66s/it]


CSV file "./backtranslation/id_en/validation.csv" has been created with the data.
Backtranslation for SU -> EN


100%|██████████| 274/274 [07:24<00:00,  1.62s/it]


CSV file "./backtranslation/su_en/validation.csv" has been created with the data.
Backtranslation for SU -> ID


100%|██████████| 274/274 [07:39<00:00,  1.68s/it]


CSV file "./backtranslation/su_id/validation.csv" has been created with the data.
Translating data split test
Backtranslation for ID -> EN


100%|██████████| 236/236 [06:23<00:00,  1.62s/it]


CSV file "./backtranslation/id_en/test.csv" has been created with the data.
Backtranslation for SU -> EN


100%|██████████| 236/236 [06:26<00:00,  1.64s/it]


CSV file "./backtranslation/su_en/test.csv" has been created with the data.
Backtranslation for SU -> ID


100%|██████████| 236/236 [06:27<00:00,  1.64s/it]


CSV file "./backtranslation/su_id/test.csv" has been created with the data.
Translating data split train
Backtranslation for ID -> EN


100%|██████████| 2162/2162 [1:50:34<00:00,  3.07s/it]    


CSV file "./backtranslation/id_en/train.csv" has been created with the data.
Backtranslation for SU -> EN


 91%|█████████ | 1958/2162 [50:57<05:13,  1.54s/it]

Caught exception, wait for 1 min...


100%|██████████| 2162/2162 [57:34<00:00,  1.60s/it]  


CSV file "./backtranslation/su_en/train.csv" has been created with the data.
Backtranslation for SU -> ID


100%|██████████| 2162/2162 [57:50<00:00,  1.61s/it]

CSV file "./backtranslation/su_id/train.csv" has been created with the data.



