In [32]:
import os
import csv
import ast
import time
import pandas as pd

from datasets import load_dataset
from google.cloud import translate
from tqdm import tqdm

### CSV Functions

In [41]:
def load_csv_data(file_path, bool_params):
    """Load a CSV file into a list of row dictionaries.

    Each row's ``"choices"`` cell is parsed from its Python-literal text with
    ``ast.literal_eval``, and every column named in ``bool_params`` is
    converted from the text "true"/"false" (case-insensitive) to a ``bool``.
    All other cells are kept as the strings produced by ``csv.DictReader``.

    Args:
        file_path: Path of the CSV file to read; its first line is the header.
        bool_params: Iterable of column names whose cells hold boolean text.

    Returns:
        A list with one dictionary per data row, in file order.

    Raises:
        TypeError: If a cell of a ``bool_params`` column is neither "true"
            nor "false".
        KeyError: If a row has no ``"choices"`` column or lacks a column
            named in ``bool_params``.
        ValueError, SyntaxError: If a ``"choices"`` cell is not a valid
            Python literal (raised by ``ast.literal_eval``).
    """
    bool_literals = {"true": True, "false": False}
    data_list = []

    # Read as UTF-8 explicitly: save_data writes its files as UTF-8, and the
    # platform default encoding may garble or reject non-ASCII text.
    with open(file_path, newline='', encoding="utf-8") as csvfile:
        for row in csv.DictReader(csvfile):
            # "choices" is stored as the text form of a Python literal.
            row["choices"] = ast.literal_eval(row["choices"])

            for param in bool_params:
                parsed = bool_literals.get(row[param].lower())
                if parsed is None:
                    raise TypeError(f"{param} data cannot be recognized")
                row[param] = parsed

            data_list.append(row)

    return data_list

def load_all_rephrase_data(split, dir_path, file_name):
    """Load one CSV file per split name and collect them in a dictionary.

    For every name in ``split`` the file ``{dir_path}/{name}{file_name}`` is
    read with ``load_csv_data``; no columns are converted to booleans.

    Args:
        split: Iterable of split names, used both as file-name prefixes and
            as keys of the returned dictionary.
        dir_path: Directory that holds the CSV files.
        file_name: Suffix appended to each split name to form the file name.

    Returns:
        A dictionary mapping each split name to its list of loaded rows.
    """
    # A previous variant passed ["concept", "name", "option"] as the boolean
    # columns; the current files are loaded without any boolean conversion.
    return {
        split_name: load_csv_data(f"{dir_path}/{split_name}{file_name}", [])
        for split_name in split
    }

def save_data(samples, file_path):
    """Write a list of row dictionaries to a UTF-8 encoded CSV file.

    The header is taken from the keys of the first row. The destination
    folder is created when it does not exist yet. An empty ``samples`` list
    produces an empty file instead of failing on the missing first row.

    Args:
        samples: List of dictionaries, one per CSV row. Every row must only
            use keys that appear in the first row.
        file_path: Path of the CSV file to create or overwrite.

    Raises:
        ValueError: If a later row has a key that the first row lacks
            (raised by ``csv.DictWriter``).
    """
    # Create the destination folder up front so the output of a long run is
    # not lost to a missing directory at save time.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', newline='', encoding="utf-8") as csvfile:
        # Without a first row there is no header to derive, so an empty
        # input leaves the file empty.
        if samples:
            writer = csv.DictWriter(csvfile, fieldnames=list(samples[0].keys()))
            writer.writeheader()
            writer.writerows(samples)

    if samples:
        print(f'CSV file "{file_path}" has been created with the data.')
    else:
        print(f'CSV file "{file_path}" has been created empty: no samples were given.')

### Google Translate Functions

In [34]:
_TRANSLATE_CLIENT = None  # shared TranslationServiceClient, created on first use


def _get_translate_client():
    """Return the shared translation client, creating it on the first call."""
    global _TRANSLATE_CLIENT
    if _TRANSLATE_CLIENT is None:
        _TRANSLATE_CLIENT = translate.TranslationServiceClient()
    return _TRANSLATE_CLIENT


def translate_texts(texts, project_id="radiant-math-403602", src_lang="en", tgt_lang="id", client=None):
    """Translate a batch of plain-text strings in one ``translate_text`` request.

    Args:
        texts: Sequence of strings to translate.
        project_id: Google Cloud project used to build the request parent.
        src_lang: Language code of the input texts.
        tgt_lang: Language code to translate into.
        client: Optional object exposing ``translate_text(request=...)``.
            When omitted, one ``TranslationServiceClient`` is created lazily
            and reused by later calls instead of being rebuilt every time.

    Returns:
        A list with the ``translated_text`` of every entry in the response's
        ``translations``; an empty list when ``texts`` is empty.
    """
    # Nothing to translate: skip the request entirely.
    if not texts:
        return []

    if client is None:
        client = _get_translate_client()

    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    response = client.translate_text(
        request={
            "parent": parent,
            "contents": texts,
            "mime_type": "text/plain",
            "source_language_code": src_lang,
            "target_language_code": tgt_lang,
        }
    )

    return [t.translated_text for t in response.translations]

### Run Translation

In [35]:
# Names of the dataset splits to process, in processing order.
split = ["validation", "test", "train"]

# Load every split from the translated_data/id folder (one "<split>.csv" each).
# Earlier variant, kept for reference:
# v3_data = load_all_rephrase_data(split, "92123", "_rephrased_name_92123.csv")
v3_data = load_all_rephrase_data(
    split=split,
    dir_path="translated_data/id",
    file_name=".csv",
)

In [37]:
# Spot-check one loaded record from the "test" split (displayed as the cell output).
v3_data["test"][5]

{'id': '6917399ea434e6c484459f895c72ef90',
 'question': 'Jenis sumur apa yang mungkin menimbulkan kontroversi?',
 'question_concept': 'Sehat',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['panas bumi',
   'air tanah',
   'minyak dan gas',
   'pertanian',
   'artesis']},
 'answerKey': 'C'}

In [38]:
src_lang = "id"
tgt_lang = "su"

# Retry policy for failed translation requests.
MAX_ATTEMPTS = 5
RETRY_WAIT_SECONDS = 60

results = {}
for s in split:
    print(f"Translating data split {s}")
    results[s] = []
    for item in tqdm(v3_data[s]):
        # One request per record: question, concept, then every answer
        # choice, so the response can be split back by position.
        trans_items = [item['question'], item['question_concept']] + item['choices']['text']

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                trans_texts = translate_texts(trans_items, src_lang=src_lang, tgt_lang=tgt_lang)
                break
            except Exception as exc:
                # After the last attempt, re-raise so the real error is
                # visible instead of being hidden behind another retry.
                if attempt == MAX_ATTEMPTS:
                    raise
                print(f'Caught exception ({exc!r}), wait for {RETRY_WAIT_SECONDS} s '
                      f'before retry {attempt}/{MAX_ATTEMPTS - 1}...')
                time.sleep(RETRY_WAIT_SECONDS)

        # The positional slicing below is only valid when the response has
        # exactly one translation per submitted text.
        if len(trans_texts) != len(trans_items):
            raise ValueError(
                f"Expected {len(trans_items)} translations for item {item['id']}, "
                f"got {len(trans_texts)}"
            )

        results[s].append({
            'id': item['id'],
            'question': trans_texts[0],
            'question_concept': trans_texts[1],
            'choices': {
                'label': item['choices']['label'],
                'text': trans_texts[2:]
            },
            'answerKey': item['answerKey']
        })

Translating data split validation


100%|██████████| 274/274 [08:08<00:00,  1.78s/it]


Translating data split test


100%|██████████| 236/236 [07:01<00:00,  1.79s/it]


Translating data split train


 13%|█▎        | 280/2162 [08:20<53:47,  1.71s/it]  

Caught exception, wait for 1 min...


100%|██████████| 2162/2162 [1:08:08<00:00,  1.89s/it]


In [39]:
# Show the same "test" record before and after translation, one per line.
print(v3_data["test"][5], results["test"][5], sep="\n")

{'id': '6917399ea434e6c484459f895c72ef90', 'question': 'Jenis sumur apa yang mungkin menimbulkan kontroversi?', 'question_concept': 'Sehat', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['panas bumi', 'air tanah', 'minyak dan gas', 'pertanian', 'artesis']}, 'answerKey': 'C'}
{'id': '6917399ea434e6c484459f895c72ef90', 'question': 'Jenis sumur naon waé anu tiasa nyababkeun kontrovérsi?', 'question_concept': 'Sehat', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['panasbumi', 'cai taneuh', 'minyak jeung gas', 'tatanén', 'artesian']}, 'answerKey': 'C'}


In [42]:
# Derive the output folder from tgt_lang so the files always land under the
# language that was actually translated into, and make sure it exists.
output_dir = f"./translated_data/{tgt_lang}"
os.makedirs(output_dir, exist_ok=True)

for s in split:
    save_data(results[s], f"{output_dir}/{s}.csv")

CSV file "./translated_data/su/validation.csv" has been created with the data.
CSV file "./translated_data/su/test.csv" has been created with the data.
CSV file "./translated_data/su/train.csv" has been created with the data.
