In [1]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import json
import jsonlines
import time
import os
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

# Load Dataset

In [2]:
# Load the "commonsense_qa" dataset and display the resulting object,
# which lists the available splits with their features and row counts.
cs_qa_dataset = load_dataset("commonsense_qa")
cs_qa_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

In [3]:
# Convert each split to a pandas DataFrame, then preview the first three
# training rows.
df_train = cs_qa_dataset['train'].to_pandas()
df_validation = cs_qa_dataset['validation'].to_pandas()
df_test = cs_qa_dataset['test'].to_pandas()
df_train.head(3)

Unnamed: 0,id,question,question_concept,choices,answerKey
0,075e483d21c29a511267ef62bedc0461,The sanctions against the school were a punish...,punishing,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",A
1,61fe6e879ff18686d7552425a36344c8,Sammy wanted to go to where the people were. ...,people,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",B
2,4c1cb0e95b99f72d55c068ba0255c54d,To locate a choker not located in a jewelry bo...,choker,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",A


In [4]:
def df_to_dict(df: pd.DataFrame):
    """Index the rows of ``df`` by their ``id`` column.

    Args:
        df: Frame with the columns ``id``, ``question``,
            ``question_concept``, ``choices`` and ``answerKey``.

    Returns:
        dict: Maps every ``id`` value to a dict holding that row's
        ``question``, ``question_concept``, ``choices`` and ``answerKey``.
        When an id occurs more than once, the last row wins.
    """
    kept_columns = ('question', 'question_concept', 'choices', 'answerKey')
    records = {}

    # iterrows() has no length, so the row count is handed to tqdm explicitly.
    for _, record in tqdm(df.iterrows(), total=df.shape[0]):
        row_id = record['id']
        records[row_id] = {column: record[column] for column in kept_columns}

    return records

# Build an id-keyed dict for each split (one progress bar per call).
dict_train = df_to_dict(df_train)
dict_validation = df_to_dict(df_validation)
dict_test = df_to_dict(df_test)

100%|██████████| 9741/9741 [00:00<00:00, 34961.33it/s]
100%|██████████| 1221/1221 [00:00<00:00, 38000.74it/s]
100%|██████████| 1140/1140 [00:00<00:00, 38887.95it/s]


In [5]:
# Inspect the entry stored under the first key of the training dict.
dict_train[list(dict_train.keys())[0]]

{'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
 'question_concept': 'punishing',
 'choices': {'label': array(['A', 'B', 'C', 'D', 'E'], dtype=object),
  'text': array(['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid'],
        dtype=object)},
 'answerKey': 'A'}

# Translation API

In [6]:
# !pip3 install google-cloud-translate

from google.cloud import translate

In [7]:
# !gcloud auth application-default login

# Project id used to build the Translation API parent path; os.getenv
# returns None when the PROJECT_ID environment variable is not set.
PROJECT_ID = os.getenv('PROJECT_ID')

In [8]:
def translate_text(text_list=None):
    """Translate plain-text strings from English (en-US) to Indonesian (id-ID).

    Args:
        text_list: Sequence of strings to translate in a single request.
            When omitted, the sample text ``"Hello, world!"`` is translated.

    Returns:
        list: The translated strings, in the order the API returns them.
        An empty input yields an empty list without contacting the API.
    """
    # A None sentinel replaces the former mutable list default, which would
    # have been shared between calls.
    if text_list is None:
        text_list = ["Hello, world!"]

    # Nothing to translate: skip client creation and the API request.
    if len(text_list) == 0:
        return []

    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{PROJECT_ID}/locations/{location}"

    response = client.translate_text(
        request={
            "parent": parent,
            "contents": text_list,
            "mime_type": "text/plain",
            "source_language_code": "en-US",
            "target_language_code": "id-ID",
        }
    )

    return [translation.translated_text for translation in response.translations]

In [9]:
def translate_text_try(text_list, max_retries=None, delay=5):
    """Call ``translate_text`` and retry whenever it raises.

    Args:
        text_list: Texts forwarded unchanged to ``translate_text``.
        max_retries: Maximum number of retries after the first failed
            attempt. ``None`` (the default) retries forever, which is the
            historical behaviour of this function.
        delay: Seconds to sleep between attempts (default 5).

    Returns:
        Whatever ``translate_text`` returns on the first successful attempt.

    Raises:
        Exception: The last error raised by ``translate_text`` once
            ``max_retries`` retries have been used up.
    """
    failures = 0
    while True:
        try:
            return translate_text(text_list)
        except Exception as e:
            print('Error:', str(e))
            # Give up only when the caller asked for a bounded retry count.
            if max_retries is not None and failures >= max_retries:
                raise
            failures += 1
            time.sleep(delay)

In [10]:
# Smoke test: translate two short sentences, one of which contains a newline.
translate_text_try(text_list=['hello\nI thing you\'re just a so so person ', 'just a test, don\'t be so rude to me'])

['Halo\nMenurutku, kamu adalah orang yang biasa-biasa saja',
 'hanya ujian, jangan bersikap kasar padaku']

# Translation Process

In [11]:
def _build_translated_record(item_id, item, translated):
    """Assemble one output record in the layout of the original commonsense_qa jsonl files.

    Args:
        item_id: Id of the example.
        item: Source entry; its ``answerKey`` and ``choices['label']`` are read.
        translated: Translated texts ordered as
            ``[question, question_concept, choice_1, ..., choice_n]``.

    Returns:
        dict: ``{'answerKey', 'id', 'question': {'question_concept',
        'choices', 'stem'}}``; ``answerKey`` is dropped when it is empty.

    Raises:
        ValueError: If the number of translated choices differs from the
            number of choice labels.
    """
    stem, concept, *choice_texts = translated
    labels = list(item['choices']['label'])
    if len(labels) != len(choice_texts):
        raise ValueError(
            f"id {item_id!r}: got {len(choice_texts)} translated choices "
            f"for {len(labels)} labels"
        )

    record = {
        'answerKey': item['answerKey'],
        'id': item_id,
        'question': {
            'question_concept': concept,
            # Labels come from the source data instead of a fixed A-E list.
            'choices': [
                {'label': str(label), 'text': text}
                for label, text in zip(labels, choice_texts)
            ],
            'stem': stem,
        },
    }
    # Entries without an answer carry an empty answerKey; omit the field.
    if record['answerKey'] == '':
        del record['answerKey']
    return record


def translate_data(dict_source:dict, output_file:str, continue_after_id:str=None):
    """Translate every entry of ``dict_source`` and append it to a jsonl file.

    For each entry the question, the question concept and all choice texts
    are translated in one ``translate_text_try`` call, and the result is
    written as one line of ``output_file`` (opened in append mode).

    Args:
        dict_source: Maps example id to a dict with ``question``,
            ``question_concept``, ``choices`` (holding ``label`` and ``text``
            sequences) and ``answerKey``.
        output_file: Path of the jsonl file to append to.
        continue_after_id: When given, entries up to and including this id
            are skipped, so an interrupted run can be resumed.

    Raises:
        ValueError: If ``continue_after_id`` is given but is not a key of
            ``dict_source`` (previously this silently translated nothing).
    """
    # Fail before opening the output file so a typo in the resume id does not
    # look like a successful, empty run.
    if continue_after_id and continue_after_id not in dict_source:
        raise ValueError(
            f"continue_after_id {continue_after_id!r} is not a key of dict_source"
        )

    skipping = bool(continue_after_id)

    with jsonlines.open(output_file, mode='a') as writer:
        for k, v in tqdm(dict_source.items()):
            if skipping:
                # The resume id itself was already written; start with the next entry.
                if k == continue_after_id:
                    skipping = False
                continue

            translate_input = [
                v['question'],
                v['question_concept']
            ] + list(v['choices']['text'])
            translate_output = translate_text_try(translate_input)

            writer.write(_build_translated_record(k, v, translate_output))

In [12]:
# File names of the translated jsonl files, one per split.
TRAIN = 'commonsense_qa-id_train.jsonl'
VAL = 'commonsense_qa-id_dev.jsonl'
TEST = 'commonsense_qa-id_test_no_answer.jsonl'

In [13]:
# translate_data(dict_train, TRAIN)

In [14]:
# translate_data(dict_validation, VAL)

In [15]:
# translate_data(dict_test, TEST)

# Checking Translation

In [16]:
def read_data(filepath):
    """Load a jsonl file into a dict keyed by each object's ``id``.

    Args:
        filepath: Path of the jsonl file to read.

    Returns:
        dict: Maps ``id`` to the first object read with that id. Every later
        object with an already-seen id is reported on stdout and discarded.
    """
    by_id = {}

    with jsonlines.open(filepath, mode='r') as reader:
        for record in reader:
            record_id = record['id']
            if record_id not in by_id:
                by_id[record_id] = record
                continue
            # Repeated id: keep the earlier object and report the clash.
            print('duplicated', record_id)

    return by_id

# Read the three translated files back in; repeated ids are printed by read_data.
dict_train_translated = read_data(TRAIN)
dict_validation_translated = read_data(VAL)
dict_test_translated = read_data(TEST)

In [17]:
# Ids of the source examples, per split.
dict_train_ids = [*dict_train]
dict_validation_ids = [*dict_validation]
dict_test_ids = [*dict_test]

# Ids found in the translated files, per split.
dict_train_translated_ids = [*dict_train_translated]
dict_validation_translated_ids = [*dict_validation_translated]
dict_test_translated_ids = [*dict_test_translated]

# Each translated split should hold exactly as many ids as its source split.
print('dict_train_translated is same length of data: ', len(dict_train_translated_ids) == len(dict_train_ids))
print('dict_validation_translated is same length of data: ', len(dict_validation_translated_ids) == len(dict_validation_ids))
print('dict_test_translated is same length of data: ', len(dict_test_translated_ids) == len(dict_test_ids))

dict_train_translated is same length of data:  True
dict_validation_translated is same length of data:  True
dict_test_translated is same length of data:  True
