In [2]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import json
import jsonlines
import time
import os
from dotenv import load_dotenv
load_dotenv()

True

# Load Dataset

In [3]:
cs_qa_dataset = load_dataset("commonsense_qa")
cs_qa_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

In [4]:
df_train = cs_qa_dataset['train'].to_pandas()
df_validation = cs_qa_dataset['validation'].to_pandas()
df_test = cs_qa_dataset['test'].to_pandas()
df_train.head(3)

Unnamed: 0,id,question,question_concept,choices,answerKey
0,075e483d21c29a511267ef62bedc0461,The sanctions against the school were a punish...,punishing,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",A
1,61fe6e879ff18686d7552425a36344c8,Sammy wanted to go to where the people were. ...,people,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",B
2,4c1cb0e95b99f72d55c068ba0255c54d,To locate a choker not located in a jewelry bo...,choker,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",A


In [5]:
def df_to_dict(df: pd.DataFrame):
    df_dict = {}

    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        id = row['id']
        q = row['question']
        q_c = row['question_concept']
        mc = row['choices']
        ans_key = row['answerKey']

        df_dict[id] = {
            'question': q,
            'question_concept': q_c,
            'choices': mc,
            'answerKey': ans_key,
        }
    return df_dict

dict_train = df_to_dict(df_train)
dict_validation = df_to_dict(df_validation)
dict_test = df_to_dict(df_test)

100%|██████████| 9741/9741 [00:00<00:00, 37469.54it/s]
100%|██████████| 1221/1221 [00:00<00:00, 37355.45it/s]
100%|██████████| 1140/1140 [00:00<00:00, 36007.49it/s]


In [6]:
dict_train[list(dict_train.keys())[0]]

{'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
 'question_concept': 'punishing',
 'choices': {'label': array(['A', 'B', 'C', 'D', 'E'], dtype=object),
  'text': array(['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid'],
        dtype=object)},
 'answerKey': 'A'}

# Translation API

In [7]:
# !pip3 install google-cloud-translate

from google.cloud import translate

In [8]:
# !gcloud auth application-default login

PROJECT_ID = os.getenv('PROJECT_ID')

In [9]:
def translate_text(text_list=["Hello, world!"]):

    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{PROJECT_ID}/locations/{location}"


    response = client.translate_text(
        request={
            "parent": parent,
            "contents": text_list,
            "mime_type": "text/plain",
            "source_language_code": "en-US",
            "target_language_code": "id-ID",
        }
    )

    translated_text = []
    for translation in response.translations:
        translated_text.append("{}".format(translation.translated_text))
    return translated_text

In [10]:
translate_text(text_list=['hello\nI thing you\'re just a so so person ', 'just a test, don\'t be so rude to me'])

['Halo\nMenurutku, kamu adalah orang yang biasa-biasa saja',
 'hanya ujian, jangan bersikap kasar padaku']

# Translate Process

In [46]:
with jsonlines.open('commonsense_qa-id.jsonl', mode='a') as writer:
    for k, v in tqdm(dict_train.items()):
        translate_input = [
            v['question'],
            v['question_concept']
        ] + list(v['choices']['text'])
        translate_output = translate_text(translate_input)

        print(translate_input)
        print(translate_output)

        translated_dict = {
            'id': k,
            'question': translate_output[0],
            'question_concept': translate_output[1],
            'choices': {
                'label': list(v['choices']['label']),
                'text': translate_output[2:]
            },
            'answerKey': v['answerKey'],
        }
        writer.write(translated_dict)
        break


  0%|          | 0/9741 [00:00<?, ?it/s]

{'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?', 'question_concept': 'punishing', 'choices': {'label': array(['A', 'B', 'C', 'D', 'E'], dtype=object), 'text': array(['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid'],
      dtype=object)}, 'answerKey': 'A'}


  0%|          | 0/9741 [00:01<?, ?it/s]

['The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?', 'punishing', 'ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']
['Sanksi terhadap sekolah tersebut merupakan pukulan telak, dan apa upaya yang telah dilakukan sekolah untuk mengubahnya?', 'menghukum', 'mengabaikan', 'melaksanakan', 'otoriter', 'berteriak', 'menghindari']





In [36]:
with jsonlines.open('commonsense_qa-id.jsonl') as reader:
    for obj in reader:
        print(obj)

{'test_a': 0}
{'test_a': 1}
{'test_a': 2}
{'test_a': 3}
{'test_a': 4}
{'test_a': 5}
{'test_a': 0}
{'test_a': 1}
{'test_a': 2}
{'test_a': 3}
{'test_a': 4}
