In [1]:
%load_ext dotenv
%dotenv

In [2]:
import json
import os
import pygsheets
import pandas as pd
import openai
from pathlib import Path
from tqdm import tqdm

In [3]:
# Authorize the pygsheets client; every spreadsheet below is opened through it
# via `client.open_by_key(...)`.
client = pygsheets.authorize()

In [4]:
# Worker IDs. They index the spreadsheet-key maps and are also used as
# worksheet titles when the task-2 spreadsheets are read.
workers_ids = ['W1', 'W2', 'W3', 'W4', 'W5', 'W6']
# Task-1 spreadsheet keys, indexed by language code and then by worker ID.
# The 'Data' worksheet of each spreadsheet is the one read later on.
# NOTE(review): 'ind' / 'sun' presumably stand for Indonesian / Sundanese — confirm.
id_to_key_task1 = {
    'ind': {
        'W1': '14dAiIsBLxBUHzXgFDznMjSXCJmFY6qnIWOADqITWx9E',
        'W2': '1jHWRjwkahxpA5T_BWBRaMQ4q9kyg1l0gGz42pC6-dxI',
        'W3': '1C-Yyc17WFWScC5eu5WttQc8P8vCFuLvh8MqlBKiwF8c',
        'W4': '1J0xqeC05H1RVPSERi8fCXWUTGJ4iH3SI0w59BoFh_KU',
        'W5': '1T9jbfP1iapNLS94QZMiYw5K6LrZoP01Qf1jvTWpvdkU',
        'W6': '1G0chmXUmWC-lrZLg9ToQTGCdQmnbP0DA204bIKJn9yc',
    },
    'sun': {
        'W1': '1R-bT3RMu41cx-fnac3mNdb3OT_kb6vof1eMdCbU8_jM',
        'W2': '1ugtKZhO4jLtxW_yEwrrPXZtry2pLaP-6QgiIHKldl5E',
        'W3': '1OzP1XwzU3c-rNXyqxU3JKrrXHeXzSu3D6FIcfym4fHE',
        'W4': '1dNwW9dL4YPBEypUHQ6_-Dse8XL851W0WUiCBb28hRBQ',   
        'W5': '16wXGP08nESdLg4am4wfngeMNbK81IJAo3p0udg_iVlU',
        'W6': '1ZQEBOPGAlc6f2IUwavEKpPHY7dQxV4lN5iSQm1WHnX4',
    }
}
# Task-2 spreadsheet keys, indexed by language code and then by worker ID.
# Each spreadsheet is later read one worksheet at a time, with the worksheet
# title being another worker's ID.
id_to_key_task2 = {
    'ind': {
        'W1': '1Wa33qUjeB0pq1QI87jUqHTXlQ88ADCxAFp77vFAhSVU',
        'W2': '1FbYeTu3ZK4vBLoPVpJqCFB8Rj6yzazYRpxRtmPXXf-k',
        'W3': '1UysOeI1QnU8sNXqwQ7FGnULvKCjt8Tm6eoFEdYeQ0xY',
        'W4': '1lKIAWvs9D8JoyNrZJzB6pU4KVtfWstH3tOEfM9dZMDc',
        'W5': '1nQsQPAoxeRr9QzybjrkpTn5-IuBBCByMNN8txkUVV4w',
        'W6': '1SCV9OBxvHwxQ31t6GFirO6et-raMbMpJdfLjn1_HsgU',
    },
    'sun': {
        'W1': '1-FLTsgge53Wgb3HmIlM-oCOWow4_kLEJ0bey8MaMxVI',
        'W2': '1-WLiRHFXlD5BawHdBkI2kLSlLttBXU4ufnTwLB8pufM',
        'W3': '1EVv6ktg-6ZC5e9UBFvPIVrZI0T2WAbx3OHau_T-OgSE',
        'W4': '1XyXhn_R3VuNcsmHCmCEwojFM1hUPGhQe6FDrfo3ALds',
        'W5': '1YsreB2g0AeDbiFOIu2JOUmbAkwugBRaaf8SMUA2mrg0',
        'W6': '1oU4K52UaKJT3EEvDdOs8o4tD97Wxl4SKSFQK193w2bQ',
    }
}

#### Processing data (into CommonsenseQA format)

In [5]:
answers_letters = ['A', 'B', 'C', 'D', 'E']
# Maps the category labels found in the sheets to the English names stored in the dataset.
cat_id_to_en = {
    'Kuliner': 'culinary',
    'Tempat': 'place',
    'Budaya': 'culture',
    'Sejarah': 'history',
    'Aktivitas': 'activity',
}


def _group_rows(values, size=5):
    """Split `values` into consecutive lists of `size` items.

    A trailing run shorter than `size` is dropped, so every returned group
    has exactly `size` entries.
    """
    full_len = len(values) - len(values) % size
    return [values[start:start + size] for start in range(0, full_len, size)]


# Build data_by_lang[lang][worker_id] -> list of question records from the
# 'Data' worksheet of each worker's task-1 spreadsheet.
data_by_lang = {}
for lang in ['ind', 'sun']:
    # In the 'sun' sheets the concept/question/options/gold columns sit one
    # position to the right of the 'ind' layout (concept is read from column 4
    # instead of 3, question from 5 instead of 4, and so on).
    col_shift = 0 if lang == 'ind' else 1

    data_by_id = {}
    for ref_id in tqdm(workers_ids, desc=f'Processing {lang}'):
        sh_task1 = client.open_by_key(id_to_key_task1[lang][ref_id])
        wks_task1 = sh_task1.worksheet('title', 'Data')

        # `[1:]` drops the first row of each column (presumably the header — confirm).
        # These three columns are kept only where non-empty: one value per question.
        categories = [c for c in wks_task1.get_col(2)[1:] if c != '']
        question_concepts = [c for c in wks_task1.get_col(3 + col_shift)[1:] if c != '']
        questions = [q for q in wks_task1.get_col(4 + col_shift)[1:] if q != '']

        # Options and gold flags occupy one row per option, five rows per question.
        options_group = _group_rows(wks_task1.get_col(5 + col_shift)[1:])
        gold_group = _group_rows(wks_task1.get_col(6 + col_shift)[1:])

        # Fail with context instead of a bare IndexError further down.
        shortest = min(len(categories), len(question_concepts),
                       len(options_group), len(gold_group))
        if shortest < len(questions):
            raise ValueError(
                f"{lang}/{ref_id}: found {len(questions)} questions but only "
                f"{len(categories)} categories, {len(question_concepts)} concepts, "
                f"{len(options_group)} option groups and {len(gold_group)} gold groups"
            )

        data = []
        for i, question in enumerate(questions):
            if 'TRUE' not in gold_group[i]:
                raise ValueError(
                    f"{lang}/{ref_id}: question {i} has no option marked 'TRUE'"
                )
            data.append({
                'category': cat_id_to_en[categories[i]],
                'question_concepts': question_concepts[i],
                'question': question,
                'choices': {
                    'label': answers_letters,
                    # Drop the first three characters of each option (e.g. an "A. " prefix).
                    'text': [o[3:] for o in options_group[i]]
                },
                'answer_creator': answers_letters[gold_group[i].index('TRUE')],
                # Filled in later, keyed by the ID of the worker who annotated the item.
                'answers': {},
                'answers_uncertainty': {},
                'question_ambiguity': {},
                'option_ambiguity': {},
                'reason': {}
            })

        data_by_id[ref_id] = data

    data_by_lang[lang] = data_by_id

Processing ind: 100%|██████████| 6/6 [00:16<00:00,  2.80s/it]
Processing sun: 100%|██████████| 6/6 [02:01<00:00, 20.30s/it]


In [6]:
# Spot-check the first task-1 record built from W1's 'ind' sheet.
data_by_lang['ind']['W1'][0]

{'category': 'culinary',
 'question_concepts': 'adab makan',
 'question': 'Apakah adab makan utama masyarakat Indonesia?',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['Tidak berbicara saat makan',
   'Menghabiskan makanan',
   'Makan menggunakan tangan kanan',
   'Makan sambil duduk',
   'Tidak mengecap saat makan']},
 'answer_creator': 'C',
 'answers': {},
 'answers_uncertainty': {},
 'question_ambiguity': {},
 'option_ambiguity': {},
 'reason': {}}

In [7]:
# Spot-check the first task-1 record built from W1's 'sun' sheet.
data_by_lang['sun']['W1'][0]

{'category': 'culinary',
 'question_concepts': 'adab dahar',
 'question': 'Dihandap ieu anu termasuk kana adab dahar anu umum di daerah Sunda nyaeta?',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['Nganggo panangan kenca',
   'Nyandak kaemaman anu tebih',
   'Dahar bari nyarios',
   'Nganggo panangan katuhu',
   'Dahar bari diuk']},
 'answer_creator': 'D',
 'answers': {},
 'answers_uncertainty': {},
 'question_ambiguity': {},
 'option_ambiguity': {},
 'reason': {}}

In [8]:
# Merge the task-2 annotations into data_by_lang. For every worker `ref_id`,
# open their task-2 spreadsheet and read the worksheet titled with each other
# worker `pred_id`; the values are stored on `pred_id`'s records under `ref_id`.
for lang in ['ind', 'sun']:
    print(f'Processing {lang}')
    # Every column read below is one position further right in the 'sun' sheets.
    shift = 0 if lang == 'ind' else 1
    for ref_id in workers_ids:
        sh_task2 = client.open_by_key(id_to_key_task2[lang][ref_id])
        for pred_id in tqdm(workers_ids, desc=f'Processing {ref_id}'):
            if pred_id == ref_id:
                continue
            wks_task2 = sh_task2.worksheet('title', pred_id)

            # One non-empty value per question.
            q_ambiguity = [v for v in wks_task2.get_col(8 + shift)[1:] if v != '']
            uncertainty = [v for v in wks_task2.get_col(10 + shift)[1:] if v != '']

            # The remaining columns have five rows per question; only complete
            # groups of five are used.
            reasons = wks_task2.get_col(11 + shift)[1:]
            reason_single = [
                reasons[start]  # the first row of each group
                for start in range(0, len(reasons) - len(reasons) % 5, 5)
            ]

            answers = wks_task2.get_col(6 + shift)[1:]
            answers_single = [
                answers_letters[answers[start:start + 5].index('TRUE')]
                for start in range(0, len(answers) - len(answers) % 5, 5)
            ]

            options = wks_task2.get_col(9 + shift)[1:]
            options_group = [
                options[start:start + 5]
                for start in range(0, len(options) - len(options) % 5, 5)
            ]

            for i in range(len(q_ambiguity)):
                record = data_by_lang[lang][pred_id][i]
                record['answers'][ref_id] = answers_single[i]
                record['answers_uncertainty'][ref_id] = uncertainty[i]
                record['question_ambiguity'][ref_id] = q_ambiguity[i]
                record['option_ambiguity'][ref_id] = options_group[i]
                record['reason'][ref_id] = reason_single[i]

Processing ind


Processing W1: 100%|██████████| 6/6 [00:09<00:00,  1.61s/it]
Processing W2: 100%|██████████| 6/6 [02:00<00:00, 20.10s/it]
Processing W3: 100%|██████████| 6/6 [00:08<00:00,  1.48s/it]
Processing W4: 100%|██████████| 6/6 [00:20<00:00,  3.36s/it]
Processing W5: 100%|██████████| 6/6 [00:10<00:00,  1.70s/it]
Processing W6: 100%|██████████| 6/6 [00:10<00:00,  1.76s/it]


Processing sun


Processing W1: 100%|██████████| 6/6 [01:59<00:00, 19.88s/it]
Processing W2: 100%|██████████| 6/6 [00:09<00:00,  1.61s/it]
Processing W3: 100%|██████████| 6/6 [00:16<00:00,  2.82s/it]
Processing W4: 100%|██████████| 6/6 [00:09<00:00,  1.54s/it]
Processing W5: 100%|██████████| 6/6 [00:14<00:00,  2.47s/it]
Processing W6: 100%|██████████| 6/6 [01:56<00:00, 19.45s/it]


In [9]:
# Inspect an 'ind' record after the task-2 annotations have been merged in.
data_by_lang['ind']['W1'][2]

{'category': 'culinary',
 'question_concepts': 'aduk',
 'question': 'Apa yang biasanya digunakan untuk mengaduk kopi?',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['Sedotan', 'Bungkus kopi', 'Garpu', 'Sumpit', 'Sendok']},
 'answer_creator': 'C',
 'answers': {'W2': 'E', 'W3': 'E', 'W4': 'E', 'W5': 'A', 'W6': 'E'},
 'answers_uncertainty': {'W2': 'certain',
  'W3': 'certain',
  'W4': 'certain',
  'W5': 'certain',
  'W6': 'certain'},
 'question_ambiguity': {'W2': 'clear',
  'W3': 'clear',
  'W4': 'clear',
  'W5': 'clear',
  'W6': 'clear'},
 'option_ambiguity': {'W2': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W3': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W4': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W5': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W6': ['clear', 'clear', 'clear', 'clear', 'clear']},
 'reason': {'W2': '', 'W3': '', 'W4': '', 'W5': '', 'W6': ''}}

In [10]:
# Inspect a 'sun' record after the task-2 annotations have been merged in.
data_by_lang['sun']['W1'][0]

{'category': 'culinary',
 'question_concepts': 'adab dahar',
 'question': 'Dihandap ieu anu termasuk kana adab dahar anu umum di daerah Sunda nyaeta?',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['Nganggo panangan kenca',
   'Nyandak kaemaman anu tebih',
   'Dahar bari nyarios',
   'Nganggo panangan katuhu',
   'Dahar bari diuk']},
 'answer_creator': 'D',
 'answers': {'W2': 'D', 'W3': 'D', 'W4': 'D', 'W5': 'D', 'W6': 'D'},
 'answers_uncertainty': {'W2': 'certain',
  'W3': 'certain',
  'W4': 'certain',
  'W5': 'certain',
  'W6': 'certain'},
 'question_ambiguity': {'W2': 'clear',
  'W3': 'clear',
  'W4': 'clear',
  'W5': 'clear',
  'W6': 'clear'},
 'option_ambiguity': {'W2': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W3': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W4': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W5': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W6': ['clear', 'clear', 'clear', 'clear', 'clear']},
 'reason': {'W2': '', 'W3': '', 'W

In [11]:
# Write the merged records to disk, one JSON file per language.
for lang in ['ind', 'sun']:
    raw_path = f'../dataset/human/raw_{lang}.json'
    with open(raw_path, 'w', encoding='utf-8') as fp:
        json.dump(data_by_lang[lang], fp)

#### Processing question prompt

In [12]:
answers_letters = ['A', 'B', 'C', 'D', 'E']
prompts_by_lang, data_by_lang = {}, {}
mode = 'raw'
if mode == 'raw':
    for lang in ['ind', 'sun']:
        # Reload the records from the JSON dump rather than reusing the in-memory copy.
        with open(f'../dataset/human/raw_{lang}.json', encoding='utf-8') as fp:
            data_by_lang[lang] = json.load(fp)

        prompts_by_id = {}
        for ref_id in tqdm(workers_ids, desc='Processing question prompt'):
            # Prompt text = the question, then one "<label>. <text>" line per choice.
            prompts_by_id[ref_id] = [
                {
                    'question_prompt': '\n'.join(
                        [item['question']]
                        + [
                            f"{label}. {text}"
                            for label, text in zip(item['choices']['label'], item['choices']['text'])
                        ]
                    ),
                    'answer': item['answer_creator']
                }
                for item in data_by_lang[lang][ref_id]
            ]

        prompts_by_lang[lang] = prompts_by_id

Processing question prompt: 100%|██████████| 6/6 [00:00<00:00, 986.47it/s]
Processing question prompt: 100%|██████████| 6/6 [00:00<00:00, 782.28it/s]


In [13]:
# Check the first 'ind' record as reloaded from the JSON dump.
data_by_lang['ind']['W1'][0]

{'category': 'culinary',
 'question_concepts': 'adab makan',
 'question': 'Apakah adab makan utama masyarakat Indonesia?',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['Tidak berbicara saat makan',
   'Menghabiskan makanan',
   'Makan menggunakan tangan kanan',
   'Makan sambil duduk',
   'Tidak mengecap saat makan']},
 'answer_creator': 'C',
 'answers': {'W2': 'C', 'W3': 'C', 'W4': 'C', 'W5': 'C', 'W6': 'C'},
 'answers_uncertainty': {'W2': 'certain',
  'W3': 'certain',
  'W4': 'uncertain',
  'W5': 'uncertain',
  'W6': 'certain'},
 'question_ambiguity': {'W2': 'clear',
  'W3': 'clear',
  'W4': 'clear',
  'W5': 'clear',
  'W6': 'clear'},
 'option_ambiguity': {'W2': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W3': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W4': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W5': ['clear', 'clear', 'clear', 'clear', 'clear'],
  'W6': ['clear', 'clear', 'clear', 'clear', 'clear']},
 'reason': {'W2': '',
  'W3': '',
  'W4': '[

In [14]:
# Preview the prompt and creator answer built for the first W1 'ind' item.
prompts_by_lang['ind']['W1'][0]

{'question_prompt': 'Apakah adab makan utama masyarakat Indonesia?\nA. Tidak berbicara saat makan\nB. Menghabiskan makanan\nC. Makan menggunakan tangan kanan\nD. Makan sambil duduk\nE. Tidak mengecap saat makan',
 'answer': 'C'}

#### Get ChatGPT Answers

In [None]:
# OpenAI credentials are read from environment variables.
openai.api_key = os.environ['OPENAI_API_KEY']
openai.organization = os.environ['OPENAI_UILAB_KEY']

model_name = "gpt-4-1106-preview"

# Prompt -> response cache, kept in a CSV next to the notebook so already
# answered prompts are not sent again.
resp_history_filename = model_name + "_history_231107.csv"
resp_history_path = Path(resp_history_filename)
if not resp_history_path.is_file():
    print("Response history not found. Initializing new one...")
    response_history = {}
else:
    print("Response history found!")
    resp_history_df = pd.read_csv(resp_history_filename)
    response_history = dict(zip(resp_history_df['prompt'], resp_history_df['response']))

In [None]:
def get_openai_chat_completion(input_prompt, model_name, resp_num=1, temp=0.1):
    """Send `input_prompt` as a single user message to the chat completion API.

    Args:
        input_prompt: Text placed in the one 'user' message.
        model_name: Passed through as `model`.
        resp_num: Passed through as `n`.
        temp: Passed through as `temperature`.

    Returns:
        Whatever `openai.ChatCompletion.create` returns, unmodified.
    """
    user_message = {'role': 'user', 'content': input_prompt}
    request_kwargs = {
        'model': model_name,
        'messages': [user_message],
        'temperature': temp,
        'n': resp_num,
    }
    return openai.ChatCompletion.create(**request_kwargs)

In [None]:
def get_input_prompt(question_prompt):
    """Append the fixed answering instruction to a question prompt.

    The result is the question prompt, a blank line, then the instruction
    asking for a single "Answer:"-prefixed option letter. The instruction text
    must stay exactly as is: the full prompt is the key of the response cache.
    """
    instruction = (
        "Give only one answer that most likely to be the correct answer "
        "with a prefix that says \"Answer:\" follows by the option letter. "
        "For example:\nAnswer: Z"
    )
    return "{}\n\n{}".format(question_prompt, instruction)

In [None]:
def _extract_option_letter(raw_answer):
    """Reduce a raw model reply to the bare option letter where possible.

    Upper-cases the reply, removes an "ANSWER:" prefix (with or without a
    following space), keeps only the part before the first ". " (so
    "C. Some text" becomes "C") and drops a trailing period. Replies that do
    not follow the requested format come back longer than one character.
    """
    cleaned = raw_answer.strip().upper().replace('ANSWER:', '').strip()
    if '. ' in cleaned:
        cleaned = cleaned.split('. ')[0]
    return cleaned.rstrip('.').strip()


def get_openai_answer(input_prompt, model_name, resp_num=1):
    """Return the model's option letter for `input_prompt`, using the response cache.

    A prompt already present in `response_history` is answered from the cache
    without an API call. Otherwise the model is queried; if the cleaned reply
    is longer than one character the request is repeated once. The final
    reply (even if still longer than one character) is stored in
    `response_history` and the whole history is written to
    `resp_history_filename`.

    Args:
        input_prompt: Full prompt text; also the cache key.
        model_name: Model identifier forwarded to `get_openai_chat_completion`.
        resp_num: Number of completions requested; only the first is used.

    Returns:
        The cleaned answer string.
    """
    if input_prompt in response_history:
        return response_history[input_prompt]

    resp = get_openai_chat_completion(input_prompt, model_name, resp_num)
    answer_cleaned = _extract_option_letter(resp.choices[0].message.content)

    if len(answer_cleaned) > 1:
        # The reply did not reduce to a single letter: ask once more.
        print('Answer len > 1, retry...')
        print('Answer before:', answer_cleaned)
        resp = get_openai_chat_completion(input_prompt, model_name, resp_num)
        answer_cleaned = _extract_option_letter(resp.choices[0].message.content)
        print('Answer after:', answer_cleaned)

    response_history[input_prompt] = answer_cleaned
    # The full history is rewritten after every new answer so an interrupted
    # run loses nothing that was already paid for.
    history_df = pd.DataFrame({
        'prompt': list(response_history.keys()),
        'response': list(response_history.values()),
    })
    history_df.to_csv(resp_history_filename, index=False)

    return answer_cleaned

In [None]:
# Collect the model's answer for every prompt: answers_by_lang[lang][worker_id]
# is a list aligned with prompts_by_lang[lang][worker_id].
answers_by_lang = {}
for lang in ['ind', 'sun']:
    prompts_by_id = prompts_by_lang[lang]
    print('Language:', lang)
    answers_by_id = {}
    for ref_id in workers_ids:
        answers_by_id[ref_id] = [
            get_openai_answer(
                get_input_prompt(prompt_item['question_prompt']),
                model_name,
                resp_num=1,
            )
            for prompt_item in tqdm(prompts_by_id[ref_id], desc=f"Processing {ref_id}")
        ]
    answers_by_lang[lang] = answers_by_id

In [None]:
# For each language and worker, count the prompts where the model's answer
# differs from the answer stored with the prompt.
conflict_num_by_lang = {}
for lang in ['ind', 'sun']:
    conflict_num_by_id = {}
    for ref_id in workers_ids:
        answer_pairs = zip(answers_by_lang[lang][ref_id], prompts_by_lang[lang][ref_id])
        conflict_num_by_id[ref_id] = sum(
            1 for pred, gold in answer_pairs if pred != gold['answer']
        )
    conflict_num_by_lang[lang] = conflict_num_by_id

In [None]:
# Show, per language and worker, how many model answers differ from the stored answer.
conflict_num_by_lang

#### Analyze annotator conflict counts

In [None]:
for lang in ['ind', 'sun']:
    # The answer column is number 6 in the 'ind' sheets and 7 in the 'sun'
    # sheets, for both the task-1 and the task-2 worksheets.
    answer_col_num = 6 if lang == 'ind' else 7

    # Gold answers: the raw answer column of each worker's task-1 'Data' sheet.
    gold_ans = {}
    for ref_id in tqdm(workers_ids):
        gold_sheet = client.open_by_key(id_to_key_task1[lang][ref_id]).worksheet('title', 'Data')
        gold_ans[ref_id] = gold_sheet.get_col(answer_col_num)[1:]

    # conflict_data[pred_id] lists, for every ref_id in workers_ids order, the
    # number of questions where pred_id's rows differ from ref_id's gold rows.
    conflict_data = {}
    for pred_id in tqdm(workers_ids):
        sh_pred = client.open_by_key(id_to_key_task2[lang][pred_id])
        all_conflict_counts = []
        for ref_id in workers_ids:
            if pred_id == ref_id:
                # A worker has no worksheet for their own questions.
                all_conflict_counts.append(0)
                continue

            pred_ans = sh_pred.worksheet('title', ref_id).get_col(answer_col_num)[1:]

            # Compare row by row, then fold every complete run of five rows
            # into one status: any mismatching row makes it a conflict.
            row_matches = [pred == gold for pred, gold in zip(pred_ans, gold_ans[ref_id])]
            full_len = len(row_matches) - len(row_matches) % 5
            stat = [
                'OK' if all(row_matches[start:start + 5]) else 'CONFLICT'
                for start in range(0, full_len, 5)
            ]

            ok_count = stat.count('OK')
            conflict_count = stat.count('CONFLICT')
            # Exactly 250 questions are expected per worksheet; print the
            # offending pair before the assertion stops the run.
            if ok_count + conflict_count != 250:
                print(pred_id, ref_id, ok_count, conflict_count)
            assert ok_count + conflict_count == 250
            all_conflict_counts.append(conflict_count)
        conflict_data[pred_id] = all_conflict_counts

    conflict_df = pd.DataFrame.from_dict(conflict_data, orient='index', columns=workers_ids)
    print('Conflict data for', lang)
    print(conflict_df)