## Process ITC datasets: extract the gold answer from each record's `output` field

In [1]:
import os
import re

import pandas as pd

In [2]:
# Path to the ITC dataset file (JSON Lines: one record per line).
itc_data = 'test_itc.jsonl'

In [3]:
# Read the JSON Lines file into a DataFrame (one record per line), then
# preview the first few rows.
df = pd.read_json(
    itc_data,
    lines=True,
    orient='records',
)
df.head()

Unnamed: 0,instruction,input,output,origin
0,Directions : There is a certain relation betwe...,,This question is a type of verbal analogy. The...,rs_aggarwal_book_exercise_1B
1,There is a certain relation between two given ...,,The question is a form of analogy reasoning pr...,rs_aggarwal_book_exercise_1B
2,There is a certain relation between two given ...,,This is a question of analogy. The analogy is ...,rs_aggarwal_book_exercise_1B
3,Choose the word which is least like the other ...,,The given question is a verbal reasoning quest...,rs_aggarwal_book_exercise_illustrative example
4,Choose the word which is least like the other ...,,This question is a test of understanding categ...,rs_aggarwal_book_exercise_illustrative example


In [10]:
# Show one full model output (row label 18) as a worked example.
df.loc[18, 'output']

"The relationship between the two words in the question is that the first word is a process that results in the creation of the second word. In this case, 'Evaporation' is the process through which 'Clouds' are formed.   We need to find a similar pair of words among the choices given.   (a) Mountain : Snow - This pair does not represent a process-result relationship. A mountain does not create snow.  (b) Pressure : Atmosphere - This pair does not represent a process-result relationship. Pressure is a result of the atmosphere, not the other way around.  (c) Book : Pages - This pair does not represent a process-result relationship. A book is made up of pages, but it does not create them.  (d) Tension : Breakdown - This pair represents a process-result relationship. 'Tension' can lead to a 'Breakdown', similar to how 'Evaporation' can lead to 'Clouds'.   Hence, the answer is (d)."

In [50]:
# Marker words searched for in the tail of each output; the text after the
# marker is taken as the gold answer. Order matters: the words are tried in
# list order and the search stops at the first one that yields a non-empty match.
gold_ans_filter_words = ['therefore', 'hence', 'so', 'thus']

In [21]:
def split_text_on_pivot(large_text, pivot_word):
    """
    Splits the given text into two parts around the first whole-word
    occurrence of the pivot word.

    The pivot must stand alone as a word: it is not matched when embedded in
    a longer word (e.g. 'so' inside 'reasoning' or 'also'). Matching is
    case-sensitive, so callers should normalise case beforehand.

    :param large_text: The text to be split
    :param pivot_word: The word on which to split the text
    :return: A tuple ``(before, after)``. ``before`` is the text preceding the
        pivot, unmodified; ``after`` is the text following the pivot with
        surrounding whitespace stripped. If the pivot is empty or does not
        occur as a whole word, returns ``(large_text, None)``.
    """
    if not pivot_word:
        # An empty pivot cannot be located (str.split('') raises ValueError).
        return large_text, None

    # Look-arounds rather than \b so a pivot that starts or ends with a
    # non-word character is still bounded correctly.
    pattern = r"(?<!\w)" + re.escape(pivot_word) + r"(?!\w)"
    match = re.search(pattern, large_text)
    if match is None:
        return large_text, None
    return large_text[:match.start()], large_text[match.end():].strip()

In [15]:
# Demonstrate the split on a single example. The pivot is named explicitly
# instead of indexing into gold_ans_filter_words, so the result does not
# change when that list is reordered.
split_text_on_pivot(df.output[18].lower(), 'hence')

("the relationship between the two words in the question is that the first word is a process that results in the creation of the second word. in this case, 'evaporation' is the process through which 'clouds' are formed.   we need to find a similar pair of words among the choices given.   (a) mountain : snow - this pair does not represent a process-result relationship. a mountain does not create snow.  (b) pressure : atmosphere - this pair does not represent a process-result relationship. pressure is a result of the atmosphere, not the other way around.  (c) book : pages - this pair does not represent a process-result relationship. a book is made up of pages, but it does not create them.  (d) tension : breakdown - this pair represents a process-result relationship. 'tension' can lead to a 'breakdown', similar to how 'evaporation' can lead to 'clouds'.   ",
 ', the answer is (d).')

In [16]:
# Characters treated as leading punctuation when cleaning an extracted answer.
PUNCTUATIONS = [',', '.']

In [45]:
def process_gold_text(text):
    """
    Cleans an extracted gold answer by removing leading punctuation.

    If the text starts with a character from ``PUNCTUATIONS``, the whole
    leading run of those characters and the whitespace after it are removed
    (', the answer is (d)' -> 'the answer is (d)'). Otherwise the text is
    returned unchanged.

    :param text: The extracted answer text; may be empty or None
    :return: The cleaned text; empty/None input is returned as-is
    """
    if not text:
        # Nothing to clean; also avoids IndexError from text[0] on ''.
        return text
    if text[0] in PUNCTUATIONS:
        # Strip only the punctuation itself (not the first space-delimited
        # token), so ',the answer' keeps the word 'the'.
        return text.lstrip("".join(PUNCTUATIONS)).lstrip()
    return text

In [51]:
gold_answers = []
for output_text in df['output']:
    # Keep only the tail of the output: the last two '.'-separated pieces,
    # i.e. the final sentence plus whatever follows the last '.' (an empty
    # string when the output ends with a period). Lower-cased so the marker
    # words match regardless of capitalisation.
    last_sentence = " ".join(output_text.split('.')[-2:]).lower()

    # Try each marker word in order and keep the first non-empty text that
    # follows one. Starting from None keeps the result well-defined even if
    # no marker matches (or the marker list is empty).
    gold = None
    for word in gold_ans_filter_words:
        _, candidate = split_text_on_pivot(last_sentence, word)
        if candidate:
            gold = candidate
            break

    if not gold:
        # No marker produced a usable answer (not found, or nothing after
        # it): fall back to the full, original-case output.
        gold = output_text

    gold_answers.append(process_gold_text(gold))

# df['gold_answer'] = gold_answers

In [52]:
# Inspect the extracted gold answers.
gold_answers

['the correct answer is (c) malaria',
 "the answer is '(d) history'",
 'the answer is (d) judgement',
 'the correct answer is (c) oil',
 'lotus (b) is the least like the other flowers in the group',
 'the answer is (b) sword',
 "the eagle is the odd one out in this group as it doesn't share the common characteristic of being flightless",
 "the word 'river' is the odd one out as it does not contain stagnant water like the others",
 'the arrow is the odd one out in this group of words because it is not a melee weapon and is not used while holding in hand, unlike the other four',
 'the correct answer is (c) larva',
 'the correct answer is (d) ampere',
 'the answer is (c) beans',
 'the correct answer is (d) novel: drama : literature',
 'the choice that represents a similar relationship to the one in the question is (d) student : boy : girl',
 'the correct answer is (b) accident : injury : pain',
 'fail ahmed is the odd one out in this context',
 "'exercising' is the odd one out as it is a 

In [55]:
# Attach the extracted answers as a new 'gold_answer' column on df.
df['gold_answer'] = gold_answers

# Derive the output path from the input path by inserting '_gold' before the
# extension, e.g. 'test_itc.jsonl' -> 'test_itc_gold.jsonl'. splitext keeps
# any directory part of the input path intact.
itc_root, itc_ext = os.path.splitext(itc_data)
itc_save_path = f"{itc_root}_gold{itc_ext}"
df.to_json(itc_save_path, orient='records', lines=True)