### Import Libraries

In [141]:
import ast
import csv
import json
import os
import time

import openai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

### Load Environment Variables

In [142]:
# Let python-dotenv populate the process environment before reading the keys
load_dotenv()

# Either value is None when the corresponding variable is not set
api_key = os.getenv("API_KEY")
org_key = os.getenv("ORG_KEY")

### Load CSV Rephrase Data

In [143]:
# Dataset splits to load; each one is read from "<split>_rephrase.csv"
split = ["train", "validation", "test"]

# Maps split name -> list of row dicts
all_data = {}

for s in split:
    csv_file_path = f'{s}_rephrase.csv'
    data_list = []

    with open(csv_file_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # "choices" is stored as a Python literal; turn it back into an object
            row["choices"] = ast.literal_eval(row["choices"])

            # The rephrase flags arrive as the strings "True"/"False"
            for param in ("concept", "name", "option"):
                if row[param] not in ("True", "False"):
                    raise TypeError(f"{param} data cannot be recognized")
                row[param] = row[param] == "True"

            data_list.append(row)

    all_data[s] = data_list

### Generate Prompt

In [144]:
# Function to generate choice text
def generate_choices_text(choices):
    """Render the answer options as one `LABEL. "text"` line per label.

    Args:
        choices: Mapping with parallel sequences under "label" and "text".

    Returns:
        The concatenated option lines, each ending with a newline; an empty
        string when there are no labels.
    """
    option_labels = choices["label"]
    option_texts = choices["text"]

    rendered_lines = [
        f'{option_label}. "{option_texts[position]}"\n'
        for position, option_label in enumerate(option_labels)
    ]
    return "".join(rendered_lines)

# Function to generate answer text
def generate_answer_text(choices, answerKey):
    """Format the correct option as `KEY. "text"`.

    Args:
        choices: Mapping with parallel sequences under "label" and "text".
        answerKey: Label of the correct option; must be present in
            choices["label"].

    Returns:
        The answer label followed by its quoted option text.

    Raises:
        ValueError: If answerKey is not one of the labels.
    """
    position = choices["label"].index(answerKey)
    matching_text = choices["text"][position]
    return f'{answerKey}. "{matching_text}"'

# Function to generate prompts based on the conditions
def generate_rephrase_name_prompt(row):
    """Build the prompt that asks the model to swap only the names in a question.

    Args:
        row: Mapping that provides the original text under "question".

    Returns:
        The instruction, the phrase to rewrite, and a trailing "Answer:" cue.
    """
    instruction = (
        "Change all names in the given phrases to Indonesian names. "
        "Change only the names. "
        "Keep all remaining phrases and keep it all in english and reply with only your answer."
    )
    return f"{instruction}\n\nPhrase: {row['question']}\nAnswer:"

def generate_rephrase_all_prompt(row):
    """Build the prompt that asks the model to localise a whole question record.

    The question is always marked "Change"; the concept and the options are
    marked "Change" or "Keep" according to the row's "concept" and "option"
    flags. The answer line is left blank when the row has no answer key.

    Args:
        row: Mapping with "question", "question_concept", "choices",
            "answerKey" and the boolean flags "concept" and "option".

    Returns:
        The full prompt text, ending with the "Changed data:" cue.
    """
    concept_flag = 'Change' if row['concept'] else 'Keep'
    option_flag = 'Change' if row['option'] else 'Keep'
    options_block = generate_choices_text(row['choices'])

    # Only format the answer when a key is present
    answer_key = row['answerKey']
    answer_line = generate_answer_text(row['choices'], answer_key) if answer_key else ''

    return f"""Given a commonsense question, a concept, options, and the question answer, change them to become relevant to Indonesia. If an aspect is flagged to be changed, then you need to change it completely. If it's flagged as keep, then keep as it is. Make sure your changes are still in the same domain/topic with the given data, and there is only one clear answer in the options. Reply with only your changed data in a JSON format.

Data:
###
Question: {row['question']} -> Change
Concept: {row['question_concept']} -> {concept_flag}
Options: -> {option_flag}
{options_block}Question Answer: {answer_line}
###

Changed data:"""

### Rephrase Function

In [151]:
def get_openai_chat_completion(input_prompt, model_name):
    """Request a chat completion for a single user message.

    Args:
        input_prompt: Text sent as the content of the only (user) message.
        model_name: Model identifier passed through to the API.

    Returns:
        Whatever `openai.ChatCompletion.create` returns for the request.
    """
    user_message = {'role': 'user', 'content': input_prompt}

    return openai.ChatCompletion.create(
        model=model_name,
        messages=[user_message],
        temperature=0.1,
    )

# Define a function to rephrase the CSV data using an OpenAI chat model
def rephrase_csv_data(row, model_name, history):
    """Build the prompt for a row and obtain the model's reply.

    Rows flagged with "name" only get the name-rephrase prompt; every other
    row gets the full rephrase prompt. A prompt already present in `history`
    is answered from there without calling the API.

    Args:
        row: Mapping with the boolean flags "name", "concept" and "option"
            plus the fields the prompt builders read.
        model_name: Model identifier passed to the chat completion call.
        history: Mapping of prompt -> previously obtained response.

    Returns:
        A `(prompt, response)` tuple, for both cached and fresh replies.

    Raises:
        Exception: Whatever the second API attempt raises; the first failure
            is retried once after a one-minute pause.
    """
    if row["name"] and not row["concept"] and not row["option"]:
        input_prompt = generate_rephrase_name_prompt(row)
    else:
        input_prompt = generate_rephrase_all_prompt(row)

    # Cache hit: return the same (prompt, response) shape as a fresh call so
    # the caller can always unpack two values.
    if input_prompt in history:
        return input_prompt, history[input_prompt]

    try:
        completion = get_openai_chat_completion(input_prompt, model_name)
    except Exception:
        # Best-effort single retry after a pause
        print('Caught exception, wait for 1 min...')
        time.sleep(60)
        completion = get_openai_chat_completion(input_prompt, model_name)

    response = completion.choices[0].message.content.strip()

    return input_prompt, response

def postprocess_result(row, response, split):
    """Turn a raw model reply into the rephrased question data.

    Args:
        row: Source row; its "name", "concept" and "option" flags select
            which kind of reply is expected.
        response: Raw text returned by the model.
        split: Dataset split name; the answer key is validated for every
            split except "test".

    Returns:
        For name-only rows, the rephrased question string (with any text up
        to and including the last "Answer: " removed). Otherwise the parsed
        reply as a dict in which a five-item "Options" list has been turned
        into an A-E mapping and, outside the test split, "Question Answer"
        has been reduced to its leading letter.

    Raises:
        ValueError: If an "Options" list does not have exactly five items,
            if the answer key is empty or not one of A-E, or if the reply
            cannot be parsed.
        SyntaxError: If the reply is neither JSON nor a Python literal.
    """
    if row["name"] and not row["concept"] and not row["option"]:
        if "Answer: " in response:
            rephrased_result = response.split("Answer: ")[-1]
        else:
            rephrased_result = response
    else:
        # The prompt asks for JSON, so parse it as JSON first (this accepts
        # null/true/false). Replies written as a Python literal, e.g. with
        # single quotes, still go through ast.literal_eval as before.
        try:
            rephrased_result = json.loads(response)
        except ValueError:
            rephrased_result = ast.literal_eval(response)
        letters = ['A', 'B', 'C', 'D', 'E']

        if isinstance(rephrased_result["Options"], list):
            if len(rephrased_result["Options"]) == 5:
                rephrased_result["Options"] = dict(zip(letters, rephrased_result["Options"]))
            else:
                raise ValueError(f"Option output is not right: {rephrased_result['Options']}")

        if split != "test":
            raw_answer = rephrased_result["Question Answer"]
            # Keep only the leading letter of replies such as 'D. "text"';
            # an empty or missing answer falls through to the check below
            # instead of failing on the index.
            answer_key = raw_answer[0] if raw_answer else raw_answer
            rephrased_result["Question Answer"] = answer_key
            if answer_key not in letters:
                raise ValueError(f"Answer key not in options: {answer_key}")

    return rephrased_result

### Run Rephrase

In [147]:
# Chat model used for every rephrase request
model_name = "gpt-3.5-turbo"

# Hand the credentials read from the environment to the openai module
openai.organization = org_key
openai.api_key = api_key

In [149]:
for s in split:
    print(f"Process data on split: {s}")

    # Per-split cache of prompt -> raw model response
    history_path = f"{s}_history_91023.csv"
    if os.path.exists(history_path):
        print(f"Load response history from file {history_path}")
        # Responses are saved as the raw reply text, so read them back as
        # plain strings: postprocess_result parses them itself, and
        # name-only replies are not Python literals at all.
        resp_history_df = pd.read_csv(history_path, dtype=str, keep_default_na=False)
        response_history = dict(zip(resp_history_df.prompt, resp_history_df.response))
    else:
        print(f"Initialize response history")
        response_history = {}

    rephrased_results = []
    try:
        # Only the first 10 rows of each split are processed
        for data in tqdm(all_data[s][:10]):
            rephrased_data = data.copy()

            prompt, response = rephrase_csv_data(data, model_name, response_history)
            result = postprocess_result(data, response, s)
            # Cache the reply only once it has been parsed successfully, so a
            # malformed reply is requested again on the next run.
            response_history[prompt] = response

            if isinstance(result, dict):
                rephrased_data["question"] = result["Question"]
                rephrased_data["question_concept"] = result["Concept"]
                rephrased_data["choices"] = {
                    "label": list(result["Options"].keys()),
                    "text": list(result["Options"].values())
                }
                if s != "test":
                    rephrased_data["answerKey"] = result["Question Answer"]
            elif isinstance(result, str):
                rephrased_data["question"] = result

            rephrased_results.append(rephrased_data)
    finally:
        # Persist the cache even when a row fails, so replies already
        # obtained do not have to be requested again.
        resp_history_df = pd.DataFrame({
            'prompt': list(response_history.keys()),
            'response': list(response_history.values()),
        })
        resp_history_df.to_csv(history_path, index=False)

        print(f"Response history saved to {history_path}")

    # Nothing to write (and no first row to take the header from)
    if not rephrased_results:
        print(f"No rephrased rows for split: {s}")
        continue

    # Specify the path to the CSV file
    rephrased_file_path = f'{s}_rephrased.csv'

    # Get the keys from the first dictionary
    header = rephrased_results[0].keys()

    # Write the header followed by every rephrased row
    with open(rephrased_file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        writer.writeheader()
        writer.writerows(rephrased_results)

    print(f'CSV file "{rephrased_file_path}" has been created with the data.')


Process data on split: train
Initialize response history


100%|██████████| 10/10 [00:22<00:00,  2.25s/it]


Response history saved to train_history_91023.csv
CSV file "train_rephrased.csv" has been created with the data.
Process data on split: validation
Initialize response history


100%|██████████| 10/10 [00:30<00:00,  3.01s/it]


Response history saved to validation_history_91023.csv
CSV file "validation_rephrased.csv" has been created with the data.
Process data on split: test
Initialize response history


  0%|          | 0/10 [00:03<?, ?it/s]


ValueError: Anwer key not in options: r

In [137]:
# Hand-written sample of a parsed rephrase reply, used to try out the
# Options -> choices conversion below
x = {
    "Question": "Where is a Javanese eagle safe?",
    "Concept": "Javanese eagle",
    "Options": {
        "A": "rainforest",
        "B": "rice fields",
        "C": "in Java",
        "D": "national park",
        "E": "mountain",
    },
    "Question Answer": "D",
}

In [138]:
# Take the label -> text mapping out of the sample reply
y = x["Options"]

In [140]:
# Split the label -> text mapping into parallel "label" and "text" lists
new_option = {"label": [*y], "text": [*y.values()]}
new_option

{'label': ['A', 'B', 'C', 'D', 'E'],
 'text': ['rainforest', 'rice fields', 'in Java', 'national park', 'mountain']}