### Import Libraries

In [1]:
import os
import ast
import csv
import time
import openai
import pandas as pd

from dotenv import load_dotenv
from tqdm import tqdm

### Load Environment Variables

In [2]:
load_dotenv()

api_key = os.environ.get("API_KEY")
org_key = os.environ.get("ORG_KEY")

### Load CSV Rephrase Data

In [3]:
# Dataset splits to process; each one is read from '<split>_rephrase.csv'
split = ["validation", "test", "train"]

# Maps split name -> list of parsed row dictionaries
all_data = {}

for s in split:
    csv_file_path = f'{s}_rephrase.csv'

    # Rows parsed from this split's file
    data_list = []

    with open(csv_file_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # "choices" is stored as text; turn it back into a Python object
            row["choices"] = ast.literal_eval(row["choices"])

            # The rephrase flags arrive as the strings "True" / "False";
            # anything else is rejected
            for param in ("concept", "name", "option"):
                flag = row[param]
                if flag not in ("True", "False"):
                    raise TypeError(f"{param} data cannot be recognized")
                row[param] = flag == "True"

            # Keep the fully parsed row (a dictionary)
            data_list.append(row)

    all_data[s] = data_list

### Generate Prompt

In [4]:
# Function to generate choice text
def generate_choices_text(choices):
    """Render the choices as one `<label>. "<text>"` line per label.

    Args:
        choices: Mapping with parallel "label" and "text" sequences.

    Returns:
        All rendered lines concatenated, each terminated by a newline
        (an empty string when there are no labels).
    """
    labels = choices["label"]
    texts = choices["text"]

    rendered_lines = [
        f'{label}. "{texts[position]}"\n'
        for position, label in enumerate(labels)
    ]
    return "".join(rendered_lines)

# Function to generate answer text
def generate_answer_text(choices, answerKey):
    """Render the answer as `<label>. "<text>"`.

    Args:
        choices: Mapping with parallel "label" and "text" sequences.
        answerKey: Label of the correct choice; must occur in choices["label"].

    Returns:
        The answer label followed by its quoted choice text.

    Raises:
        ValueError: If answerKey is not one of the labels.
    """
    position = choices["label"].index(answerKey)
    return '{}. "{}"'.format(answerKey, choices["text"][position])

# Function to generate prompts based on the conditions
def generate_rephrase_all_prompt(row):
    """Build the rephrasing prompt for one data row.

    The question is always marked "Change"; the concept and the options are
    marked "Change" or "Keep" according to the row's "concept" and "option"
    flags. The answer line is left blank when the row has no answer key.

    Args:
        row: Mapping with "question", "question_concept", "concept",
            "option", "choices" and "answerKey" entries.

    Returns:
        The complete prompt string.
    """
    question = row['question']
    concept = row['question_concept']
    concept_action = 'Change' if row['concept'] else 'Keep'
    option_action = 'Change' if row['option'] else 'Keep'
    choices_block = generate_choices_text(row['choices'])

    if row['answerKey']:
        answer_line = generate_answer_text(row['choices'], row['answerKey'])
    else:
        answer_line = ''

    return f"""Given a data consists of question, a concept, options, and question answer, change them to become related to Indonesia. If an element is marked to be changed, then you need to change it completely. If it's marked as keep, then keep as it is. Make sure your changes are still in the same domain/topic with the given data, and there is only one clear answer in the question answer for the question in the options. Reply with only your changed data in a JSON format.

Data:
###
Question: {question} -> Change
Concept: {concept} -> {concept_action}
Options: -> {option_action}
{choices_block}Question Answer: {answer_line}
###

Changed data:"""

### Rephrase Function

In [5]:
def get_openai_chat_completion(input_prompt, model_name):
    """Request a chat completion for a single user message.

    Args:
        input_prompt: Text sent as the content of the one "user" message.
        model_name: Model identifier passed through to the API.

    Returns:
        The completion object returned by `openai.ChatCompletion.create`.
    """
    user_message = {
        'role': 'user',
        'content': input_prompt
    }
    # Low temperature and a request timeout of 60 are fixed for every call
    return openai.ChatCompletion.create(
        model=model_name,
        messages=[user_message],
        temperature=0.1,
        request_timeout=60,
    )

# Rephrase one CSV row with the OpenAI chat model named by `model_name`
def rephrase_csv_data(row, model_name, history):
    """Return the prompt for `row` together with the model's reply.

    A reply already stored in `history` for the identical prompt is reused
    without calling the API. Otherwise the API is called; if that call raises,
    the function waits 60 seconds and tries exactly once more (a second
    failure propagates to the caller).

    Args:
        row: Data row passed to `generate_rephrase_all_prompt`.
        model_name: Model identifier forwarded to the API call.
        history: Mapping of prompt -> {"response": <reply text>}.

    Returns:
        Tuple `(prompt, response)`; a fresh reply is whitespace-stripped.
    """
    prompt_text = generate_rephrase_all_prompt(row)

    # Cache hit: this exact prompt has been answered before
    if prompt_text in history:
        return prompt_text, history[prompt_text]["response"]

    try:
        completion = get_openai_chat_completion(prompt_text, model_name)
    except Exception:
        print('Caught exception, wait for 1 min...')
        time.sleep(60)
        completion = get_openai_chat_completion(prompt_text, model_name)

    first_message = completion.choices[0].message
    return prompt_text, first_message.content.strip()

def postprocess_result(row, response):
    """Parse the model's reply and normalise its options and answer key.

    Options handling (only when "Options" is a list):
      * exactly five entries: they are assigned to A-E in order;
      * otherwise each entry is expected to look like `A. "text"`; its first
        character is the label and everything from the fifth character on is
        the text. Labels missing from the reply are filled in from the
        original `row`, and the result is always ordered A-E.

    Answer handling: the "Question Answer" entry is reduced to a single label.
    A bare label is used as is; an answer equal (case-insensitively) to an
    option text maps to that option's label; otherwise a leading A-E character
    is taken as the label.

    Args:
        row: Original data row; row["choices"] holds parallel "label" and
            "text" lists used to fill in omitted options.
        response: Raw reply text containing a dict literal with at least
            "Options" and "Question Answer" entries.

    Returns:
        The parsed dict with "Options" (when it was a list) converted to a
        label -> text dict and "Question Answer" replaced by a label.

    Raises:
        ValueError: If an option label is outside A-E, if a missing label is
            not present in the original row, or if the answer cannot be
            mapped to a label.
    """
    # NOTE(review): the prompt asks for JSON but the reply is parsed as a
    # Python literal, so JSON-only tokens such as null/true/false would fail.
    rephrased_result = ast.literal_eval(response)
    letters = ['A', 'B', 'C', 'D', 'E']

    options = rephrased_result["Options"]
    if isinstance(options, list):
        if len(options) == 5:
            rephrased_result["Options"] = dict(zip(letters, options))
        else:
            option_labels = [option[0] for option in options]
            if not set(option_labels).issubset(letters):
                raise ValueError(f"Option output is not right: {rephrased_result['Options']}")

            # Key the texts by their own label instead of relying on list
            # position, so neither the order of the reply nor the order in
            # which missing labels are restored can shift a text onto the
            # wrong label.
            text_by_label = {option[0]: option[4:] for option in options}
            original_labels = row["choices"]["label"]
            original_texts = row["choices"]["text"]
            for label in letters:
                if label not in text_by_label:
                    text_by_label[label] = original_texts[original_labels.index(label)]

            rephrased_result["Options"] = {label: text_by_label[label] for label in letters}

    answer = rephrased_result["Question Answer"]
    option_labels = list(rephrased_result["Options"].keys())
    option_texts = [text.lower() for text in rephrased_result["Options"].values()]

    if answer in letters:
        # Bare label such as "B"
        answer_label = answer
    elif answer.lower() in option_texts:
        # Full option text; checked before the first-character rule so that a
        # text starting with A-E (e.g. "Denpasar") is not mistaken for a label
        answer_label = option_labels[option_texts.index(answer.lower())]
    elif answer[:1] in letters:
        # Label followed by text, e.g. 'C. "mie"'
        answer_label = answer[0]
    else:
        raise ValueError(f"Answer key not in options: {rephrased_result['Question Answer']}")

    rephrased_result["Question Answer"] = answer_label

    return rephrased_result

def clean_data(row):
    """Strip leftover label prefixes and double quotes from the choice texts.

    When the first characters of all choice texts together form exactly the
    set a-e (or b-f), every text is assumed to still carry a four-character
    `X. "` style prefix and that prefix is cut off. All double-quote
    characters are then removed from every text.

    Args:
        row: Data row whose row["choices"]["text"] is a list of strings.

    Returns:
        The same `row` object; row["choices"]["text"] is replaced in place,
        so any other reference to that "choices" dict sees the cleaned texts.
    """
    letters = {'a', 'b', 'c', 'd', 'e'}
    alt_letters = {'b', 'c', 'd', 'e', 'f'}

    options = row["choices"]["text"]
    # Slice instead of index so an empty option string yields '' rather than
    # raising IndexError
    labels = {option[:1].lower() for option in options}

    if labels == letters or labels == alt_letters:
        options = [option[4:] for option in options]

    options = [option.replace('"', '') for option in options]
    row["choices"]["text"] = options

    return row

### Run Rephrase

In [6]:
# Configure the OpenAI client with the credentials read from the environment
openai.api_key = api_key
openai.organization = org_key

# Chat model passed to every rephrase request
model_name = "gpt-3.5-turbo"

In [9]:
for s in split:
    print(f"Process data on split: {s}")

    # Per-split cache of prompt -> {"response": <raw reply>}, persisted to CSV
    # so that re-running the cell does not repeat API calls
    history_path = f"{s}_history_91223.csv"
    if os.path.exists(history_path):
        print(f"Load response history from file {history_path}")
        resp_history_df = pd.read_csv(history_path, converters={'response': ast.literal_eval})
        response_history = dict(zip(resp_history_df.prompt, resp_history_df.response))
    else:
        print("Initialize response history")
        response_history = {}

    rephrased_results = []
    for data in tqdm(all_data[s]):
        # Shallow copy: the nested "choices" dict is still shared with `data`
        rephrased_data = data.copy()

        # Only rows flagged for a concept or option change are sent to the model
        if data["concept"] or data["option"]:
            prompt, response = rephrase_csv_data(data, model_name, response_history)

            # Persist the history only when it actually grew; rewriting the
            # whole file for every cached row made re-runs needlessly slow
            if prompt not in response_history:
                response_history[prompt] = {"response": response}
                resp_history_df = pd.DataFrame({
                    'prompt': list(response_history.keys()),
                    'response': list(response_history.values()),
                })
                resp_history_df.to_csv(history_path, index=False)

            result = postprocess_result(data, response)
            rephrased_data["question"] = result["Question"]
            rephrased_data["choices"] = {
                "label": list(result["Options"].keys()),
                "text": [text.lower() for text in result["Options"].values()],
            }
            if "Concept" in result:
                rephrased_data["question_concept"] = result["Concept"]
            rephrased_data["answerKey"] = result["Question Answer"]

        rephrased_results.append(clean_data(rephrased_data))

    # Specify the path to the CSV file
    rephrased_file_path = f'{s}_rephrased_91223.csv'

    # Without rows there is no header to derive, so skip this split
    if not rephrased_results:
        print(f'No data for split "{s}"; skip writing "{rephrased_file_path}".')
        continue

    # Get the keys from the first dictionary
    header = rephrased_results[0].keys()

    # Write the header followed by all rows to the CSV file
    with open(rephrased_file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        writer.writeheader()
        writer.writerows(rephrased_results)

    print(f'CSV file "{rephrased_file_path}" has been created with the data.')

Process data on split: validation
Load response history from file validation_history_91223.csv


  0%|          | 0/274 [00:00<?, ?it/s]

100%|██████████| 274/274 [00:03<00:00, 75.67it/s]


CSV file "validation_rephrased_91223.csv" has been created with the data.
Process data on split: test
Load response history from file test_history_91223.csv


100%|██████████| 236/236 [00:02<00:00, 82.14it/s]


CSV file "test_rephrased_91223.csv" has been created with the data.
Process data on split: train
Load response history from file train_history_91223.csv


100%|██████████| 2162/2162 [01:30<00:00, 23.87it/s]

CSV file "train_rephrased_91223.csv" has been created with the data.



