### Import Libraries

In [1]:
import os
import ast
import csv
import time
import openai
import pandas as pd

from dotenv import load_dotenv
from tqdm import tqdm

### Load Environment Variables

In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Snapshot tag; it names both the data directory and the file suffix of every
# CSV this notebook reads or writes.
snapshot = "v3-gpt3_5-1106"

# Credentials handed to the openai module further down. Each value is None
# when the corresponding environment variable is not set.
org_key = os.getenv("ORG_KEY")
api_key = os.getenv("API_KEY")

### Load CSV Rephrase Data

In [3]:
# Dataset splits to process (the full set is "validation", "test", "train").
split = ["test"]

# Columns that hold a boolean serialized as the text "True" / "False".
rephrase_params = ["concept", "name", "option"]
bool_from_text = {"True": True, "False": False}

# Maps each split name to its list of row dictionaries.
all_data = {}

for s in split:
    source_path = f'./{snapshot}/{s}_rephrased_{snapshot}.csv'
    records = []

    with open(source_path, newline='') as handle:
        for record in csv.DictReader(handle):
            # "choices" is stored as a Python literal; parse it back into an object.
            record["choices"] = ast.literal_eval(record["choices"])

            # Turn the flag columns into real booleans, rejecting anything else.
            for flag in rephrase_params:
                raw_value = record[flag]
                if raw_value not in bool_from_text:
                    raise TypeError(f"{flag} data cannot be recognized")
                record[flag] = bool_from_text[raw_value]

            records.append(record)

    all_data[s] = records

### Generate Prompt

In [4]:
# Function to generate choice text
def generate_choices_text(choices):
    """Render the answer options as one `LABEL. "text"` line per label.

    Args:
        choices: Mapping with parallel sequences under "label" and "text".

    Returns:
        A string with one newline-terminated line per entry in
        `choices["label"]`; empty when there are no labels.
    """
    option_texts = choices["text"]
    return "".join(
        f'{label}. "{option_texts[position]}"\n'
        for position, label in enumerate(choices["label"])
    )

# Function to generate answer text
def generate_answer_text(choices, answerKey):
    """Format the option selected by `answerKey` as `KEY. "text"`.

    Args:
        choices: Mapping with parallel sequences under "label" and "text".
        answerKey: Label of the option to render; it is looked up with
            `choices["label"].index`, which raises when the label is absent.

    Returns:
        The formatted answer string, without a trailing newline.
    """
    position = choices["label"].index(answerKey)
    chosen_text = choices["text"][position]
    return f'{answerKey}. "{chosen_text}"'

def generate_rephrase_name_prompt(row):
    """Build the name-replacement prompt for a single data row.

    Args:
        row: Mapping that provides the original text under "question".

    Returns:
        The instruction text, a blank line, the question, and a trailing
        "Changed Question:" cue for the model to complete.
    """
    instruction = (
        "Change all names in the given question to Indonesian names. "
        "Change only the names, keep all other phrases in the question "
        "the same and keep it all in english."
    )
    question = row['question']
    return f"{instruction}\n\nQuestion: {question}\nChanged Question:"

### Rephrase Function

In [5]:
def get_openai_chat_completion(input_prompt, model_name, temperature=0.3, request_timeout=60):
    """Send a single-turn chat request and return the raw completion object.

    Args:
        input_prompt: Text sent as the content of the only `user` message.
        model_name: Model identifier passed as `model`.
        temperature: Sampling temperature forwarded to the API. Defaults to
            0.3, the value that used to be hard-coded here.
        request_timeout: Forwarded as `request_timeout`. Defaults to 60, the
            value that used to be hard-coded here (presumably seconds).

    Returns:
        Whatever `openai.ChatCompletion.create` returns, unmodified. Any
        exception raised by the call propagates to the caller.
    """
    messages = [
        {
            'role': 'user',
            'content': input_prompt,
        }
    ]
    return openai.ChatCompletion.create(
        model=model_name,
        messages=messages,
        temperature=temperature,
        request_timeout=request_timeout,
    )

def rephrase_csv_data(row, model_name, history, max_retries=1, retry_wait_seconds=60):
    """Return the prompt for `row` and the model's reply, reusing cached replies.

    Args:
        row: Data row handed to `generate_rephrase_name_prompt`.
        model_name: Model identifier forwarded to `get_openai_chat_completion`.
        history: Mapping from prompt text to `{"response": reply}`. When the
            prompt is already a key, the stored reply is returned and no
            request is made. This function never modifies `history`.
        max_retries: How many times a failed request is retried before the
            last exception is re-raised. The default of 1 matches the
            previous hard-coded single retry.
        retry_wait_seconds: Pause before each retry; defaults to 60.

    Returns:
        A `(prompt, response)` tuple, where `response` is the stripped content
        of the first choice of the completion.

    Raises:
        Exception: whatever the request raised on the final attempt.
    """
    input_prompt = generate_rephrase_name_prompt(row)

    if input_prompt in history:
        return input_prompt, history[input_prompt]["response"]

    retries_used = 0
    while True:
        try:
            completion = get_openai_chat_completion(input_prompt, model_name)
            break
        except Exception as exc:
            if retries_used >= max_retries:
                raise
            retries_used += 1
            # Report what actually failed so that non-transient errors (bad
            # credentials, invalid request) are not mistaken for rate limits.
            print(
                f'Caught exception ({exc!r}), wait for {retry_wait_seconds} s '
                f'before retry {retries_used}/{max_retries}...'
            )
            time.sleep(retry_wait_seconds)

    response = completion.choices[0].message.content.strip()

    return input_prompt, response

def postprocess_result(response):
    """Strip any echoed "Question: " label from a model reply.

    Args:
        response: Reply text returned by the model.

    Returns:
        The text after the last "Question: " marker, or `response` unchanged
        when the marker does not occur.
    """
    marker = "Question: "
    if marker not in response:
        return response

    # Only the text following the final marker is kept.
    _, _, rephrased_result = response.rpartition(marker)
    return rephrased_result

### Run Rephrase

In [6]:
# Chat model that every rephrase request in this notebook is sent to.
model_name = "gpt-3.5-turbo-1106"

# Hand the credentials read from the environment to the openai module.
openai.organization = org_key
openai.api_key = api_key

In [7]:
for s in split:
    print(f"Process data on split: {s}")

    # prompt -> {"response": reply} cache, persisted to disk so an interrupted
    # run can resume without requesting the same completions again.
    history_path = f"./{snapshot}/{s}_name_history_{snapshot}.csv"
    if os.path.exists(history_path):
        print(f"Load response history from file {history_path}")
        # The response column holds the repr of a dict; parse it back into one.
        resp_history_df = pd.read_csv(history_path, converters={'response': ast.literal_eval})
        response_history = dict(zip(resp_history_df.prompt, resp_history_df.response))
    else:
        print("Initialize response history")
        response_history = {}

    rephrased_results = []
    for data in tqdm(all_data[s]):
        rephrased_data = data.copy()

        # Only rows flagged for name rephrasing are sent to the model.
        if data["name"]:
            prompt, response = rephrase_csv_data(data, model_name, response_history)

            # A cache hit returns exactly what is already stored, so only a
            # new prompt has to be recorded and checkpointed. This avoids
            # rewriting the whole history file for rows that changed nothing.
            if prompt not in response_history:
                response_history[prompt] = {"response": response}
                resp_history_df = pd.DataFrame({
                    'prompt': list(response_history.keys()),
                    'response': list(response_history.values()),
                })
                resp_history_df.to_csv(history_path, index=False)

            result = postprocess_result(response)
            if isinstance(result, str):
                rephrased_data["question"] = result

        rephrased_results.append(rephrased_data)

    rephrased_file_path = f'./{snapshot}/{s}_rephrased_name_{snapshot}.csv'

    # Without any row there is no header to take from the first record.
    if not rephrased_results:
        print(f'No rows found for split "{s}"; skipping "{rephrased_file_path}".')
        continue

    # Column order follows the keys of the first record.
    header = rephrased_results[0].keys()

    with open(rephrased_file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        writer.writeheader()
        writer.writerows(rephrased_results)

    print(f'CSV file "{rephrased_file_path}" has been created with the data.')


Process data on split: test
Initialize response history


 47%|████▋     | 111/236 [00:55<01:04,  1.94it/s]

Caught exception, wait for 1 min...


 66%|██████▌   | 156/236 [03:12<00:22,  3.57it/s]

Caught exception, wait for 1 min...


 75%|███████▍  | 176/236 [05:28<01:03,  1.06s/it]

Caught exception, wait for 1 min...


 94%|█████████▍| 222/236 [07:47<00:05,  2.69it/s]

Caught exception, wait for 1 min...


 96%|█████████▌| 226/236 [09:50<01:42, 10.29s/it]

Caught exception, wait for 1 min...


 98%|█████████▊| 231/236 [11:55<01:03, 12.61s/it]

Caught exception, wait for 1 min...


100%|██████████| 236/236 [13:59<00:00,  3.56s/it]

CSV file "./v3-gpt3_5-1106/test_rephrased_name_v3-gpt3_5-1106.csv" has been created with the data.



