### Import Libraries

In [1]:
import ast
import csv
import json
import os
import time

import openai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

### Load Environment Variables

In [2]:
# Let python-dotenv populate the process environment before reading from it
load_dotenv()

# Credentials come from the environment; each is None when the variable is unset
api_key = os.getenv("API_KEY")
org_key = os.getenv("ORG_KEY")

### Load CSV Rephrase Data

In [3]:
# Dataset splits to load; each one is read from "<split>_rephrase.csv"
split = ["validation", "test", "train"]

# Maps split name -> list of parsed rows (one dict per CSV record)
all_data = {}

for s in split:
    csv_file_path = f'{s}_rephrase.csv'
    data_list = []

    with open(csv_file_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # "choices" is stored as a Python literal; turn it back into an object
            row["choices"] = ast.literal_eval(row["choices"])

            # The rephrase flags are serialised as the strings "True" / "False"
            for param in ("concept", "name", "option"):
                flag = row[param]
                if flag not in ("True", "False"):
                    raise TypeError(f"{param} data cannot be recognized")
                row[param] = flag == "True"

            data_list.append(row)

    all_data[s] = data_list

### Generate Prompt

In [4]:
def generate_choices_text(choices):
    """Render the answer options as one `LABEL. "text"` line per choice.

    Args:
        choices: Mapping with parallel sequences under "label" and "text".

    Returns:
        All formatted lines concatenated into one string, each ending with a
        newline (an empty string when there are no labels).
    """
    option_labels = choices["label"]
    option_texts = choices["text"]

    lines = [
        f'{label}. "{option_texts[position]}"\n'
        for position, label in enumerate(option_labels)
    ]
    return "".join(lines)

def generate_answer_text(choices, answerKey):
    """Format the choice selected by `answerKey` as `LABEL. "text"`.

    Args:
        choices: Mapping with parallel sequences under "label" and "text".
        answerKey: Label of the choice to render.

    Returns:
        The formatted answer string (no trailing newline).

    Raises:
        ValueError: If `answerKey` is not one of the labels.
    """
    position = choices["label"].index(answerKey)
    return f'{answerKey}. "{choices["text"][position]}"'

def generate_rephrase_all_prompt(row):
    """Build the prompt that asks the model to adapt one record to Indonesia.

    Args:
        row: Record providing "question", "question_concept", "choices" and
            "answerKey".

    Returns:
        The full prompt text. The "Question Answer" field is left empty when
        `row['answerKey']` is falsy.
    """
    question = row['question']
    concept = row['question_concept']
    options_block = generate_choices_text(row['choices'])

    if row['answerKey']:
        answer_line = generate_answer_text(row['choices'], row['answerKey'])
    else:
        answer_line = ''

    return f"""Change the given data to make it relevant to Indonesia in any ways. Make all elements relevant to each other. Keep the data in English. Return with only your changed data in a JSON format where question is string, concept is string, options is dictionary where label is the keys and option text is the values, and question_answer is string contain one label from the options.

Data:
###
Question: {question}
Concept: {concept}
Options:
{options_block}Question Answer: {answer_line}
###

Changed data in JSON:"""

### OpenAI Completion and Rephrase Functions

In [5]:
def get_openai_chat_completion(input_prompt, model_name, temp=0.1, timeout=60):
    """Send `input_prompt` as a single user message via `openai.ChatCompletion.create`.

    Args:
        input_prompt: Text placed in the content of the one user message.
        model_name: Passed as the `model` argument.
        temp: Passed as `temperature`.
        timeout: Passed as `request_timeout`.

    Returns:
        Whatever `openai.ChatCompletion.create` returns.
    """
    user_message = {'role': 'user', 'content': input_prompt}
    return openai.ChatCompletion.create(
        model=model_name,
        messages=[user_message],
        temperature=temp,
        request_timeout=timeout,
    )

def get_openai_completion(input_prompt, model_name, max_tokens=256, temp=0.1, timeout=60):
    """Request a text completion for `input_prompt` via `openai.Completion.create`.

    Args:
        input_prompt: Passed as `prompt`.
        model_name: Passed as `model`.
        max_tokens: Passed as `max_tokens`.
        temp: Passed as `temperature`.
        timeout: Passed as `request_timeout`.

    Returns:
        Whatever `openai.Completion.create` returns.
    """
    request_args = {
        "model": model_name,
        "prompt": input_prompt,
        "max_tokens": max_tokens,
        "temperature": temp,
        "request_timeout": timeout,
    }
    return openai.Completion.create(**request_args)

def rephrase_csv_data(row, model_name, history, api_type="instruct"):
    """Rephrase one data row with an OpenAI model, reusing cached replies.

    Args:
        row: Record handed to `generate_rephrase_all_prompt` to build the prompt.
        model_name: Model identifier forwarded to the request helper.
        history: Mapping of prompt -> {"response": <text>} with earlier replies.
            A prompt found here is answered without calling the API.
        api_type: "chat" to use `get_openai_chat_completion`, "instruct" to use
            `get_openai_completion`.

    Returns:
        A `(input_prompt, response)` tuple, where `response` is the stripped
        model output or the cached text.

    Raises:
        ValueError: If an API call is needed and `api_type` is not "chat" or
            "instruct". Previously this surfaced as an UnboundLocalError.
        Exception: Whatever the request helper raises if the retry fails too.
    """
    input_prompt = generate_rephrase_all_prompt(row)

    if input_prompt in history:
        return input_prompt, history[input_prompt]["response"]

    if api_type == "chat":
        request_completion = get_openai_chat_completion
    elif api_type == "instruct":
        request_completion = get_openai_completion
    else:
        raise ValueError(f"Unknown api_type: {api_type!r} (expected 'chat' or 'instruct')")

    try:
        completion = request_completion(input_prompt, model_name)
    except Exception:
        # Single retry after a fixed pause; a second failure propagates to the caller
        print('Caught exception, wait for 1 min...')
        time.sleep(60)
        completion = request_completion(input_prompt, model_name)

    # The two endpoints expose the generated text under different attributes
    if api_type == "chat":
        response = completion.choices[0].message.content.strip()
    else:
        response = completion.choices[0].text.strip()

    return input_prompt, response

### Post-process and Clean Functions

In [6]:
def fix_response_keys(response):
    """Ensure the parsed model reply exposes the four canonical lower-case keys.

    For each of "question", "concept", "options" and "question_answer" that is
    absent, the value is copied from a known alternative spelling. The
    alternative key is left in place.

    Args:
        response: Parsed reply dictionary; it is modified in place.

    Returns:
        The same `response` object.

    Raises:
        ValueError: If a canonical key and all of its alternatives are missing.
    """
    accepted_spellings = (
        ("question", ("Question",)),
        ("concept", ("Concept",)),
        ("options", ("Options",)),
        ("question_answer", ("Question Answer", "Question_Answer")),
    )

    for canonical, alternatives in accepted_spellings:
        if canonical in response.keys():
            continue

        for alternative in alternatives:
            if alternative in response.keys():
                response[canonical] = response[alternative]
                break
        else:
            # Neither the canonical key nor any alternative was found
            raise ValueError(f"Response is not right: {response}")

    return response

def _parse_model_response(response):
    """Parse the raw model reply text into a Python object."""
    text = response.strip()

    # A reply wrapped in a Markdown code fence: drop the opening and closing fence lines
    if text.startswith("```"):
        text = "\n".join(text.split("\n")[1:-1])

    try:
        # The prompt asks for JSON, so try the JSON parser first (handles true/false/null)
        return json.loads(text)
    except ValueError:
        # Not strict JSON (e.g. single-quoted keys): fall back to Python literal syntax
        return ast.literal_eval(text)


def _options_list_to_dict(options, row, letters):
    """Convert a list of options into a {letter: text} mapping covering every letter."""
    if len(options) == len(letters):
        return dict(zip(letters, options))

    lowered_letters = [letter.lower() for letter in letters]
    option_labels = [option[0].lower() for option in options]
    if not set(option_labels).issubset(set(lowered_letters)):
        raise ValueError(f"Option output is not right: {options}")

    # Key each text by its own label so the result does not depend on list order
    # or on the (arbitrary) iteration order of a set of missing labels.
    labelled = {}
    for label, option in zip(option_labels, options):
        # Strip the 'X. ' prefix, plus an opening quote when the text is quoted
        labelled[label.upper()] = option[4:] if option[3:4] == "\"" else option[3:]

    result = {}
    for letter in letters:
        if letter in labelled:
            result[letter] = labelled[letter]
        else:
            # Option dropped by the model: reuse the original text for that label
            original_idx = row["choices"]["label"].index(letter)
            result[letter] = row["choices"]["text"][original_idx]
    return result


def _resolve_answer_label(answer, options, letters):
    """Map the model's answer (a label, a labelled option or an option text) to a label."""
    if not isinstance(answer, str) or not answer.strip():
        raise ValueError(f"Answer key not in options: {answer}")
    answer = answer.strip()

    # Bare label such as "E" or "e"
    if answer.upper() in letters:
        return answer.upper()

    # Full option text: checked before the prefix rule so that a text starting
    # with a-e (e.g. "apartment building") is not mistaken for a label.
    option_labels = list(options.keys())
    option_texts = [text.lower() for text in options.values()]
    if answer.lower() in option_texts:
        return option_labels[option_texts.index(answer.lower())]

    # Labelled form such as 'A. "release"' or "B) tighten"
    if answer[0].upper() in letters and not answer[1].isalnum():
        return answer[0].upper()

    raise ValueError(f"Answer key not in options: {answer}")


def postprocess_result(row, response):
    """Parse a model reply and normalise its options and answer key.

    Args:
        row: Original record; `row["choices"]` supplies the text for any option
            label missing from a list-style reply.
        response: Raw reply text, expected to hold a JSON (or Python-literal)
            object with question / concept / options / question_answer keys.

    Returns:
        The parsed reply with "options" as a {letter: text} dict (when the model
        returned a list) and "question_answer" reduced to an upper-case label.

    Raises:
        ValueError: If required keys are missing, a list option carries an
            unexpected label, or the answer cannot be matched to an option.
        SyntaxError: If the reply is neither valid JSON nor a Python literal.
    """
    rephrased_result = fix_response_keys(_parse_model_response(response))
    letters = ['A', 'B', 'C', 'D', 'E']

    if isinstance(rephrased_result["options"], list):
        rephrased_result["options"] = _options_list_to_dict(
            rephrased_result["options"], row, letters
        )

    rephrased_result["question_answer"] = _resolve_answer_label(
        rephrased_result["question_answer"], rephrased_result["options"], letters
    )

    return rephrased_result

def clean_data(row):
    """Strip leftover label prefixes and double quotes from a row's option texts.

    Prefixes are removed only when the first characters of the options are
    exactly a-e (or b-f) AND every option really starts with a label, a
    separator and a space (e.g. 'a. text'). Checking the first character alone
    would truncate options that merely begin with those letters.

    Args:
        row: Record whose `row["choices"]["text"]` is a list of option strings.
            The list is replaced on the given row.

    Returns:
        The same `row` object with cleaned option texts.
    """
    letters = {'a', 'b', 'c', 'd', 'e'}
    alt_letters = {'b', 'c', 'd', 'e', 'f'}

    def has_label_prefix(option):
        # Matches 'a. text', 'a) text' or 'a: text'
        return len(option) >= 3 and option[1] in ".):" and option[2] == " "

    options = row["choices"]["text"]
    # option[:1] instead of option[0] so an empty option cannot raise IndexError
    labels = {option[:1].lower() for option in options}

    if labels in (letters, alt_letters) and all(has_label_prefix(option) for option in options):
        # Drop the 3-character prefix, plus an opening quote when the text is quoted
        options = [option[4:] if option[3:4] == "\"" else option[3:] for option in options]

    options = [option.replace('"', '') for option in options]
    row["choices"]["text"] = options

    return row

### Run Rephrase

In [9]:
# Configure the openai module with the credentials read from the environment
openai.api_key = api_key
openai.organization = org_key

# snapshot: run tag used as the output directory and file-name suffix by the run cell
snapshot = "v3-gpt3_5-1106"
# model_name: model identifier sent with every completion request
model_name = "gpt-3.5-turbo-1106"

In [10]:
for s in ["test"]:
    print(f"Process data on split: {s}")

    # The history and output files both live under the snapshot directory;
    # create it up front so the first write cannot fail on a missing folder.
    os.makedirs(snapshot, exist_ok=True)

    history_path = f"{snapshot}/{s}_history_{snapshot}.csv"
    if os.path.exists(history_path):
        print(f"Load response history from file {history_path}")
        # Each stored response cell is the repr of a {"response": ...} dict
        resp_history_df = pd.read_csv(history_path, converters={'response': ast.literal_eval})
        response_history = dict(zip(resp_history_df.prompt, resp_history_df.response))
    else:
        print(f"Initialize response history")
        response_history = {}

    rephrased_results = []
    for data in tqdm(all_data[s]):
        # dict.copy() is shallow: without copying the nested lists, clean_data()
        # would rewrite the option texts of the source row inside all_data.
        rephrased_data = data.copy()
        rephrased_data["choices"] = {key: list(values) for key, values in data["choices"].items()}

        if data["concept"] or data["option"]:
            known_prompts = len(response_history)
            prompt, response = rephrase_csv_data(data, model_name, response_history, api_type="chat")
            response_history[prompt] = {"response": response}

            # Persist right away so an interrupted run can resume, but only
            # when the reply is new; a cached reply is already on disk.
            if len(response_history) > known_prompts:
                resp_history_df = pd.DataFrame({
                    'prompt': list(response_history.keys()),
                    'response': list(response_history.values()),
                })
                resp_history_df.to_csv(history_path, index=False)

            result = postprocess_result(data, response)

            rephrased_data["question"] = result["question"]
            rephrased_data["choices"] = {
                "label": list(result["options"].keys()),
                "text": [text.lower() for text in result["options"].values()],
            }

            if "concept" in result:
                rephrased_data["question_concept"] = result["concept"]

            rephrased_data["answerKey"] = result["question_answer"]

        rephrased_results.append(clean_data(rephrased_data))

    # Specify the path to the CSV file
    rephrased_file_path = f'{snapshot}/{s}_rephrased_{snapshot}.csv'

    # Column order follows the keys of the first row
    header = rephrased_results[0].keys()

    # Write the data to the CSV file
    with open(rephrased_file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        writer.writeheader()
        writer.writerows(rephrased_results)

    print(f'CSV file "{rephrased_file_path}" has been created with the data.')

Process data on split: test
Initialize response history


  0%|          | 0/236 [00:00<?, ?it/s]

  1%|          | 2/236 [00:04<08:18,  2.13s/it]

Caught exception, wait for 1 min...


  2%|▏         | 5/236 [02:07<1:36:51, 25.16s/it]

Caught exception, wait for 1 min...


 11%|█         | 25/236 [04:34<03:39,  1.04s/it] 

Caught exception, wait for 1 min...


 12%|█▏        | 28/236 [06:38<59:53, 17.28s/it]  

Caught exception, wait for 1 min...


 16%|█▌        | 38/236 [08:44<24:55,  7.55s/it]  

Caught exception, wait for 1 min...


 19%|█▊        | 44/236 [10:47<43:08, 13.48s/it]

Caught exception, wait for 1 min...


 21%|██        | 49/236 [12:55<43:32, 13.97s/it]  

Caught exception, wait for 1 min...


 36%|███▌      | 85/236 [15:46<04:13,  1.68s/it]  

Caught exception, wait for 1 min...


 50%|████▉     | 117/236 [18:19<02:12,  1.12s/it] 

Caught exception, wait for 1 min...


 53%|█████▎    | 125/236 [20:33<09:30,  5.14s/it]

Caught exception, wait for 1 min...


 63%|██████▎   | 148/236 [23:05<02:06,  1.44s/it]  

Caught exception, wait for 1 min...


 64%|██████▎   | 150/236 [25:08<28:01, 19.56s/it]

Caught exception, wait for 1 min...


 67%|██████▋   | 157/236 [27:17<13:22, 10.16s/it]

Caught exception, wait for 1 min...


 73%|███████▎  | 173/236 [29:34<04:51,  4.62s/it]

Caught exception, wait for 1 min...


 81%|████████  | 190/236 [32:02<01:44,  2.27s/it]

Caught exception, wait for 1 min...


 94%|█████████▎| 221/236 [34:56<00:30,  2.00s/it]

Caught exception, wait for 1 min...


100%|██████████| 236/236 [37:08<00:00,  9.44s/it]

CSV file "v3-gpt3_5-1106/test_rephrased_v3-gpt3_5-1106.csv" has been created with the data.





In [8]:
# Smoke test: send the first "test" row through the chat endpoint with an empty
# history (so nothing is served from cache) and print the raw reply.
# NOTE(review): this cell rebinds `data` and `model_name`, which the run cell
# also uses, and it leaves `d` and `response` behind for the cell that follows.
# Its execution count (8) is lower than that of the run cells above it, so the
# notebook was not executed top to bottom.
test = all_data["test"]
data = [test[0]]

model_name = "gpt-3.5-turbo-1106"
for d in data:
    print(d)
    # An empty dict is passed as history, so the API is always called here
    prompt, response = rephrase_csv_data(d, model_name, {}, api_type="chat")
    print(response)

{'id': '90b30172e645ff91f7171a048582eb8b', 'question': 'The townhouse was a hard sell for the realtor, it was right next to a high rise what?', 'question_concept': 'townhouse', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['suburban development', 'apartment building', 'bus stop', 'michigan', 'suburbs']}, 'answerKey': '', 'concept': False, 'name': False, 'option': True}
Caught exception, wait for 1 min...
{
  "question": "What is the most common type of housing in Jakarta?",
  "concept": "townhouse",
  "options": {
    "A": "apartment building",
    "B": "traditional house",
    "C": "office building",
    "D": "shopping mall",
    "E": "suburban development"
  },
  "question_answer": "E"
}


In [24]:
# Post-process the reply left in `response` for the row left in `d` by an earlier cell.
# NOTE(review): the recorded output below is for a different question than the
# reply printed by the preceding cell, so this cell ran against other kernel
# state. Re-run from a fresh kernel before trusting it.
result = postprocess_result(d, response)
print(result)

{'question': "Siti couldn't maintain her grip on Budi during the Pencak Silat match. He kept slipping away. With the bout extending for an extended period, she decided to do what?", 'concept': 'grip', 'options': {'A': 'release', 'B': 'tighten', 'C': 'surrender', 'D': 'maintain', 'E': 'lower'}, 'question_answer': 'A'}
