### Import Libraries

In [1]:
import os
import ast
import csv
import time
import openai
import pandas as pd

from dotenv import load_dotenv
from tqdm import tqdm

### Load Environment Variables

In [2]:
load_dotenv()

snapshot = "v3-gpt4-1106"
api_key = os.environ.get("API_KEY1")
org_key = os.environ.get("ORG_KEY1")

### Load CSV Rephrase Data

In [3]:
# Specify the path to your CSV file
split = ["validation", "test", "train"]
# split = ["test"]

all_data = {}

for s in split:
    csv_file_path = f'./{snapshot}/{s}_rephrased_{snapshot}.csv'

    # Initialize an empty list to store the data
    data_list = []

    # Open the CSV file for reading
    with open(csv_file_path, newline='') as csvfile:
        # Create a CSV reader object
        csv_reader = csv.DictReader(csvfile)
        
        # Iterate through each row in the CSV file
        for row in csv_reader:
            # Append the row (as a dictionary) to the data_list
            row["choices"] = ast.literal_eval(row["choices"])

            rephrase_params = ["concept", "name", "option"]
            for param in rephrase_params:
                if row[param] == "True":
                    row[param] = True
                elif row[param] == "False":
                    row[param] = False
                else:
                    raise TypeError(f"{param} data cannot be recognized")

            data_list.append(row)
    
    all_data[s] = data_list

### Generate Prompt

In [4]:
# Function to generate choice text
def generate_choices_text(choices):
    labels = choices["label"]
    texts = choices["text"]

    choice_text = ""
    for idx, label in enumerate(labels):
        choice_text += f'{label}. "{texts[idx]}"\n'
    
    return choice_text

# Function to generate answer text
def generate_answer_text(choices, answerKey):
    idx = choices["label"].index(answerKey)
    answer_text = f'{answerKey}. "{choices["text"][idx]}"'
    
    return answer_text

# Function to generate prompts based on the conditions
def generate_rephrase_name_prompt(row):
    return f"""Change all names in the given question to Indonesian names. Change only the names, keep all other phrases in the question the same and keep it all in Indonesian.

Question: {row['question']}
Changed Question:"""

### Rephrase Function

In [5]:
def get_openai_chat_completion(input_prompt, model_name):
    completion = openai.ChatCompletion.create(
        model=model_name,
        messages=[
            {
                'role': 'user',
                'content': input_prompt 
            }
        ],
        temperature=0.3,
        request_timeout=60,
    )
    return completion

# Define a function to rephrase the CSV data using OpenAI GPT-3.5-Turbo
def rephrase_csv_data(row, model_name, history):
    input_prompt = generate_rephrase_name_prompt(row)

    if input_prompt in history.keys():
        return input_prompt, history[input_prompt]["response"]

    try:
        completion = get_openai_chat_completion(input_prompt, model_name)
    except Exception:
        print('Caught exception, wait for 1 min...')
        time.sleep(60)
        completion = get_openai_chat_completion(input_prompt, model_name)
    
    response = completion.choices[0].message.content.strip()
            
    return input_prompt, response

def postprocess_result(response):
    if "Question: " in response:
        rephrased_result = response.split("Question: ")[-1]
    else:
        rephrased_result = response
    
    return rephrased_result

### Run Rephrase

In [6]:
openai.api_key = api_key
openai.organization = org_key

model_name = "gpt-3.5-turbo"

In [19]:
for s in ["train"]:
    print(f"Process data on split: {s}")
    
    history_path = f"./{snapshot}/{s}_name_history_{snapshot}.csv"
    if os.path.exists(history_path):
        print(f"Load response history from file {history_path}")
        resp_history_df = pd.read_csv(history_path, converters={'response': lambda x: ast.literal_eval(x)})
        response_history = dict(zip(resp_history_df.prompt, resp_history_df.response))
    else:
        print(f"Initialize response history")
        response_history = {}

    rephrased_results = []
    for data in tqdm(all_data[s]):
        rephrased_data = data.copy()

        if data["name"]:
            prompt, response = rephrase_csv_data(data, model_name, response_history)
            response_history[prompt] = {"response": response}
        
            resp_history_df = pd.DataFrame({'prompt': response_history.keys(), 'response': response_history.values()})
            resp_history_df.to_csv(history_path, index=False)

            result = postprocess_result(response)
            if isinstance(result, str):
                rephrased_data["question"] = result

        rephrased_results.append(rephrased_data)

    # Specify the path to the CSV file
    rephrased_file_path = f'./{snapshot}/{s}_rephrased_name_{snapshot}.csv'

    # Get the keys from the first dictionary
    header = rephrased_results[0].keys()

    # Write the data to the CSV file
    with open(rephrased_file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        
        # Write the header
        writer.writeheader()
        
        # Write the data
        for row in rephrased_results:
            writer.writerow(row)

    print(f'CSV file "{rephrased_file_path}" has been created with the data.')


Process data on split: train
Initialize response history


 93%|█████████▎| 2002/2162 [23:01<02:06,  1.27it/s]

Caught exception, wait for 1 min...


100%|██████████| 2162/2162 [26:51<00:00,  1.34it/s]  

CSV file "./v3-gpt4-1106/train_rephrased_name_v3-gpt4-1106.csv" has been created with the data.





In [12]:
test = all_data["test"]
data = [test[2]]

model_name = "gpt-3.5-turbo-1106"
for d in data:
    print(d)
    prompt, response = rephrase_csv_data(d, model_name, {})
    print(response)

{'id': '5929f5704637184dc3390dd6964cacca', 'question': 'John khawatir ketika sebuah lubang besar muncul di halaman rumahnya di Indonesia. Jika lubang itu lebih besar, mungkin bisa menelan apa?', 'question_concept': 'lubang besar', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['rumahnya', 'rumah tetangga', 'ruang cuci', 'mobilnya', 'apartemennya']}, 'answerKey': 'A', 'concept': True, 'name': True, 'option': False}
Budi khawatir ketika sebuah lubang besar muncul di halaman rumahnya di Indonesia. Jika lubang itu lebih besar, mungkin bisa menelan apa?


In [17]:
test = all_data["validation"]

In [18]:
for idx, data in enumerate(rephrased_results):
    if data["question"] != test[idx]["question"]:
        print(f"IDX: {idx}")
        print(f"Raw: {test[idx]['question']}")
        print(f"Name: {data['question']}")


IDX: 1
Raw: James sedang mencari tempat yang baik untuk membeli lahan pertanian. Di mana dia sebaiknya mencari?
Name: Budi sedang mencari tempat yang baik untuk membeli lahan pertanian. Di mana dia sebaiknya mencari?
IDX: 4
Raw: Janet menonton film tersebut karena dia menyukai apa?
Name: Anita menonton film tersebut karena dia menyukai apa?
IDX: 7
Raw: Joko ingin menemukan peta bawah tanah kuno dari tahun 50-an. Di mana dia mungkin mencarinya?
Name: Budi ingin menemukan peta bawah tanah kuno dari tahun 50-an. Di mana dia mungkin mencarinya?
IDX: 8
Raw: Budi terburu-buru untuk pulang ke rumahnya di Jakarta, tetapi lampu lalu lintas berubah menjadi kuning dan dia terpaksa melakukan apa?
Name: Ahmad terburu-buru untuk pulang ke rumahnya di Jakarta, tetapi lampu lalu lintas berubah menjadi kuning dan dia terpaksa melakukan apa?
IDX: 9
Raw: Kadal bernama Bob tinggal di tempat yang hangat dengan banyak air. Di manakah dia kemungkinan besar tinggal?
Name: Kadal bernama Budi tinggal di tempat 

In [12]:
test[0]

{'id': '90b30172e645ff91f7171a048582eb8b',
 'question': 'Rumah susun tersebut sulit dijual oleh agen properti, karena tepat berada di samping gedung apa?',
 'question_concept': 'rumah susun',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['perkembangan pinggiran kota',
   'gedung apartemen',
   'halte bus',
   'jakarta',
   'pinggiran kota']},
 'answerKey': 'B',
 'concept': False,
 'name': False,
 'option': True}