In [None]:
import pandas as pd

# Fold number for this run; it is interpolated into the file names this notebook writes.
fold = 0

# Load the input table. NOTE(review): the path is a placeholder f-string with no
# fields — presumably the real path interpolated `fold`; confirm before running.
df = pd.read_csv(f"ANON")
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Class id -> name of the Gibbs reflective cycle component it stands for.
MAPPING = dict(enumerate([
    "Description",
    "Feelings",
    "Evaluation",
    "Analysis",
    "Conclusion",
    "Action Plan",
]))

# System prompt shared by every request.
# NOTE(review): "refers describing to" reads like a typo for "refers to describing";
# left untouched here because editing the prompt changes what the model sees.
developer_prompt = "You are an expert in the Gibbs reflective cycle. The components of the Gibbs reflective cycle are: Description, Feelings, Evaluation, Analysis, Conclusion, and Action Plan. Description refers describing to the event or experience you are reflecting on. Feelings refers to your emotions during the event or experience. Evaluation refers to your thoughts about the event or experience, providing positive and negative aspects on what happened. Analysis refers to your understanding of the event or experience, providing reasons behind points mentioned in the Evaluation aspect. Conclusion refers to what you learned from the event or experience. Action Plan refers to what you would do differently in the future (next time). Always respond in English."


def build_batch_request(index, row):
    """Build one Batch API request line for a single DataFrame row.

    Args:
        index: Index label of the row; embedded in ``custom_id``.
        row: Row providing the ``id`` and ``text`` fields.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys describing
        a chat-completion request for the row's text.
    """
    user_prompt = (
        "Please specify the Gibbs reflective cycle component that the following sentence belongs to:\n\n'''\n"
        + row["text"]
        + "\n'''\n\nLet's think step by step. Respond in English. In the last line of your reply, you should just write the name of the component, and nothing else."
    )
    chat_messages = [
        {"role": "system", "content": developer_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return {
        "custom_id": f"request-{index}-id-{row['id']}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-2024-08-06",
            "messages": chat_messages,
            "temperature": 0.5,
        },
    }


# One request per row of the input frame, in row order.
final_jsonl_data = [build_batch_request(index, row) for index, row in df.iterrows()]

# Write the requests to the per-fold JSONL file.
import jsonl
jsonl.dump(final_jsonl_data, f"filter-and-relabel-fold-{fold}.jsonl")




In [None]:
import openai
from openai import OpenAI

# Read the API key from a text file located three directories above the notebook.
with open("../../../api_key.txt", "r") as key_file:
    api_key = key_file.read().strip()

# Set the key on the module and also build an explicit client with the same key.
openai.api_key = api_key
openai_client = OpenAI(api_key=api_key)


def return_message_from_openai(messages, temperature = 0.5):
    """Send a single chat-completion request and return the reply text.

    Args:
        messages: Chat messages, passed to the API unchanged.
        temperature: Sampling temperature for the request (default 0.5).

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    completion = openai_client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=messages,
        temperature=temperature
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Batch API

In [None]:
# Upload the per-fold request file for use with the Batch API. The context manager
# closes the file handle once the upload call returns, instead of leaving it open
# for the lifetime of the kernel.
with open(f"filter-and-relabel-fold-{fold}.jsonl", "rb") as batch_requests_file:
    batch_input_file = openai_client.files.create(
        file=batch_requests_file,
        purpose="batch"
    )

# Show the returned file object (its id is needed to create the batch).
print(batch_input_file)

In [None]:
# Submit a batch job that runs every request in the uploaded file against the
# chat-completions endpoint within a 24h completion window.
batch_input_file_id = batch_input_file.id
batch_settings = {
    "input_file_id": batch_input_file_id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
    "metadata": {
        "description": "Filter-and-relabel Fine-tune"
    },
}
created_batch = openai_client.batches.create(**batch_settings)

In [None]:
# Fetch the current state of the submitted batch and display its status.
batch = openai_client.batches.retrieve(created_batch.id)
batch.status

In [None]:
# A batch has no output file id until it has produced results, so fail with an
# explicit message rather than handing None to the Files API.
if batch.output_file_id is None:
    raise RuntimeError(
        f"Batch {batch.id} has no output file (status: {batch.status}). "
        "Re-run the status cell until the batch has completed, then retry."
    )

# Download the batch results (one JSON document per line).
file_response = openai_client.files.content(batch.output_file_id)

In [None]:
def convert_last_line_to_class_id(last_line):
    """Translate a free-text answer line into a class id from MAPPING.

    Matching is case-insensitive and substring-based. Components are tried in
    MAPPING order and the first one found in the line wins.

    Args:
        last_line: Text expected to mention one component name.

    Returns:
        The MAPPING key of the first component found, or 0 when none is found.
    """
    lowered_line = last_line.lower()
    return next(
        (
            class_id
            for class_id, component in MAPPING.items()
            if component.lower() in lowered_line
        ),
        0,
    )

# Quick check: a mixed-case mention of "Description" resolves to class id 0.
convert_last_line_to_class_id("Hi descRiption hey")

In [None]:
# Re-display the input frame for reference before the model output is joined onto it.
df

In [None]:
import json

# Join every batch response back to the input row that produced it and build a
# frame with the columns: id, text, new_label, original_label, original_id.
RESULT_COLUMNS = ["id", "text", "new_label", "original_label", "original_id"]

result_records = []
for line in file_response.text.split("\n"):
    if not line.strip():
        continue  # blank line, e.g. after the final newline of the output file
    data = json.loads(line)
    custom_id = data["custom_id"]

    # The prompt asks for the component name on the last line of the reply. Strip
    # first so a trailing newline does not turn the "last line" into an empty
    # string, which would silently be labelled as class 0.
    content = data["response"]["body"]["choices"][0]["message"]["content"]
    last_line = content.strip().split("\n")[-1]
    gpt_determined_class_id = convert_last_line_to_class_id(last_line)

    # custom_id was written as "request-{index}-id-{row id}". Look the source row up
    # by its index label instead of a regex substring search on the id column, which
    # could match the wrong row or no row at all.
    # Assumes df still has the integer index it had when the requests were built.
    source_index = int(custom_id.split("-", 2)[1])
    source_row = df.loc[source_index]

    # Guard against df having been reloaded or reordered since the requests were built.
    expected_row_id = custom_id.split("-id-", 1)[1]
    if str(source_row["id"]) != expected_row_id:
        raise ValueError(
            f"Row {source_index} has id {source_row['id']!r}, "
            f"but the response {custom_id!r} expects {expected_row_id!r}"
        )

    result_records.append({
        "id": custom_id,
        "text": source_row["text"],
        "new_label": gpt_determined_class_id,
        "original_label": source_row["label"],
        # Last dash-separated field of custom_id, parsed as an integer.
        "original_id": int(custom_id.split("-")[-1]),
        # "test_fold" was left out of the output in the original cell.
    })

# Build the frame once from the collected records instead of concatenating one
# row at a time; `columns` keeps the schema even when there are no responses.
result_df = pd.DataFrame(result_records, columns=RESULT_COLUMNS)

result_df.head()

In [None]:
# Show only the rows whose model-assigned label differs from the original label.
label_changed = result_df["new_label"] != result_df["original_label"]
result_df[label_changed]

In [None]:
# Save the relabelled rows to the per-fold CSV, without the index column.
output_csv_path = f"filter-and-relabel-fold-{fold}-outputs.csv"
result_df.to_csv(output_csv_path, index=False)