In [None]:
import pandas as pd

# Load the source dataset; every later cell reads from this DataFrame.
df = pd.read_csv("ANON")

# Preview the first rows to confirm the expected columns are present.
df.head() # columns: text, label, id, test_fold

In [None]:
# Count how many rows each label has and keep the result as a {label: count} dict.
label_count = df['label'].value_counts().to_dict()
label_count

In [None]:
# Size of the largest class; every other class is measured against it.
max_label_count = max(label_count.values())

# For each label, the number of extra rows it would need to match the largest class.
label_count_diff = {
    label: max_label_count - label_count[label]
    for label in label_count
}

label_count_diff

In [None]:
# Number of augmentations to generate per existing sentence of each class:
# the class's shortfall floor-divided by the number of sentences it already has.
augmentation_count_needed = {
    label: label_count_diff[label] // existing_count
    for label, existing_count in label_count.items()
}
augmentation_count_needed

In [None]:
# Name of each Gibbs reflective cycle component, keyed by the integer class label.
MAPPING = {
    0: "Description",
    1: "Feelings",
    2: "Evaluation",
    3: "Analysis",
    4: "Conclusion",
    5: "Action Plan"
}

# Instruction sent with the "developer" role alongside every user prompt.
developer_prompt = "You are an expert in the Gibbs reflective cycle. The components of the Gibbs reflective cycle are: Description, Feelings, Evaluation, Analysis, Conclusion, and Action Plan. Description refers describing to the event or experience you are reflecting on. Feelings refers to your emotions during the event or experience. Evaluation refers to your thoughts about the event or experience, providing positive and negative aspects on what happened. Analysis refers to your understanding of the event or experience, providing reasons behind points mentioned in the Evaluation aspect. Conclusion refers to what you learned from the event or experience. Action Plan refers to what you would do differently in the future (next time). Always respond in German."
user_prompts = [] # {id: XX, prompt: YY}

# Emit one prompt per requested augmentation of each sentence; rows whose class
# needs zero augmentations contribute nothing.
for row_index, row in df.iterrows():
    label = row["label"]
    for copy_number in range(augmentation_count_needed[label]):
        component = MAPPING[label]
        # Assemble the prompt from fixed text fragments around the component name and the sentence.
        prompt_parts = [
            "This sentence is a reflective sentence from the ",
            component,
            " component of the Gibbs reflective cycle:\n",
            row["text"],
            "\n\nFirst, think step by step on why this is a sentence of the ",
            component,
            " component of the Gibbs reflective cycle. Then, think step by step, and finally in the last line of your response (after putting a line break), please write a sentence that reflects on the same event or experience, focusing on the ",
            component,
            " component of the Gibbs reflective cycle. You should use different names, words, and terminologies in your output, but the overall meaning and content should be the same and refer to the same Gibbs component. In the *last line* of your output, just put the sentence and nothing else. Respond in German.",
        ]
        user_prompts.append({"id": row["id"], "prompt": "".join(prompt_parts)})


In [None]:
# Number of augmentation prompts generated above.
len(user_prompts)

In [None]:
import openai
from openai import OpenAI

# Read the API key from a text file (path relative to the working directory) so
# the key itself does not appear in the notebook.
with open("../../../api_key.txt", "r") as f:
    api_key = f.read().strip()

# Set the key on the openai module and also build an explicit client object;
# the visible cells in this notebook make their API calls through `openai_client`.
openai.api_key = api_key
openai_client = OpenAI(api_key=api_key)

def return_message_from_openai(messages, temperature = 1):
    """Send a chat conversation to the API and return the reply text.

    Args:
        messages: Chat messages, forwarded unchanged as the request's ``messages``.
        temperature: Sampling temperature forwarded to the request (default 1).

    Returns:
        The ``content`` of the message in the first choice of the response.
    """
    # The module-level ``openai_client`` is only read here, so no ``global``
    # declaration is required.
    completion = openai_client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=messages,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Smoke test: send one generated prompt through the live API and print the
# prompt entry followed by the model's reply.
sample_prompt = user_prompts[6]
print(sample_prompt)
print("=====")

sample_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": sample_prompt["prompt"]},
]
print(return_message_from_openai(sample_messages))

In [None]:
import json

# Build one batch request per prompt. The custom_id combines a 1-based running
# request number with the source row id, which is the last "-"-separated field.
jsonl_data = [
    {
        "custom_id": f"request-{request_number}-id-{item['id']}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-2024-08-06",
            "messages": [
                {"role": "developer", "content": developer_prompt},
                {"role": "user", "content": item["prompt"]},
            ],
            "temperature": 1.0,
        },
    }
    for request_number, item in enumerate(user_prompts, start=1)
]

# Write the requests as JSON Lines: one serialized request per line.
with open("chain-of-thought-requests.jsonl", "w") as requests_out:
    requests_out.writelines(json.dumps(request) + "\n" for request in jsonl_data)


# Batch API

In [None]:
# Upload the JSONL request file for batch processing. The file is opened in a
# `with` block so the handle is closed as soon as the upload call returns,
# instead of staying open for the lifetime of the kernel.
with open("chain-of-thought-requests.jsonl", "rb") as requests_file:
    batch_input_file = openai_client.files.create(
        file=requests_file,
        purpose="batch"
    )

print(batch_input_file)

In [None]:
# Start a batch job over the uploaded request file. The object returned by
# `batches.create` is shown as the cell output and is not stored in a variable.
batch_input_file_id = batch_input_file.id
openai_client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "Chain-of-thought"
    }
)

# Retrieve batch results

In [None]:
# Fetch a previously created batch by its id and display it.
batch = openai_client.batches.retrieve("ANON")
batch

In [None]:
# Download the contents of the batch's output file.
# NOTE(review): presumably `output_file_id` is only populated once the batch has finished — confirm before running.
file_response = openai_client.files.content(batch.output_file_id)

In [None]:
import json
# Build a DataFrame of generated sentences (id, text, label, original_id, test_fold)
# from the batch output, copying label and test_fold from the matching row of df.
result_columns = ["id", "text", "label", "original_id", "test_fold"]

# One-time lookup of label/test_fold by source id, so each output line does not
# rescan df. If an id occurs more than once, the first occurrence is used.
source_meta = df.drop_duplicates(subset="id").set_index("id")[["label", "test_fold"]]

records = []
for line in file_response.text.split("\n"):
    # Skip blank lines (e.g. the empty string after the final newline).
    if not line.strip():
        continue
    data = json.loads(line)

    # custom_id looks like "request-<n>-id-<source id>"; the last "-" field is
    # the id of the row the prompt was generated from. Parse it once.
    original_id = int(data["custom_id"].split("-")[-1])

    # The prompt asks for the new sentence on the last line of the reply, so
    # only that line is kept.
    content = data["response"]["body"]["choices"][0]["message"]["content"].strip()

    records.append({
        "id": data["custom_id"],
        "text": content.split("\n")[-1],
        "label": source_meta.at[original_id, "label"],
        "original_id": original_id,
        "test_fold": source_meta.at[original_id, "test_fold"],
    })

# Create the frame once from all records; concatenating row by row inside the
# loop re-copies the whole frame on every iteration.
result_df = pd.DataFrame(records, columns=result_columns)

result_df.head()

In [None]:
# Save the generated rows to CSV, leaving out the DataFrame index.
result_df.to_csv("chain-of-thought-outputs.csv", index=False)