In [None]:
# Fold number: selects the input CSV below and is reused in the names of the files this notebook writes.
fold = 0

import pandas as pd

# Wrong predictions of the selected fold (columns: text, label).
wrong_predictions_path = f"../../../training/discourse/fourth-round/wrong-predictions-fold-{fold}.csv"
df = pd.read_csv(wrong_predictions_path)

df.head()

In [None]:
# Number of rows per label, as a plain {label: count} dictionary.
label_series = df["label"]
label_count = label_series.value_counts().to_dict()
label_count

In [None]:
# Size of the most frequent class.
max_label_count = max(label_count.values())

# Per label: how many more samples it would need to match the most frequent class.
label_count_diff = {
    label: max_label_count - label_count[label]
    for label in label_count
}

label_count_diff

In [None]:
# Number of augmentations to request per sentence of each label: the label's
# deficit divided (integer division) by the number of sentences it already has.
augmentation_count_needed = {
    label: label_count_diff[label] // count
    for label, count in label_count.items()
}
augmentation_count_needed

In [None]:
# Maps the integer value found in df["label"] to the discourse element name used in the prompts.
MAPPING = {
    0: "Evidence",
    1: "Claim",
    2: "Concluding Statement",
    3: "Lead",
    4: "Position",
    5: "Counterclaim",
    6: "Rebuttal"
}

# Instruction message shared by every request: defines the seven discourse elements.
developer_prompt = "You are an expert in analyzing persuasive essays and understanding argumentative and discourse elements. The discourse elements are: Lead, Position, Claim, Counterclaim, Rebuttal, Evidence, and Concluding Statement. Lead refers to an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader's attention and point toward the thesis. Position refers to an opinion or conclusion on the main question. Claim refers to a claim that supports the position. Counterclaim refers to a claim that refutes another claim or gives an opposing reason to the position. Rebuttal refers to a claim that refutes a counterclaim. Evidence refers to ideas or examples that support claims, counterclaims, rebuttals, or the position. Concluding statement refers to a concluding statement that restates the position and claims."
user_prompts = []  # one {"id": <df index of the source row>, "prompt": <user prompt>} per requested augmentation

# random seed
import random
random.seed(42)

for index, row in df.iterrows():
    # The same prompt is queued once per augmentation needed for the row's label.
    for _ in range(augmentation_count_needed[row["label"]]):
        element_name = MAPPING[row["label"]]
        prompt = (
            "This sentence is from the " + element_name + " discourse element:\n"
            + row["text"]
            + "\n\nFirst, think step by step on why this is a sentence of the " + element_name
            + " discourse element. Then, think step by step, and finally in the last line of your response (after putting a line break), please write a sentence that addresses the same topic, focusing on the "
            + element_name
            + " discourse element. You should use different names, words, and terminologies in your output, but the overall meaning and content should be the same and refer to the same discourse element. In the *last line* of your output, just put the sentence and nothing else."
        )
        # Store the row's DataFrame index as the id. The bare name `id` is Python's
        # builtin function, which would end up formatted into every custom_id.
        user_prompts.append({"id": index, "prompt": prompt})


In [None]:
# Total number of augmentation prompts generated.
len(user_prompts)

In [None]:
# Print the first prompt in full to check the template by eye.
print(user_prompts[0]["prompt"])

In [None]:
import openai
from openai import OpenAI

# Read the API key from a local text file, dropping surrounding whitespace.
with open("../../../api_key.txt", "r") as key_file:
    api_key = key_file.read().strip()

# The key is set on an explicit client instance and on the module-level attribute.
openai_client = OpenAI(api_key=api_key)
openai.api_key = api_key

def return_message_from_openai(messages, temperature = 1, model = "gpt-4o-2024-08-06"):
    """Send one chat completion request and return the text of the first choice.

    Uses the module-level ``openai_client``.

    Args:
        messages: Chat messages passed through unchanged to
            ``chat.completions.create``.
        temperature: Sampling temperature passed through to the API.
        model: Model name to query; defaults to the model this notebook
            uses for its batch requests.

    Returns:
        The ``content`` of the first choice's message.
    """
    # No `global` statement is needed: the client is only read, never rebound.
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    return response.choices[0].message.content

In [None]:
import json

# One chat-completions request per prompt. custom_id carries the 1-based request
# number followed by the prompt's id.
jsonl_data = [
    {
        "custom_id": f"request-{position}-id-{prompt['id']}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-2024-08-06",
            "messages": [
                {"role": "developer", "content": developer_prompt},
                {"role": "user", "content": prompt["prompt"]}
            ],
            "temperature": 1.0
        }
    }
    for position, prompt in enumerate(user_prompts, start=1)
]

# Write the requests as JSON Lines: one JSON object per line.
with open(f"fifth-round-fold-{fold}.jsonl", "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in jsonl_data)


# Batch API

In [None]:
# Upload the JSONL request file for batch processing. The context manager closes
# the local file handle once the upload call has returned (the bare open() used
# before left it open for the lifetime of the kernel).
with open(f"fifth-round-fold-{fold}.jsonl", "rb") as batch_request_file:
    batch_input_file = openai_client.files.create(
        file=batch_request_file,
        purpose="batch"
    )

print(batch_input_file)

In [None]:
# Create a batch job over the uploaded request file and show its id and initial status.
batch_input_file_id = batch_input_file.id
batch_metadata = {
    "description": f"Fifth Round (fold {fold})"
}
created_batch = openai_client.batches.create(
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file_id,
    completion_window="24h",
    metadata=batch_metadata
)
(created_batch.id, created_batch.status)

In [None]:
# Fetch the batch again by id and show its current status.
batch = openai_client.batches.retrieve(created_batch.id)
batch.status

In [None]:
# Download the content of the batch's output file.
# NOTE(review): presumably output_file_id is only populated once the batch has finished — confirm batch.status before running this cell.
file_response = openai_client.files.content(batch.output_file_id)

In [None]:
# Rebuild the label of every generated request by repeating each row's label
# once per augmentation requested for that label.
all_labels_to_use = []
keys = list(augmentation_count_needed.keys())
for _, row in df.iterrows():
    row_label = row["label"]
    all_labels_to_use.extend([row_label] * augmentation_count_needed[row_label])

# Sanity check: number of expected labels next to the number of non-empty response lines.
response_line_count = sum(1 for line in file_response.text.split("\n") if line)
(len(all_labels_to_use), response_line_count)

In [None]:
# Build the augmented dataset (columns: id, text, label, original_id) from the batch output.
# Batch output lines are not guaranteed to come back in input order, so each label is
# looked up through the request number embedded in custom_id ("request-<n>-id-<...>",
# where n is the 1-based position of the request in the input file) instead of through
# the position of the line in the output.
records = []
for line in file_response.text.split("\n"):
    if not line:
        continue
    data = json.loads(line)
    custom_id = data["custom_id"]
    request_number = int(custom_id.split("-")[1])
    content = data["response"]["body"]["choices"][0]["message"]["content"]
    # The prompt asks for the new sentence on the last line of the response. Strip
    # first so a trailing line break does not produce an empty sentence.
    text = content.strip().split("\n")[-1].strip()
    records.append({
        "id": custom_id,
        "text": text,
        "label": all_labels_to_use[request_number - 1],
        "original_id": custom_id
    })

# Create the frame once from all records instead of concatenating row by row.
result_df = pd.DataFrame(records, columns=["id", "text", "label", "original_id"])

result_df.head()

In [None]:
# Save the augmented sentences for this fold as CSV, without the DataFrame index column.
result_df.to_csv(f"fifth-round-fold-{fold}.csv", index=False)