# Make the fine-tuning file

In [None]:
# Fold selected for this run: rows whose `test_fold` equals this value are
# excluded from the data used below, and the value is embedded in the names
# of the files this notebook writes.
fold = 0

In [None]:
import pandas as pd

# Load every sentence together with its fold assignment.
main_df = pd.read_csv(
    "../../../input-data-all-together/discourse/sentence-only-with-folds.csv"
)

# Keep only the rows whose test_fold differs from `fold`, i.e. everything
# outside the selected test fold.
outside_test_fold = main_df["test_fold"] != fold
df = main_df[outside_test_fold]

# Preview; expected columns: text-id, text, label, id, test_fold
df.head()

In [None]:
# Shuffle all rows (frac=1 samples the whole frame); random_state=42 fixes
# the seed so the shuffled order is reproducible.
df = df.sample(frac=1, random_state=42)

In [None]:
# Integer class label (the `label` column) -> discourse element name that is
# inserted into the prompts.
MAPPING = {
    0: "Evidence",
    1: "Claim",
    2: "Concluding Statement",
    3: "Lead",
    4: "Position",
    5: "Counterclaim",
    6: "Rebuttal"
}

# Instruction text describing the seven discourse elements. Despite the
# variable name, it is sent with the "system" role wherever it is used in
# this notebook.
developer_prompt = "You are an expert in analyzing persuasive essays and understanding argumentative and discourse elements. The discourse elements are: Lead, Position, Claim, Counterclaim, Rebuttal, Evidence, and Concluding Statement. Lead refers to an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader's attention and point toward the thesis. Position refers to an opinion or conclusion on the main question. Claim refers to a claim that supports the position. Counterclaim refers to a claim that refutes another claim or gives an opposing reason to the position. Rebuttal refers to a claim that refutes a counterclaim. Evidence refers to ideas or examples that support claims, counterclaims, rebuttals, or the position. Concluding statement refers to a concluding statement that restates the position and claims."

# One chat-format training example per row: the system prompt, a user request
# naming the row's discourse element, and the row's sentence as the
# assistant's answer.
final_jsonl_data = [
    {
        "messages": [
            {"role": "system", "content": developer_prompt},
            {
                "role": "user",
                "content": "Please output a sentence that belongs to the "
                + MAPPING[example["label"]]
                + " discourse element. Just output the sentence and nothing else.",
            },
            {"role": "assistant", "content": example["text"]},
        ]
    }
    for _, example in df.iterrows()
]

# Write the examples to a JSONL file named after the current fold.
import jsonl
jsonl.dump(final_jsonl_data, f"jsonl-gpt-fold-{fold}.jsonl")

# Augment Data with the Fine-tuned Model

In [None]:
# Number of rows per label in the current split, stored as {label: count}.
rows_per_label = df["label"].value_counts()
label_count = rows_per_label.to_dict()
label_count

In [None]:
# Size of the largest class; every other class is measured against it.
max_label_count = max(label_count.values())

# How many extra rows each class would need to match the largest class.
label_count_diff = {
    label: max_label_count - label_count[label] for label in label_count
}

label_count_diff

In [None]:
# Number of sentences to generate per existing sentence of each class: the
# class's shortfall, floor-divided by the number of sentences it already has.
augmentation_count_needed = {
    label: label_count_diff[label] // count
    for label, count in label_count.items()
}
augmentation_count_needed

In [None]:
import openai
from openai import OpenAI
import backoff

# Read the API key from a local file, dropping surrounding whitespace.
with open("../../../api_key.txt", "r") as key_file:
    api_key = key_file.read().strip()

# Client instance used for the chat-completion requests below.
openai_client = OpenAI(api_key=api_key)
# The key is also set on the module-level attribute.
openai.api_key = api_key

@backoff.on_exception(backoff.expo, openai.OpenAIError, max_time=120)
def return_sentence_from_openai(class_name, temperature=0.5):
    """Request one sentence of the given discourse element from the chat model.

    The request consists of the shared system prompt (`developer_prompt`)
    followed by a user message asking for a sentence of `class_name`.
    Calls that raise `openai.OpenAIError` are retried with exponential
    backoff, bounded by `max_time=120`.

    Args:
        class_name: Discourse element name to insert into the user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first choice in the response.
    """
    request_text = (
        "Please output a sentence that belongs to the "
        + class_name
        + " discourse element. Just output the sentence and nothing else."
    )
    conversation = [
        {"role": "system", "content": developer_prompt},
        {"role": "user", "content": request_text},
    ]
    completion = openai_client.chat.completions.create(
        model="ANON",
        messages=conversation,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
from tqdm import tqdm

# One entry per generated sentence: {"index": ..., "text": ..., "label": ...}
sentences = []

# For every source row, request as many new sentences of the row's class as
# the per-class augmentation count asks for; the bar advances once per row.
with tqdm(total=len(df)) as progress:
    for source_index, source_row in df.iterrows():
        label = source_row["label"]
        for _ in range(augmentation_count_needed[label]):
            generated = return_sentence_from_openai(MAPPING[label])
            sentences.append(
                {"index": source_index, "text": generated, "label": label}
            )
        progress.update(1)

In [None]:
# Build the augmented dataframe with columns
# id, text-id, text, label, original_id, test_fold.
#
# The rows of `df` are walked in the same order, and with the same per-row
# counts, as when `sentences` was generated, so the running counter
# `index_of_sentence` pairs each generated sentence with its source row.
result_columns = ["id", "text-id", "text", "label", "original_id", "test_fold"]

# Collect plain dicts and build the DataFrame once at the end; calling
# pd.concat inside the loop re-copies the whole frame for every new row.
augmented_rows = []
index_of_sentence = 0
with tqdm(total=len(df)) as pbar:
    for index, row in df.iterrows():
        for i in range(augmentation_count_needed[row["label"]]):
            sentence = sentences[index_of_sentence]["text"]
            augmented_rows.append({
                "id": f"augmented-{row['id']}-{i}",
                "text-id": row["text-id"],
                "text": sentence,
                "label": row["label"],
                "original_id": row["id"],
                "test_fold": row["test_fold"],
            })
            index_of_sentence += 1
        # Advance once per source row so the bar reflects real progress.
        pbar.update(1)

# Passing `columns` keeps the column order, and keeps the columns present
# even when no rows were generated.
result_df = pd.DataFrame(augmented_rows, columns=result_columns)

result_df.head()

In [None]:
# Save the generated rows to a CSV named after the current fold; index=False
# leaves the dataframe index out of the file.
result_df.to_csv(f"fine-tune-fold-{fold}-outputs.csv", index=False)