# Make the fine-tuning file

In [None]:
# Fold to hold out: rows whose `test_fold` equals this value are excluded from
# the training data, and the value is embedded in the output file names.
fold = 0

In [None]:
import pandas as pd

# Load the full dataset, then keep every row that is NOT in the held-out fold.
main_df = pd.read_csv("ANON")
is_training_row = main_df["test_fold"] != fold
df = main_df.loc[is_training_row]

df.head() # columns: text-id, text, label, id, test_fold

In [None]:
# Shuffle the rows: frac=1 samples every row, and random_state=42 fixes the
# seed so the resulting order is reproducible.
df = df.sample(frac=1, random_state=42)

In [None]:
# Gibbs reflective cycle component name for each integer class label.
MAPPING = {
    0: "Description",
    1: "Feelings",
    2: "Evaluation",
    3: "Analysis",
    4: "Conclusion",
    5: "Action Plan",
}

# System prompt placed at the start of every training conversation.
developer_prompt = "You are an expert in the Gibbs reflective cycle. The components of the Gibbs reflective cycle are: Description, Feelings, Evaluation, Analysis, Conclusion, and Action Plan. Description refers describing to the event or experience you are reflecting on. Feelings refers to your emotions during the event or experience. Evaluation refers to your thoughts about the event or experience, providing positive and negative aspects on what happened. Analysis refers to your understanding of the event or experience, providing reasons behind points mentioned in the Evaluation aspect. Conclusion refers to what you learned from the event or experience. Action Plan refers to what you would do differently in the future (next time). Always respond in German."

# One chat-format training example per row: the system prompt, a user request
# naming the row's component, and the row's text as the assistant answer.
final_jsonl_data = [
    {
        "messages": [
            {"role": "system", "content": developer_prompt},
            {
                "role": "user",
                "content": "Please output a reflective sentence that belongs to the " + MAPPING[label] + " component of the Gibbs reflective cycle. Just output the sentence and nothing else. Respond in German.",
            },
            {"role": "assistant", "content": text},
        ]
    }
    for label, text in zip(df["label"], df["text"])
]

# Write the examples to a fold-specific JSONL file.
import jsonl
jsonl.dump(final_jsonl_data, f"jsonl-gpt-fold-{fold}.jsonl")
        


### Upload

In [None]:
import openai
from openai import OpenAI

# Read the API key from disk and create the client.
with open("../../../api_key.txt", "r") as f:
    api_key = f.read().strip()

openai.api_key = api_key
openai_client = OpenAI(api_key=api_key)

# The fine-tuning endpoint takes the ID of a file uploaded through the Files
# API, not a local path, so upload the JSONL written for this fold first.
with open(f"jsonl-gpt-fold-{fold}.jsonl", "rb") as training_fh:
    training_file = openai_client.files.create(file=training_fh, purpose="fine-tune")

# Start the fine-tuning job on the uploaded file.
job = openai_client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo-0125",
    # method={
    #     "type": "dpo",
    #     "dpo": {
    #         "hyperparameters": {"beta": 0.1},
    #     },
    # },
)

In [None]:
# Look up the current state of the fine-tuning job started in the upload cell.
# Uses the client and job object created there instead of an undefined
# `client` name and a placeholder job ID.
openai_client.fine_tuning.jobs.retrieve(job.id)

# Augment Data with the Fine-tuned Model

In [None]:
# Count the rows per label and keep the result as a plain dict
# (label -> number of rows).
label_column = df["label"]
label_count = label_column.value_counts().to_dict()
label_count

In [None]:
# Size of the largest class.
max_label_count = max(label_count.values())

# Number of rows each class is short of the largest class.
label_count_diff = {
    label: max_label_count - label_count[label]
    for label in label_count
}

label_count_diff

In [None]:
# Augmentations to generate per existing sentence of each class: the class's
# shortfall divided (floor division) by the number of sentences it already has.
augmentation_count_needed = {
    label: label_count_diff[label] // count
    for label, count in label_count.items()
}
augmentation_count_needed

In [None]:
# Integer class label -> Gibbs reflective cycle component name.
MAPPING = dict(
    enumerate(
        (
            "Description",
            "Feelings",
            "Evaluation",
            "Analysis",
            "Conclusion",
            "Action Plan",
        )
    )
)

# System prompt sent with every generation request. Adjacent string literals
# are joined by the parser, so this is one single string at runtime.
developer_prompt = (
    "You are an expert in the Gibbs reflective cycle. "
    "The components of the Gibbs reflective cycle are: Description, Feelings, Evaluation, Analysis, Conclusion, and Action Plan. "
    "Description refers describing to the event or experience you are reflecting on. "
    "Feelings refers to your emotions during the event or experience. "
    "Evaluation refers to your thoughts about the event or experience, providing positive and negative aspects on what happened. "
    "Analysis refers to your understanding of the event or experience, providing reasons behind points mentioned in the Evaluation aspect. "
    "Conclusion refers to what you learned from the event or experience. "
    "Action Plan refers to what you would do differently in the future (next time). "
    "Always respond in German."
)

In [None]:
import openai
from openai import OpenAI
import backoff

# Read the API key from disk, then create the client used for generation and
# also set the key on the module.
with open("../../../api_key.txt", "r") as key_file:
    api_key = key_file.read().strip()

openai_client = OpenAI(api_key=api_key)
openai.api_key = api_key

@backoff.on_exception(backoff.expo, openai.OpenAIError, max_time=120)
def return_sentence_from_openai(class_name, temperature=0.5):
    """Ask the chat model for one reflective sentence of the given component.

    Args:
        class_name: Name of the Gibbs reflective cycle component to request.
        temperature: Sampling temperature passed to the API.

    Returns:
        The message content of the first choice in the response.

    Calls that raise ``openai.OpenAIError`` are retried by the ``backoff``
    decorator (exponential wait, ``max_time=120``).
    """
    user_request = (
        "Please output a reflective sentence that belongs to the "
        + class_name
        + " component of the Gibbs reflective cycle. Just output the sentence and nothing else. Respond in German."
    )
    conversation = [
        {"role": "system", "content": developer_prompt},
        {"role": "user", "content": user_request},
    ]
    completion = openai_client.chat.completions.create(
        model="ANON",
        messages=conversation,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


# Quick manual check: request one sentence for the "Action Plan" component.
# This sends a real request through openai_client.
return_sentence_from_openai("Action Plan")

In [None]:
from tqdm import tqdm

# Generated sentences, one dict per API call: {index: XX, text: XX, label: XX}
sentences = []

with tqdm(total=len(df)) as progress:
    for row_index, record in df.iterrows():
        label = record["label"]
        # Each source row triggers as many generations as its class needs.
        for _ in range(augmentation_count_needed[label]):
            generated = return_sentence_from_openai(MAPPING[label])
            sentences.append({"index": row_index, "text": generated, "label": label})
        # One progress tick per source row, not per generated sentence.
        progress.update(1)

        


In [None]:
# Build a DataFrame of the augmented sentences with columns
# id, text, label, original_id, test_fold, using the metadata of the source
# row each sentence was generated for. The source `text-id` column is not
# carried over.
result_columns = ["id", "text", "label", "original_id", "test_fold"]

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating onto a DataFrame inside the loop copies it on every iteration.
augmented_rows = []

# `sentences` was filled by walking df in the same order with the same
# per-label counts, so a running position pairs each sentence with its row.
index_of_sentence = 0
with tqdm(total=len(df)) as pbar:
    for index, row in df.iterrows():
        for i in range(augmentation_count_needed[row["label"]]):
            sentence = sentences[index_of_sentence]["text"]
            augmented_rows.append({
                "id": f"augmented-{row['id']}-{i}",
                "text": sentence,
                "label": row["label"],
                "original_id": row["id"],
                "test_fold": row["test_fold"],
            })
            index_of_sentence += 1
        # Advance the bar once per source row so it actually reflects progress.
        pbar.update(1)

# Passing `columns` keeps the column order fixed and still yields the expected
# (empty) frame when no augmentations were generated.
result_df = pd.DataFrame(augmented_rows, columns=result_columns)

result_df.head()

In [None]:
# Save the augmented sentences for this fold as CSV; index=False leaves the
# DataFrame index out of the file.
result_df.to_csv(f"fine-tune-fold-{fold}-outputs.csv", index=False)