In [None]:
# Fold number handled by this run; it is reused below to filter `all_entries`
# on its `test_fold` column and to name the generated .jsonl / .csv files.
fold = 0

import pandas as pd

# Rows that prompts are generated for: the prompt-building loop further down
# iterates over every row of this frame and reads its "text" and "label".
df = pd.read_csv(f"ANON")

df.head() # expected columns: text, label, id, test_fold

In [None]:
# Pool of candidate example sentences: every entry whose `test_fold` differs
# from the fold handled by this run.
all_entries = pd.read_csv("ANON").loc[lambda entries: entries["test_fold"] != fold]
all_entries.head()

In [None]:
# sentence-transformers supplies the embedding model and the cosine-similarity
# helper (`util`) used by rank_corpus_by_similarity.
from sentence_transformers import SentenceTransformer, util
import torch
# Shared embedding model, loaded once at module level and read as a global by
# rank_corpus_by_similarity.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def rank_corpus_by_similarity(corpus, target_sentence, k=5):
    """Return the corpus sentences most and least similar to ``target_sentence``.

    The corpus and the target are embedded with the module-level ``model`` and
    compared by cosine similarity. The corpus is re-encoded on every call.

    Args:
        corpus: Indexable sequence of sentences (must support ``len``).
        target_sentence: Sentence the corpus is ranked against.
        k: Number of sentences to return on each side (default 5). It is
            clamped to ``len(corpus)``, so a small corpus yields shorter lists
            instead of making ``torch.topk`` raise.

    Returns:
        A ``(most_similar, least_similar)`` tuple of lists. ``most_similar`` is
        ordered from highest to lowest similarity, ``least_similar`` from
        lowest to highest. Both are empty when the corpus is empty or ``k`` is 0.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # torch.topk raises when k exceeds the number of scores, so never ask for
    # more sentences than the corpus contains.
    k = min(k, len(corpus))
    if k == 0:
        return [], []

    embeddings = model.encode(corpus, convert_to_tensor=True)
    target_embedding = model.encode(target_sentence, convert_to_tensor=True)
    # pytorch_cos_sim returns a (1, len(corpus)) matrix here; take its only row.
    cos_scores = util.pytorch_cos_sim(target_embedding, embeddings)[0].cpu()

    top_indices = torch.topk(cos_scores, k=k, largest=True).indices.tolist()
    bottom_indices = torch.topk(cos_scores, k=k, largest=False).indices.tolist()

    top_sentences = [corpus[i] for i in top_indices]
    bottom_sentences = [corpus[i] for i in bottom_indices]

    return top_sentences, bottom_sentences

In [None]:
# Integer class label -> name of the corresponding Gibbs reflective cycle
# component; the list position is the label.
MAPPING = dict(enumerate([
    "Description",
    "Feelings",
    "Evaluation",
    "Analysis",
    "Conclusion",
    "Action Plan",
]))

# Instruction message sent with every request. The pieces below are adjacent
# string literals that concatenate into one single-line prompt.
# NOTE(review): "refers describing to" looks like a typo for "refers to
# describing"; it is kept verbatim so the text sent to the model is unchanged.
developer_prompt = (
    "You are an expert in the Gibbs reflective cycle. "
    "The components of the Gibbs reflective cycle are: Description, Feelings, Evaluation, Analysis, Conclusion, and Action Plan. "
    "Description refers describing to the event or experience you are reflecting on. "
    "Feelings refers to your emotions during the event or experience. "
    "Evaluation refers to your thoughts about the event or experience, providing positive and negative aspects on what happened. "
    "Analysis refers to your understanding of the event or experience, providing reasons behind points mentioned in the Evaluation aspect. "
    "Conclusion refers to what you learned from the event or experience. "
    "Action Plan refers to what you would do differently in the future (next time). "
    "Always respond in German."
)
# Filled by the loop below with dicts of the form {"id": <int>, "prompt": <str>}.
user_prompts = []

# Seed Python's RNG so the shuffle seeds drawn inside the loop are reproducible.
import random
random.seed(42)

# Running prompt id (0-based). A dedicated name is used so the builtin `id`
# is not shadowed for the rest of the notebook session.
prompt_id = 0

# Candidate sentences per label, filled on first use so the same filter over
# `all_entries` is not recomputed for every row of df.
corpus_by_label = {}

for index, row in df.iterrows():
    label = row["label"]

    if label not in corpus_by_label:
        corpus_by_label[label] = all_entries[all_entries["label"] == label]["text"].tolist()

    top_5_similar, bottom_5_similar = rank_corpus_by_similarity(corpus_by_label[label], row["text"])

    # Five prompts are built per row, each consuming one entry from either
    # list, so fail with a clear message when the label's corpus is too small.
    if len(top_5_similar) < 5 or len(bottom_5_similar) < 5:
        raise ValueError(
            f"Need at least 5 candidate sentences for label {label!r}, "
            f"got {len(corpus_by_label[label])}"
        )

    # Exactly five prompts per row: the cell that rebuilds the labels for the
    # batch results repeats each row's label five times and relies on this.
    for j in range(5):

        prompt = "These three sentences are reflective sentences from the " + MAPPING[label] + " component of the Gibbs reflective cycle:\n"
        prompt += "- " + row["text"] + "\n"

        # Pair the j-th entry of the "most similar" list with the j-th entry
        # counted from the end of the "least similar" list.
        example_pair = pd.DataFrame({
            "text": [top_5_similar[j], bottom_5_similar[len(bottom_5_similar) - j - 1]]
        })
        # Shuffle the pair so the similar example is not always listed first;
        # exactly one randint draw per prompt keeps the sequence reproducible.
        shuffled_pair = example_pair.sample(frac=1, random_state=random.randint(0, 10000)).reset_index(drop=True)
        for sentence in shuffled_pair["text"]:
            prompt += "- " + sentence + "\n"

        prompt += "\nPlease write a sentence that reflects on similar events or experiences, focusing on the " + MAPPING[label] + " component of the Gibbs reflective cycle. You should use different names, words, and terminologies in your output, but the overall meaning and content should be similar and refer to the same Gibbs component. Only output one sentence and nothing else. Respond in German."
        user_prompts.append({"id": prompt_id, "prompt": prompt})
        prompt_id += 1


In [None]:
# Number of generated prompts (the loop above builds five per row of df).
len(user_prompts)

In [None]:
# Print the first prompt so its layout can be inspected; print() is used so
# the embedded newlines are rendered rather than shown as "\n".
print(user_prompts[0]["prompt"])

In [None]:
import openai
from openai import OpenAI

# The API key is read from a text file three directories above the working
# directory; surrounding whitespace (e.g. a trailing newline) is stripped.
with open("../../../api_key.txt", "r") as key_file:
    api_key = key_file.read().strip()

# Explicit client object used by the cells below, plus the module-level key.
openai_client = OpenAI(api_key=api_key)
openai.api_key = api_key

def return_message_from_openai(messages, temperature = 1, model="gpt-4o-2024-08-06"):
    """Send one chat-completion request and return the reply text.

    Uses the module-level ``openai_client``.

    Args:
        messages: List of chat messages (dicts with "role" and "content").
        temperature: Sampling temperature forwarded to the API (default 1).
        model: Model name forwarded to the API. Defaults to the model the
            batch requests in this notebook use.

    Returns:
        The ``content`` of the first choice's message.
    """
    # No `global` statement is needed: the client is only read, never rebound.
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    return response.choices[0].message.content

In [None]:
import json

# One Batch API request per prompt. `custom_id` carries both the 1-based
# position of the request and the prompt's own id, so each result line can be
# traced back to its prompt.
jsonl_data = [
    {
        "custom_id": f"request-{position}-id-{entry['id']}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-2024-08-06",
            "messages": [
                {"role": "developer", "content": developer_prompt},
                {"role": "user", "content": entry["prompt"]},
            ],
            "temperature": 1.0,
        },
    }
    for position, entry in enumerate(user_prompts, start=1)
]

# Write the requests as JSON Lines: one serialized request object per line.
with open(f"few-shot-of-wrong-guesses-dissimilarity-of-fine-tune-fold-{fold}.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(request) + "\n" for request in jsonl_data)


# Batch API

In [None]:
# Upload the JSONL request file for batch processing. The file is opened in a
# `with` block so the handle is closed once the upload call returns.
with open(f"few-shot-of-wrong-guesses-dissimilarity-of-fine-tune-fold-{fold}.jsonl", "rb") as batch_jsonl:
    batch_input_file = openai_client.files.create(
        file=batch_jsonl,
        purpose="batch"
    )

print(batch_input_file)

In [None]:
# Submit the uploaded file as a chat-completions batch with a 24h completion
# window. The call is the cell's last expression, so the created batch object
# is shown as the cell output.
batch_input_file_id = batch_input_file.id
batch_metadata = {
    "description": f"Few-shot of Wrong Guesses Dissimilarity of Fine-tune (fold {fold})"
}
openai_client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata=batch_metadata,
)

In [None]:
# Look the batch up by its id and display its current status; re-run this
# cell to poll until the batch has finished.
batch = openai_client.batches.retrieve("ANON")
batch.status

In [None]:
# Download the batch results. `output_file_id` is unset until the batch has
# produced an output file, so fail with an explicit message in that case
# instead of passing None to the files API.
if batch.output_file_id is None:
    raise RuntimeError(
        f"Batch {batch.id} has no output file (status: {batch.status}); "
        "re-run the retrieve cell once the batch has completed."
    )
file_response = openai_client.files.content(batch.output_file_id)

In [None]:
# Label of every generated prompt, in prompt order: each row of df produced
# five prompts, so its label is repeated five times.
all_labels_to_use = [
    source_row["label"]
    for _row_index, source_row in df.iterrows()
    for _repeat in range(5)
]

In [None]:
# Build the result table (id, text, label, original_id) from the batch output.
#
# The Batch API does not guarantee that output lines come back in input order
# (and failed requests may be missing), so the label is looked up through the
# request number embedded in custom_id ("request-<n>-id-<prompt id>") rather
# than through a running counter.
#
# NOTE(review): "original_id" is the id embedded in custom_id, i.e. the "id"
# stored with each prompt, not a value read from df's "id" column. Confirm
# this is the intended meaning.
records = []
for line in file_response.text.split("\n"):
    if not line:
        continue
    data = json.loads(line)
    custom_id_parts = data["custom_id"].split("-")
    # <n> is 1-based; all_labels_to_use is indexed from 0 in prompt order.
    request_position = int(custom_id_parts[1]) - 1
    records.append({
        "id": data["custom_id"],
        "text": data["response"]["body"]["choices"][0]["message"]["content"],
        "label": all_labels_to_use[request_position],
        "original_id": int(custom_id_parts[-1]),
    })

# Build the frame once from all records instead of concatenating row by row.
result_df = pd.DataFrame(records, columns=["id", "text", "label", "original_id"])

result_df.head()

In [None]:
# Save the generated sentences for this fold; index=False keeps the frame's
# row index out of the CSV.
result_df.to_csv(f"few-shot-of-wrong-guesses-dissimilarity-of-fine-tune-fold-{fold}.csv", index=False)