In [None]:
import pandas as pd
from openai import OpenAI
from sklearn.metrics import accuracy_score, classification_report
from collections import defaultdict
import json
from datetime import datetime
import os

# Model under evaluation. Exactly one assignment should be active; the
# commented-out lines are the other checkpoints that have been compared
# (trailing notes are the author's shorthand for each run).
# model_id = "gpt-4o-mini-2024-07-18" # base
# model_id = "ft:gpt-4o-mini-2024-07-18:personal:ft-3000:BGis7oxt" # ep3 good
# model_id = "ft:gpt-4o-mini-2024-07-18:personal:emo-3000:BHCPBuZU" # ep3 bad
# model_id = "ft:gpt-4o-mini-2024-07-18:personal:ft-3000:BI5ZgDxz" # ep5
model_id = "ft:gpt-4o-mini-2024-07-18:personal:ft-3000-best:BHy0biIn" # ep5

# Every run writes into its own directory, named after the model and the
# wall-clock start time (local time, second resolution).
# NOTE(review): model_id contains ':' characters, which end up verbatim in the
# directory name -- confirm that is acceptable on the target filesystem.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = f"prompt_results_{model_id}_{timestamp}"
os.makedirs(results_dir, exist_ok=True)

# Evaluation set. The code below reads the 'text' column as the model input
# and the 'emotion' column as the gold label.
test_df = pd.read_csv("data/test_with_emotion.csv")
# Closed label set; predictions or gold labels outside it are excluded from
# the accuracy / F1 computation further down.
EMOTIONS = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# System prompts under comparison, keyed by a short variant name. Each value
# is a complete chat message dict ({"role": "system", "content": ...}) that is
# placed as-is at the head of the `messages` list sent to the model. The
# variant name is also used in the per-prompt results file name.
#
# The triple-quoted contents are sent verbatim, including the leading newline
# and the source indentation, so editing whitespace here changes the prompt.
prompts = {
    # Short instruction-only prompt, no examples.
    "zero-short": {
        "role": "system",
        "content": """
            You are an empathetic mental health support chatbot. 
            Analyze the user's message to understand their core emotional state. 
            Identify and state the single primary emotion. 
            Only output one of these emotions as a response: sadness, joy, love, anger, fear, or surprise. 
            Do not output any other text."""
    },
    # Longer structured prompt (Steps / Output Format / Notes), still without
    # examples. The variants below all extend this text.
    "zero-long": {
        "role": "system",
        "content": """
            You are an empathetic mental health support chatbot. 
            Identify and state the user's core emotional state based on their message. 
            Only output one of the following primary emotions: sadness, joy, love, anger, fear, or surprise.

            # Steps
            1. Analyze the user's message to understand the context and underlying feelings.
            2. Determine the primary emotion expressed based on the text provided.
            3. Map this understanding to one of the predetermined emotion categories: sadness, joy, love, anger, fear, or surprise.

            # Output Format
            - Output only the single primary emotion as one of the following words: sadness, joy, love, anger, fear, or surprise.
            - Do not include any additional text or punctuation.

            # Notes
            - Focus on identifying the dominant emotion in the message, even if multiple emotions seem present.
            - Consider the context and emotional cues within the text to accurately determine the primary emotion."""
    },
    # Structured prompt plus a single worked example (joy).
    "one-shot": {
        "role": "system",
        "content": """
            You are an empathetic mental health support chatbot. 
            Identify and state the user's core emotional state based on their message. 
            Only output one of the following primary emotions: sadness, joy, love, anger, fear, or surprise.

            # Steps
            1. Analyze the user's message to understand the context and underlying feelings.
            2. Determine the primary emotion expressed based on the text provided.
            3. Map this understanding to one of the predetermined emotion categories: sadness, joy, love, anger, fear, or surprise.

            # Output Format
            - Output only the single primary emotion as one of the following words: sadness, joy, love, anger, fear, or surprise.
            - Do not include any additional text or punctuation.

            # Notes
            - Focus on identifying the dominant emotion in the message, even if multiple emotions seem present.
            - Consider the context and emotional cues within the text to accurately determine the primary emotion.
            
            # Examples
            Example 1:
                - Input: "i feel so glad that im able to have the time to spend some time with my family now"
                - Output: joy"""
    },
    # Structured prompt plus three worked examples (joy, sadness, anger).
    "few-shot": {
        "role": "system",
        "content": """
            You are an empathetic mental health support chatbot. 
            Identify and state the user's core emotional state based on their message. 
            Only output one of the following primary emotions: sadness, joy, love, anger, fear, or surprise.

            # Steps
            1. Analyze the user's message to understand the context and underlying feelings.
            2. Determine the primary emotion expressed based on the text provided.
            3. Map this understanding to one of the predetermined emotion categories: sadness, joy, love, anger, fear, or surprise.

            # Output Format
            - Output only the single primary emotion as one of the following words: sadness, joy, love, anger, fear, or surprise.
            - Do not include any additional text or punctuation.

            # Notes
            - Focus on identifying the dominant emotion in the message, even if multiple emotions seem present.
            - Consider the context and emotional cues within the text to accurately determine the primary emotion.
            
            # Examples
            Example 1:
                - Input: "i feel so glad that im able to have the time to spend some time with my family now"
                - Output: joy
            Example 2:
                - Input: "im thinking well i could be a bit smaller but for health reasons and i should see a doctor more regularly because im feeling crappy"
                - Output: sadness
            Example 3:
                - Input: "i feel appalled right now"
                - Output: anger"""
    },
    # Same as "few-shot" with one extra sentence in the preamble
    # ("This is very important to my career.").
    "few-shot-emotionprompt": {
        "role": "system",
        "content": """
            You are an empathetic mental health support chatbot. 
            Identify and state the user's core emotional state based on their message. 
            Only output one of the following primary emotions: sadness, joy, love, anger, fear, or surprise.
            This is very important to my career.

            # Steps
            1. Analyze the user's message to understand the context and underlying feelings.
            2. Determine the primary emotion expressed based on the text provided.
            3. Map this understanding to one of the predetermined emotion categories: sadness, joy, love, anger, fear, or surprise.

            # Output Format
            - Output only the single primary emotion as one of the following words: sadness, joy, love, anger, fear, or surprise.
            - Do not include any additional text or punctuation.

            # Notes
            - Focus on identifying the dominant emotion in the message, even if multiple emotions seem present.
            - Consider the context and emotional cues within the text to accurately determine the primary emotion.
            
            # Examples
            Example 1:
                - Input: "i feel so glad that im able to have the time to spend some time with my family now"
                - Output: joy
            Example 2:
                - Input: "im thinking well i could be a bit smaller but for health reasons and i should see a doctor more regularly because im feeling crappy"
                - Output: sadness
            Example 3:
                - Input: "i feel appalled right now"
                - Output: anger"""
    }
}

def evaluate_with_prompt(test_df, model_id, prompt_name, prompt, client=None):
    """Classify every row of *test_df* with one system prompt.

    Each row's ``'text'`` is sent as the user message after *prompt*; the
    model's reply is stripped and lower-cased and compared with the row's
    lower-cased ``'emotion'`` label.

    Args:
        test_df: DataFrame with a ``'text'`` and an ``'emotion'`` column.
        model_id: Model name passed to the chat completions endpoint.
        prompt_name: Label used only in progress output.
        prompt: Chat message dict placed first in ``messages``.
        client: Optional client exposing ``chat.completions.create``. A new
            ``OpenAI()`` client is created when omitted.

    Returns:
        ``(predictions, emotion_specific_results)``. ``predictions`` holds
        exactly one entry per row, in row order; the entry is ``None`` when
        the row could not be processed. ``emotion_specific_results`` maps
        each gold emotion to ``{'correct': int, 'total': int}`` and counts
        only rows that were processed successfully.
    """
    tuned_model = client if client is not None else OpenAI()
    predictions = []
    emotion_specific_results = defaultdict(lambda: {'correct': 0, 'total': 0})
    total_rows = len(test_df)

    print(f"\nEvaluating prompt: {prompt_name}")

    # Count rows positionally: the DataFrame index label need not be a
    # zero-based integer, so it is used only to identify a row in messages.
    for position, (idx, row) in enumerate(test_df.iterrows(), start=1):
        try:
            completion = tuned_model.chat.completions.create(
                model=model_id,
                messages=[
                    prompt,
                    {"role": "user", "content": row['text']}
                ]
            )
            prediction = completion.choices[0].message.content.strip().lower()
            true_emotion = row['emotion'].lower()
        except Exception as e:
            # Deliberate best-effort: one failed request or malformed row must
            # not abort a long evaluation run. The row is recorded as None.
            print(f"Error processing example {idx}: {e}")
            prediction = None
        else:
            tally = emotion_specific_results[true_emotion]
            tally['total'] += 1
            if prediction == true_emotion:
                tally['correct'] += 1

        # Appended exactly once per row so predictions stay aligned with rows.
        predictions.append(prediction)

        if position % 100 == 0:
            print(f"Processed {position}/{total_rows} examples")

    return predictions, emotion_specific_results

def save_results(results_dict, filename):
    """Write *results_dict* to *filename* as JSON indented by two spaces.

    An existing file at *filename* is overwritten.
    """
    with open(filename, 'w') as sink:
        json.dump(results_dict, fp=sink, indent=2)



# Per-prompt headline metrics, filled in by the loop and printed at the end.
comparative_results = {}

# Gold labels are the same for every prompt, so prepare them once. They are
# lower-cased because evaluate_with_prompt lower-cases both sides before
# comparing; non-string labels are left untouched and get filtered out below.
true_labels = [
    label.lower() if isinstance(label, str) else label
    for label in test_df['emotion'].tolist()
]

# Evaluate each prompt
for prompt_name, prompt in prompts.items():
    predictions, emotion_results = evaluate_with_prompt(test_df, model_id, prompt_name, prompt)

    # Pair every prediction with the label of the row it was made for *before*
    # dropping failed calls (None). Filtering the predictions first and then
    # truncating the labels would shift every later prediction onto the wrong
    # label as soon as a single call failed.
    answered = [
        (row_pos, pred, true)
        for row_pos, (pred, true) in enumerate(zip(predictions, true_labels))
        if pred is not None
    ]

    # Only pairs where both sides belong to the closed label set are scored.
    scored = [
        (pred, true)
        for _, pred, true in answered
        if pred in EMOTIONS and true in EMOTIONS
    ]
    filtered_predictions = [pred for pred, _ in scored]
    filtered_true_labels = [true for _, true in scored]

    # Calculate metrics
    accuracy = accuracy_score(filtered_true_labels, filtered_predictions)
    class_report = classification_report(
        filtered_true_labels,
        filtered_predictions,
        labels=EMOTIONS,
        zero_division=0,
        output_dict=True
    )

    results = {
        "prompt": prompt["content"],
        "predictions": predictions,
        "emotion_results": emotion_results,
        "accuracy": accuracy,
        "classification_report": class_report,
        # Counted over the raw predictions; the filtered list has no None left.
        "none_predictions": sum(1 for pred in predictions if pred is None),
        # "index" is the row position within the test set, so an entry can be
        # looked up directly in the input data.
        "invalid_predictions": [
            {"index": row_pos, "predicted": pred, "true": true}
            for row_pos, pred, true in answered
            if pred not in EMOTIONS or true not in EMOTIONS
        ]
    }

    save_results(results, f"{results_dir}/{prompt_name}_results.json")

    comparative_results[prompt_name] = {
        "accuracy": accuracy,
        "macro_f1": class_report["macro avg"]["f1-score"],
        "weighted_f1": class_report["weighted avg"]["f1-score"]
    }

# Save comparative results
save_results(comparative_results, f"{results_dir}/comparative_results.json")

# Size the first column to the longest prompt name so the table stays aligned
# (some names are longer than a fixed 15-character column).
name_width = max([len("Prompt"), *(len(name) for name in comparative_results)])

print("\nComparative Summary:")
print("-" * 80)
print(f"{'Prompt':<{name_width}} {'Accuracy':>10} {'Macro F1':>10} {'Weighted F1':>12}")
print("-" * 80)

for prompt_name, metrics in comparative_results.items():
    print(f"{prompt_name:<{name_width}} {metrics['accuracy']*100:>9.2f}% {metrics['macro_f1']*100:>9.2f}% {metrics['weighted_f1']*100:>11.2f}%")


Evaluating prompt: zero-short
Processed 100/2000 examples
Processed 200/2000 examples
Processed 300/2000 examples
Processed 400/2000 examples
Processed 500/2000 examples
Processed 600/2000 examples
Processed 700/2000 examples
Processed 800/2000 examples
Processed 900/2000 examples
Processed 1000/2000 examples
Processed 1100/2000 examples
Processed 1200/2000 examples
Processed 1300/2000 examples
Processed 1400/2000 examples
Processed 1500/2000 examples
Processed 1600/2000 examples
Processed 1700/2000 examples
Processed 1800/2000 examples
Processed 1900/2000 examples
Processed 2000/2000 examples

Evaluating prompt: zero-long
Processed 100/2000 examples
Processed 200/2000 examples
Processed 300/2000 examples
Processed 400/2000 examples
Processed 500/2000 examples
Processed 600/2000 examples
Processed 700/2000 examples
Processed 800/2000 examples
Processed 900/2000 examples
Processed 1000/2000 examples
Processed 1100/2000 examples
Processed 1200/2000 examples
Processed 1300/2000 examples
