In [31]:
import os
import time
import json
import pandas as pd
from openai import OpenAI
from transformers import AutoTokenizer
from get_params import DATA_FOLDER, SHARED_DATA_FOLDER, TEST_DATASET_BASE_FPATH, TEST_DATASET_FT_FPATH

def _read_secret(filename):
    """Return the whitespace-stripped contents of `filename` located under DATA_FOLDER."""
    with open(os.path.join(DATA_FOLDER, filename)) as f:
        return f.read().strip()


## OpenAI API key (read from a local file rather than written into the notebook)
OPENAI_KEY = _read_secret("openai_key.txt")


## Hugging Face access token (this is the token itself, not a tokenizer)
HF_TOKEN = _read_secret("hftoken.txt")


MODEL_NAME = "gpt-4o"
# 0.0 asks for the least-random sampling, so the judge's verdicts are as
# repeatable as possible (the goal here is consistency, not variety).
MODEL_TEMPERATURE = 0.0


In [21]:
# Answers from the base model; the generic "ai_answer" column gets a model-specific name.
test_base_model_df = (
    pd.read_json(TEST_DATASET_BASE_FPATH, lines=True)
    .rename(columns={"ai_answer": "mistralinstruct_answer"})
)

# Answers from the fine-tuned model, renamed the same way.
test_tuned_model_df = (
    pd.read_json(TEST_DATASET_FT_FPATH, lines=True)
    .rename(columns={"ai_answer": "pretendparentai_answer"})
)

# One frame holding both answers per prompt: only ids present in both files are kept.
test_all_df = pd.merge(
    test_tuned_model_df,
    test_base_model_df[["id", "mistralinstruct_answer"]],
    on="id",
    how="inner",
)


In [22]:
# Preview the first two rows of the merged frame as a quick sanity check.
test_all_df.head(2)

Unnamed: 0,id,text,title,body,prompt_ntokens_mistral,topic,user_prompt,pretendparentai_answer,mistralinstruct_answer
0,1,Dealing with Emotional Outbursts - Our 10-year...,Dealing with Emotional Outbursts,Our 10-year-old has been having a tough time a...,158,Co-parenting after separation,Dealing with Emotional Outbursts - Our 10-year...,"When they start getting emotional, ask what is...",I understand your situation can be challenging...
1,2,Teenager Afraid of the Dark: Is This Normal? -...,Teenager Afraid of the Dark: Is This Normal?,My 14-year-old daughter recently admitted she’...,159,Fear of the dark,Teenager Afraid of the Dark: Is This Normal? -...,The only thing you should worry about is how l...,"It's quite normal for children, even teenagers..."


In [23]:
# Instructions for the judge model. This string is sent as-is (it is never passed
# through str.format), so the JSON example uses single braces; doubled braces would
# reach the model literally. The example is also kept valid JSON (no trailing comma).
# NOTE(review): System A is always Mistral and the judge is told which system is
# which; consider randomizing the A/B order and hiding the names to limit position
# and identity bias.
SYSTEM_PROMPT = """You are an impartial judge.
Your task is to compare two model responses (System A = Mistral Instruct v0.3, System B = PretendParentAI).
PretendParentAI simulates a parent on Reddit and gives advice in that style.

Criteria to evaluate:
1. Helpfulness – Does the response directly address the user’s question and provide useful guidance?
2. Empathy & Tone – Is the response empathetic, warm and supportive?
3. Creativity - Does the response offer creative or unique solutions that cannot be easily found elsewhere?
4. Clarity – Is the response easy to follow and well-structured?
5. Relatability – Is the style conversational and human-like, making the advice easier to relate to, rather than mechanical?
6. Adoptability – How likely is the user to adopt and act on the advice in the response?

Output:
- For each of the 6 criteria above, pick a which system 'won' (is better than the other) or if there's a tie: System A, System B, or Tie.
- Assign an overall winner: System A, System B, or Tie. Try your best to always pick a winner.
- Justify your reasoning for the above in a few words.

Your evaluation should be formatted as:
{
  "winner_helpfulness": "<string>",
  "winner_empathy_tone": "<string>",
  "winner_creativity": "<string>",
  "winner_clarity": "<string>",
  "winner_relatability": "<string>",
  "winner_adoptability": "<string>",
  "winner_overall": "<string>",
  "justification": "<string>"
}
"""

# Per-example message template; filled with str.format using the three named fields.
USER_PROMPT = """User's Prompt: 
{user_prompt}

System A (Mistral Instruct v0.3) Response:
{mistral_response}

System B (PretendParentAI) Response:
{pretendparentai_response}
"""


In [30]:
client = OpenAI(api_key=OPENAI_KEY)

# Verdicts the judge may return for every "winner_*" field.
VERDICT_CHOICES = ["System A", "System B", "Tie"]

# "winner_*" field name -> description placed in the JSON schema.
# Insertion order matches the original hand-written schema.
WINNER_FIELD_DESCRIPTIONS = {
    "winner_helpfulness": "Which system's response is more helpful?",
    "winner_empathy_tone": "Which system's response is more empathetic, warm, and supportive?",
    "winner_creativity": "Which system's response is more creative or unique i.e. not easily found elsewhere?",
    "winner_clarity": "Which system's response is easier to follow and better structured?",
    "winner_relatability": "Which system's response sounds more conversational, human-like, and relatable?",
    "winner_adoptability": "Which response is the user most likely to adopt and act on?",
    "winner_overall": "Which system's response was better overall?",
}

# The response schema is identical for every request, so it is built once here
# instead of being rebuilt inside the loop.
eval_properties = {
    field: {"type": "string", "enum": VERDICT_CHOICES, "description": description}
    for field, description in WINNER_FIELD_DESCRIPTIONS.items()
}
eval_properties["justification"] = {
    "type": "string",
    "description": "Justify your reasoning for the above decisions in a few words.",
}
EVAL_TEXT_FORMAT = {
    "format": {
        "type": "json_schema",
        "name": "eval",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": eval_properties,
            # Every property is required, in the same order as declared above.
            "required": list(eval_properties),
            "additionalProperties": False,
        },
    }
}

# Ask the judge model for one verdict per test row. Any API or JSON error still
# propagates and stops the loop; verdicts gathered so far remain in `evaluations`.
evaluations = []
answer_rows = zip(
    test_all_df["id"],
    test_all_df["user_prompt"],
    test_all_df["mistralinstruct_answer"],
    test_all_df["pretendparentai_answer"],
)
for row_id, question, mistral_answer, tuned_answer in answer_rows:
    user_prompt_formatted = USER_PROMPT.format(
        user_prompt=question,
        mistral_response=mistral_answer,
        pretendparentai_response=tuned_answer,
    )
    resp = client.responses.create(
        model=MODEL_NAME,
        temperature=MODEL_TEMPERATURE,
        instructions=SYSTEM_PROMPT,
        input=user_prompt_formatted,
        text=EVAL_TEXT_FORMAT,
    )
    parsed = json.loads(resp.output_text)
    # Tag the verdict with the row id so it can be joined back to the test set.
    parsed["id"] = row_id
    print (f"{parsed['id']} {parsed['winner_overall']}: {parsed['justification']}")
    evaluations.append(parsed)


1 System B: System B provides a more empathetic and relatable response, focusing on listening and understanding, which is crucial for emotional support. It offers clear, actionable advice and explains the situation in a way a child might understand, making it more likely to be adopted by the user.
2 System A: System A provides a more comprehensive and structured response, addressing the issue with empathy and offering practical solutions. It explains the fear and suggests ways to help the daughter feel safer. System B is more conversational and relatable but lacks depth and structure. Overall, System A's response is more likely to be adopted due to its clarity and helpfulness.
3 System A: System A provides a structured, gentle approach with steps that are easy to follow, making it more helpful and adoptable. It uses empathetic language, offering comfort and understanding. System B is more straightforward and relatable but lacks the warmth and structure needed for such a sensitive topic

In [35]:
# One row per judged example; columns come from the keys of each verdict dict.
eval_df = pd.DataFrame(data=evaluations)


In [37]:
# Save the judge verdicts to the shared data folder.
# to_csv is called with its defaults, so the row index is written as the first column.
judge_output_fpath = os.path.join(SHARED_DATA_FOLDER, "pretendparentai_gpt_as_judge_output.csv")
eval_df.to_csv(judge_output_fpath)


In [45]:
# Share of verdicts per criterion: one column for each field except the free-text
# justification and the row id. Verdicts that never occur for a criterion are
# filled in as 0.
share_by_criterion = {}
for column in eval_df.columns:
    if column in ("justification", "id"):
        continue
    share_by_criterion[column] = eval_df[column].value_counts(normalize=True)

norm_counts = pd.concat(share_by_criterion, axis=1).fillna(0)



In [46]:
# Display the table of per-criterion verdict proportions.
norm_counts

Unnamed: 0,winner_helpfulness,winner_empathy_tone,winner_creativity,winner_clarity,winner_relatability,winner_adoptability,winner_overall
System A,0.9,0.3,0.47,0.9,0.02,0.72,0.72
System B,0.1,0.7,0.48,0.1,0.98,0.28,0.28
Tie,0.0,0.0,0.05,0.0,0.0,0.0,0.0


In [48]:
# Joint distribution of the adoptability and empathy/tone verdicts, as proportions.
eval_df.value_counts(
    subset=["winner_adoptability", "winner_empathy_tone"],
    normalize=True,
)


winner_adoptability  winner_empathy_tone
System A             System B               0.43
                     System A               0.29
System B             System B               0.27
                     System A               0.01
Name: proportion, dtype: float64