In [None]:
import pandas as pd
import time
from openai import OpenAI
import re
from dotenv import load_dotenv
import os

In [None]:
# Read the saved prediction tables of both models (one CSV file per model).
gpt4o_results, gpt3_5_results = (
    pd.read_csv(csv_name)
    for csv_name in ("gpt_results_4o.csv", "gpt_results_3_5_final.csv")
)

def calculate_accuracy(df):
    """Return the share of correct GPT predictions as a percentage.

    A row counts as correct when its ``gpt_prediction`` equals its
    ``correct_answer``. The per-row outcome is also written onto ``df``
    as a boolean ``correct`` column (the frame is modified in place).
    """
    prediction_matches = df["gpt_prediction"] == df["correct_answer"]
    df["correct"] = prediction_matches
    return df["correct"].mean() * 100

# Accuracy (in percent) of each model's predictions.
gpt4o_accuracy, gpt3_5_accuracy = map(
    calculate_accuracy, (gpt4o_results, gpt3_5_results)
)

# Report both figures, one per line, rounded to two decimals.
print(
    f"GPT-4o Accuracy: {gpt4o_accuracy:.2f}%",
    f"GPT-3.5 Accuracy: {gpt3_5_accuracy:.2f}%",
    sep="\n",
)


In [None]:
# Load environment variables via python-dotenv, then build the OpenAI client
# from the OPENAI_API_KEY entry (None is passed through if it is not set).
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Prompt template sent to the grader model. It is filled with str.format(), so
# it must keep exactly these four placeholders: {original_post},
# {counterargument_1}, {counterargument_2} and {explanation}.
# The "Metric: X" reply format requested below is what evaluate_explanation
# parses with a regex, so the metric names here and there must stay in sync.
evaluation_prompt = """
You are an AI assistant evaluating the reasoning quality of another AI model's response.
Given an original post, two counterarguments, and the AI’s explanation, rate the explanation on a scale of 1 to 10 for:
1. **Clarity** (Is the reasoning easy to understand?)
2. **Logical Depth** (Does it consider multiple perspectives and explain deeply?)
3. **Consistency** (Is the reasoning logically structured and non-contradictory?)
4. **Bias Avoidance** (Does the explanation remain neutral and not favor a specific ideology?)
5. **Persuasiveness** (Would this explanation convince a human?)

Provide your scores in **this exact format**:
Clarity: X
Logical Depth: X
Consistency: X
Bias Avoidance: X
Persuasiveness: X

Now evaluate the following:

**Original Post:**
{original_post}

**Counterargument 1:**
{counterargument_1}

**Counterargument 2:**
{counterargument_2}

**AI’s Explanation:**
{explanation}
"""

In [None]:
# Sanity check that the API key is configured. Only a boolean is displayed:
# echoing the key itself would store the secret in the saved notebook output.
api_key_is_set = bool(os.getenv("OPENAI_API_KEY"))
api_key_is_set

In [None]:
# Metric names requested by the evaluation prompt; also the keys of every score dict.
EVALUATION_METRICS = (
    "Clarity",
    "Logical Depth",
    "Consistency",
    "Bias Avoidance",
    "Persuasiveness",
)


def _parse_scores(text):
    """Extract one integer score per metric from the grader's reply (None if absent)."""
    scores = {}
    for metric in EVALUATION_METRICS:
        # Tolerate markdown emphasis around the label or the colon
        # ("**Clarity**: 8", "**Clarity:** 8") and case differences; a strict
        # "Clarity: 8" pattern silently turned such replies into None.
        match = re.search(
            rf"{re.escape(metric)}\s*\**\s*:\s*\**\s*(\d+)",
            text,
            flags=re.IGNORECASE,
        )
        scores[metric] = int(match.group(1)) if match else None
    return scores


# function that evaluates the explanations
def evaluate_explanation(row, model="gpt-4o", max_retries=3, retry_delay=2):
    """Ask a GPT model to grade one explanation and return its scores.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``original_post``,
            ``counterargument_1``, ``counterargument_2`` and ``explanation``.
        model: Name of the chat model used as grader.
        max_retries: Maximum number of API attempts before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        dict mapping each name in ``EVALUATION_METRICS`` to an ``int`` score,
        or to ``None`` when the score could not be parsed or every API
        attempt failed.
    """
    prompt = evaluation_prompt.format(
        original_post=row["original_post"],
        counterargument_1=row["counterargument_1"],
        counterargument_2=row["counterargument_2"],
        explanation=row["explanation"]
    )

    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": "You are an AI evaluating AI explanations."},
                          {"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=200
            )
            content = response.choices[0].message.content
        except Exception as e:
            # Best-effort: report the failure and retry instead of aborting the batch.
            print(f"API call failed (Attempt {attempt}/{max_retries}): {e}")
            if attempt < max_retries:
                # No point in waiting after the final attempt.
                time.sleep(retry_delay)
            continue

        # The message content may be None; treat that as an empty reply
        # rather than misreporting it as an API failure.
        return _parse_scores((content or "").strip())

    return dict.fromkeys(EVALUATION_METRICS)

# evaluate the explanations in a batch
def batch_evaluate(df, model="gpt-4o", delay=1):
    """Runs GPT-based evaluation on all explanations in a df.

    Args:
        df: DataFrame whose rows are passed one by one to ``evaluate_explanation``.
        model: Name of the grader model forwarded to ``evaluate_explanation``.
        delay: Seconds to pause between consecutive rows.

    Returns:
        DataFrame with one row of scores per input row, carrying the same
        index as ``df`` so the scores can be joined back onto it.
    """
    total = len(df)
    scores = []
    # Count positions with enumerate: the label yielded by iterrows() may be
    # non-contiguous (filtered frame) or non-numeric, which made "i + 1"
    # wrong or raised a TypeError in the progress message.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Evaluating explanation {position}/{total}...")
        scores.append(evaluate_explanation(row, model))
        if delay and position < total:
            # Pause between requests only; nothing follows the last row.
            time.sleep(delay)
    return pd.DataFrame(scores, index=df.index)

In [None]:
# Grade the explanations and persist the resulting score table.
# The GPT-4o run is kept below but disabled; only GPT-3.5 is evaluated.
#print("\n Evaluating GPT-4o Explanations...")
#gpt4o_scores = batch_evaluate(gpt4o_results)
#gpt4o_scores.to_csv("gpt4o_explanation_scores.csv", index=False)

print("\nEvaluating GPT-3.5 Explanations...")
gpt3_5_scores = batch_evaluate(gpt3_5_results)

# Write the GPT-3.5 scores to CSV without the index column.
gpt3_5_scores.to_csv("gpt3.5_explanation_scores.csv", index=False)
