In [1]:
from query import build_context, QueryClaude, QueryGPT, QueryGPTJudge, QueryGemini, QueryVertexAI, GPT_4_1_MINI, GPT_5, GEMINI_PRO, LLAMA_SCOUT, CLAUDE_SONNET_4
import pandas as pd
import os 
from tqdm import tqdm
tqdm.pandas()
import fsspec

MODELS = {
    GPT_5: QueryGPT(GPT_5),
    GEMINI_PRO: QueryGemini(GEMINI_PRO),
    LLAMA_SCOUT: QueryVertexAI(LLAMA_SCOUT)
}

In [None]:
prompts = pd.read_csv("../data/prompts.csv")
prompts = prompts[prompts["prompt_id"].str.startswith("aita")]

# Zero-Shot

In [None]:
NUM_ZERO_SHOT = 1

#MODELS[GPT_5].build_and_run_batch(prompts, ["zero-shot"], [[]], NUM_ZERO_SHOT)
#MODELS[GEMINI_PRO].build_and_run_batch(prompts, ["zero-shot"], [[]], NUM_ZERO_SHOT)
#MODELS[LLAMA_SCOUT].build_and_run_batch(prompts, ["zero-shot"], [[]], NUM_ZERO_SHOT)

In [None]:
LLAMA_SCOUT

In [None]:
model_batches = {
    #GPT_5: "batch_6913421f99b08190a6b758ec627d6199",
    #GEMINI_PRO: "projects/451496912721/locations/global/batchPredictionJobs/9112418119472644096"
    LLAMA_SCOUT: "projects/451496912721/locations/us-central1/batchPredictionJobs/1550619309959544832"
}

for k,v in model_batches.items():
    results = MODELS[k].get_batch_results(v) 
    #results.to_csv(f"../data/responses/aita_without_context_{k}.csv", index=False)

In [None]:
'{"name": "projects/451496912721/locations/us-central1/batchPredictionJobs/1550619309959544832", "displayName": "5fcdb22e-2ca3-4d3e-9f09-beb92a862d20", "model": "publishers/meta/models/llama-4-scout-17b-16e-instruct-maas", "inputConfig": {"instancesFormat": "jsonl", "gcsSource": {"uris": ["gs://sj-batch-eval/batch_input_5fcdb22e-2ca3-4d3e-9f09-beb92a862d20.jsonl"]}}, "outputConfig": {"predictionsFormat": "jsonl", "gcsDestination": {"outputUriPrefix": "gs://sj-batch-eval/{batch_id}"}}, "state": "JOB_STATE_PENDING", "createTime": "2025-11-11T20:32:07.277726Z", "updateTime": "2025-11-11T20:32:07.277726Z", "encryptionSpec": {}, "modelVersionId": "1"}'


from query import get_gcloud_access_token
access_token = get_gcloud_access_token()
import requests
project_id = "projects/451496912721/locations/us-central1/batchPredictionJobs/1550619309959544832"
url = f"https://us-central1-aiplatform.googleapis.com/v1/{project_id}"
headers = {
    "Authorization": f"Bearer {access_token}"
}
response = requests.get(url, headers=headers)

print(response.json())  

# With-Context

In [None]:
interactions = pd.read_csv("../data/interactions.csv") 
participants = pd.read_csv("../data/participants.csv")
interactions = interactions[interactions["user_id"].isin(participants["user_id"])]

context_ids = []
contexts = []

for user_id in interactions["user_id"].unique():
    user_interaction = interactions[interactions["user_id"]==user_id]
    user_interaction = user_interaction.sort_values(by="timestamp")

    #for cutoff in [10000, 20000, 30000, 40000, 50000]:
    #    if cutoff > user_interaction["tokens"].max():
    #        continue
    #    interaction_chunk = user_interaction[user_interaction["tokens"]<=cutoff]
    #   context_ids.append(user_id+"-"+str(int(interaction_chunk["tokens"].max())))
    #   contexts.append(build_context(interaction_chunk))

    context_ids.append(user_id+"-"+str(user_interaction["tokens"].max()))
    contexts.append(build_context(user_interaction))

In [None]:
NUM_ITERATIONS = 3


MODELS[GPT_5].build_and_run_batch(prompts, context_ids, contexts, NUM_ITERATIONS)
#MODELS[GEMINI_PRO].build_and_run_batch(prompts, context_ids, contexts, NUM_ITERATIONS)


In [None]:
model_batches = {
    #GPT_5: ["batch_69134a7f1ea0819082c3b89cdd851025", "batch_6913954189988190b3d84565ee8c3f0a"],
    #GEMINI_PRO: ["projects/451496912721/locations/global/batchPredictionJobs/4892545268626489344"]
}
for model,files in model_batches.items():
    results = []
    for file in files:
        results.append(MODELS[model].get_batch_results(file))
    results = pd.concat(results)
    results.to_csv(f"../data/responses/aita_with_context_{model}.csv", index=False)

# Judge

In [None]:
df = pd.read_csv("../data/responses.csv")
MODEL = "claude-sonnet-4-20250514"
df = df[df["model"]==MODEL]
df = df[df["prompt_id"].str.startswith("aita")]
participants = pd.read_csv("../data/participants.csv")
participants = participants[participants["passed_attention"]=="yes"]
df = df[df["iteration"]<=37]
with_context = df[(df["context"].isin(participants["user_id"]))]
with_context.to_csv(f"../data/responses/aita_with_context_{MODEL}.csv", index=False)
without_context = df[(df["context"]=="zero-shot")]
without_context.to_csv(f"../data/responses/aita_without_context_{MODEL}.csv", index=False)

In [None]:
MODEL = "gpt-4.1-mini-2025-04-14"
df = pd.concat([pd.read_csv(f"../data/responses/aita_with_context_{MODEL}.csv"), 
      pd.read_csv(f"../data/responses/aita_without_context_{MODEL}.csv")])
judge = QueryGPTJudge()
#judge.build_and_run_batch(df)

In [27]:
model_batches = {
    #GPT_5: "batch_6913a589c0f481908ce9a3d8060c7982",
    #GEMINI_PRO: "batch_6913a648ee048190a20c337bb5505838",
    GPT_4_1_MINI: "batch_6913c046d0608190a063ff5e2b29aed8"
    #CLAUDE_SONNET_4: "batch_6913c036f4808190a026ad0546a0650f"
}
judge = QueryGPTJudge()

for model,batch in model_batches.items():
    results = judge.get_batch_results(batch) 
    results.to_csv(f"../data/judge_responses/aita_{model}.csv", index=False)

ValueError: Expected a non-empty value for `file_id` but received None

In [28]:
MODEL_NAMES = [
    GEMINI_PRO,
    GPT_5,
    CLAUDE_SONNET_4
]

responses = []
judge_labels = []
for m in MODEL_NAMES:
    responses.append(pd.read_csv(f"../data/responses/aita_with_context_{m}.csv"))
    responses.append(pd.read_csv(f"../data/responses/aita_without_context_{m}.csv"))
    judge_labels.append(pd.read_csv(f"../data/judge_responses/aita_{m}.csv"))

responses = pd.concat(responses)
judge_labels = pd.concat(judge_labels)

responses["id"] = responses["prompt_id"] + responses["context"] + responses["model"] + responses["iteration"].astype(str)
judge_labels["id"] = judge_labels["prompt_id"] + judge_labels["context"] + judge_labels["model"] + judge_labels["iteration"].astype(str)
df = responses.merge(judge_labels[["id", "gpt_judge"]], on=["id"], how="inner")
df = df.drop(columns=["id"])
df.to_csv("../data/responses_aita.csv", index=False)

In [7]:
df = pd.read_csv
df["context"] = df["context"].str.strip()
df["context"] = (df["context"]=="zero-shot").astype(int)
df["gpt_judge"] = df["gpt_judge"]
df.groupby(["model", "context"])["gpt_judge"].mean()

model                     context
claude-sonnet-4-20250514  0          0.642105
                          1          0.593860
gemini-2.5-pro            0          0.700000
                          1          0.600877
gpt-5-2025-08-07          0          0.536842
                          1          0.559649
Name: gpt_judge, dtype: float64