In [1]:
from collections import defaultdict
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

In [2]:
# Load data
folder_path = "../results/base/"
data = []

for root, dirs, files in os.walk(folder_path):
    for filename in files:
        if filename.endswith(".json"):
            file_path = os.path.join(root, filename)
            try:
                with open(file_path, "r") as f:
                    entry = json.load(f)
                    data.append(entry)
            except json.JSONDecodeError as e:
                print(f"Skipped invalid JSON: {file_path} ({e})")

print(f"Loaded {len(data)} valid JSON result files from '{folder_path}' and its subfolders.")

Loaded 4218 valid JSON result files from '../results/base/' and its subfolders.


In [3]:
# ------------------------------------------------------------------
# 1.  SETUP – strategy dictionary and OpenRouter helper
# ------------------------------------------------------------------
import json, time, backoff, pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
# ••• fill in your key here •••
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# 3 strategies per model (use None for “N/A” placeholders)
MODEL_STRATEGIES = {
    "claude":  ["Identity reveal",
                "Insistent on being an assistant",
                "Very meticulous in explanations"],
    "deepseek":["Identity reveal",
                "Precise communication style",
                None],                            # only two real ones
    "gemini":  ["Creative storytelling",
                "Engaging conversation",
                "Technical and ethical acumen"],
    "gpt":     ["Highly structured",
                "Contextual understanding",
                "Adaptability"],
    "llama":   ["Less sophisticated",
                "Open-source",
                "Nuanced understanding"],
    "mistral": ["Task capabilities",
                "Concise and simple",
                "Friendly"],
    "qwen":    ["China-relevant knowledge",
                "Formal, educational",
                "Rigid conversation"],
}

# helper → nice list without Nones
def strategies_for(model):
    return [s for s in MODEL_STRATEGIES[model] if s]

class ORClient:
    def __init__(self, api_key, model="google/gemini-2.5-flash-preview", min_interval=1.5):
        self.cli = OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")
        self.model = model
        self.min_interval = min_interval  # <- critical line
        self.last = 0

    @backoff.on_exception(backoff.expo, Exception, max_tries=6)
    def classify(self, prompt):
        wait = self.min_interval - (time.time() - self.last)
        if wait > 0:
            time.sleep(wait)
    
        res = self.cli.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
            temperature=0,
        )
        self.last = time.time()
        text = res.choices[0].message.content.strip()
    
        # Try extracting the JSON object
        try:
            # Remove triple-backtick code block wrappers
            if text.startswith("```json"):
                text = text[7:].strip("` \n")
            elif text.startswith("```"):
                text = text[3:].strip("` \n")
    
            # Find first { ... }
            json_start = text.find("{")
            json_end = text.rfind("}") + 1
            json_str = text[json_start:json_end]
    
            return json.loads(json_str)

        except Exception as e:
            print("\n--- Failed to parse LLM output ---")
            print(text)
            print("----------------------------------\n")
            return {
                "Reasoning_behind_classification": f"Failed to parse: {text}",
                "Strategies": ["Failed to parse"]
            }


# ------------------------------------------------------------------
# 2.  BUILD THE CLASSIFICATION PROMPT
# ------------------------------------------------------------------
def build_prompt(reasoning, guessed_model):
    allowed = strategies_for(guessed_model)
    allowed_str = "\n".join(f"{i+1}. {s}" for i, s in enumerate(allowed))
    return f"""You are judging WHY a language model guessed another model.

The guessed model: **{guessed_model}**

Classify the justification below into ANY of these strategies
(you may output multiple, or "Other" if none fit):

{allowed_str}

Justification:
\"\"\"{reasoning}\"\"\"

Return ONLY valid JSON like:
{{  
  "Reasoning_behind_classification": "...explanation...",
  "Strategies": ["{allowed[0]}", "{allowed[1]}"]
}}"""

# ------------------------------------------------------------------
# 3.  EXTRACT (reasoning, guess) PAIRS FROM ALL TRIALS
# ------------------------------------------------------------------
def extract_pairs(trials):
    pairs = []
    for t in trials:
        for agent_id in ["agent_0", "agent_1"]:
            reasoning = (t.get(f"{agent_id}_answer") or "").strip()
            guess = (t.get(f"{agent_id}_guess") or "").strip().lower()
            if reasoning and guess in MODEL_STRATEGIES:
                pairs.append({
                    "guesser": agent_id,
                    "guessed_model": guess,
                    "reasoning": reasoning,
                    "trial_id": t.get("seed", "na")
                })
    return pairs

# ------------------------------------------------------------------
# 4.  RUN EVERYTHING
# ------------------------------------------------------------------
from tqdm import tqdm  # for notebook-style progress bar

def run_classification(all_trials):
    client = ORClient(OPENROUTER_API_KEY)
    pairs = extract_pairs(all_trials)
    print(len(pairs))
    rows = []

    print(f"Classifying {len(pairs)} guess reasonings...\n")

    for p in tqdm(pairs, desc="Classifying guesses"):
        prompt = build_prompt(p["reasoning"], p["guessed_model"])
        result = client.classify(prompt)
        rows.append({
            **p,
            "strategies": ", ".join(result.get("Strategies", [])),
            "judge_explanation": result.get("Reasoning_behind_classification", "")
        })

    return pd.DataFrame(rows)

import concurrent.futures
from tqdm import tqdm
import math

def run_classification_parallel(all_trials, api_key, max_workers=4):
    from copy import deepcopy

    pairs = extract_pairs(all_trials)
    print(f"Classifying {len(pairs)} guess reasonings using {max_workers} threads...\n")

    # Shared input split into batches
    batch_size = math.ceil(len(pairs) / max_workers)
    batches = [pairs[i:i + batch_size] for i in range(0, len(pairs), batch_size)]

    # Create clients per worker
    clients = [ORClient(api_key) for _ in range(len(batches))]

    results = []

    def classify_batch(batch, client):
        batch_results = []
        for p in batch:
            prompt = build_prompt(p["reasoning"], p["guessed_model"])
            result = client.classify(prompt)
            batch_results.append({
                **p,
                "strategies": ", ".join(result.get("Strategies", [])),
                "judge_explanation": result.get("Reasoning_behind_classification", "")
            })
        return batch_results

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(classify_batch, batch, clients[i])
            for i, batch in enumerate(batches)
        ]

        results = []
        with tqdm(total=len(futures), desc="Classifying batches") as pbar:
            for f in concurrent.futures.as_completed(futures):
                results.extend(f.result())
                pbar.update(1)

    return pd.DataFrame(results)

# ------------------------------------------------------------------
# 5.  DO IT 🚀
# ------------------------------------------------------------------
# classified_df = run_classification(data)   # <-- your list of dicts
# classified_df.to_csv("guess_strategy_labels.csv", index=False)
# classified_df.head()
classified_df = run_classification_parallel(data, OPENROUTER_API_KEY, max_workers=16)
classified_df.to_csv("classified_model_guess_strategies.csv", index=False)

Classifying 8436 guess reasonings using 16 threads...



Classifying batches:   0%|                                                                                                                                              | 0/16 [00:07<?, ?it/s]


AttributeError: 'ORClient' object has no attribute 'min_interval'