In [1]:
from collections import defaultdict
import json
import os
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
folder_path = "../results/base/"
data = []

for root, dirs, files in os.walk(folder_path):
    for filename in files:
        if filename.endswith(".json"):
            file_path = os.path.join(root, filename)
            try:
                with open(file_path, "r") as f:
                    entry = json.load(f)
                    data.append(entry)
            except json.JSONDecodeError as e:
                print(f"Skipped invalid JSON: {file_path} ({e})")

print(f"Loaded {len(data)} valid JSON result files from '{folder_path}' and its subfolders.")

Loaded 4218 valid JSON result files from '../results/base/' and its subfolders.


In [3]:
MODEL_STRATEGIES = {
    "claude": [
        "Identity reveal", 
        "Insistent on being an assistant", 
        "Very meticulous in explanations"
    ],
    "deepseek": [
        "Identity reveal", 
        "Precise communication style", 
        "N/A (Only guessed twice)"
    ],
    "gemini": [
        "Creative storytelling", 
        "Engaging conversation", 
        "Technical and ethical acumen"
    ],
    "gpt": [
        "Highly structured", 
        "Contextual understanding", 
        "Adaptability"
    ],
    "llama": [
        "Less sophisticated", 
        "Open-source", 
        "Nuanced understanding"
    ],
    "mistral": [
        "Task capabilities", 
        "Concise and simple", 
        "Friendly"
    ],
    "qwen": [
        "China-relevant knowledge", 
        "Formal, educational", 
        "Rigid conversation"
    ]
}

In [4]:
def build_model_specific_prompt(reasoning_text: str, model_name: str) -> str:
    strategies = MODEL_STRATEGIES.get(model_name.lower(), ["Unknown"])
    
    strategies_str = "\n".join([f"- {s}" for s in strategies])
    example_json = {
        "Reasoning_behind_classification": "Explanation of your classification",
        "Strategies": strategies[:2]  # Pick first two for example
    }

    return f"""The following reasoning text was given for guessing that the other model is {model_name.title()}. 

Based on the text, classify it into one or more of the following known strategies associated with {model_name.title()}:

{strategies_str}

If none apply, use "Other".

Reasoning:
\"\"\"{reasoning_text.strip()}\"\"\"

Respond ONLY in this JSON format:
{json.dumps(example_json, indent=2)}"""


In [5]:
def extract_reasoning_tasks(data):
    """
    Extract reasoning + guessed model from LLM Identifiers Game trials.
    """
    tasks = []
    for trial in data:
        for agent_id in ['agent_0', 'agent_1']:
            answer = trial.get(f"{agent_id}_answer", {})
            guessed_model = trial.get(f"{agent_id}_guess", "").strip()
            reasoning = answer.get("reasoning", "").strip() if isinstance(answer, dict) else ""

            if guessed_model and reasoning:
                tasks.append({
                    "guesser": agent_id,
                    "guessed_model": guessed_model.lower(),
                    "reasoning": reasoning,
                    "trial_metadata": {
                        "model_0": trial.get("model_general_name_0"),
                        "model_1": trial.get("model_general_name_1"),
                        "conversation_id": trial.get("conversation_id", "unknown")
                    }
                })
    return tasks


In [6]:
import time
import json
import backoff

class OpenRouterClient:
    def __init__(self, api_key, model_name="openai/gpt-4o-mini"):
        from openai import OpenAI
        self.client = OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")
        self.model_name = model_name
        self.last_request_time = 0
        self.min_interval = 1.5  # seconds

    @backoff.on_exception(backoff.expo, Exception, max_tries=6)
    def classify(self, prompt):
        import time, json
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=300
        )

        self.last_request_time = time.time()

        content = response.choices[0].message.content.strip()
        try:
            json_start = content.find('{')
            json_end = content.rfind('}') + 1
            return json.loads(content[json_start:json_end])
        except Exception:
            return {
                "Reasoning_behind_classification": content,
                "Strategies": ["Failed to classify"]
            }


In [7]:
def classify_all(data, api_key):
    client = OpenRouterClient(api_key)
    tasks = extract_reasoning_tasks(data)
    results = []

    for task in tqdm(tasks):
        guessed_model = task['guessed_model']
        reasoning = task['reasoning']
        prompt = build_model_specific_prompt(reasoning, guessed_model)
        
        result = client.classify(prompt)
        results.append({
            "guesser": task['guesser'],
            "guessed_model": guessed_model,
            "reasoning": reasoning,
            "strategies": result.get("Strategies", []),
            "explanation": result.get("Reasoning_behind_classification", ""),
            **task["trial_metadata"]
        })

    return pd.DataFrame(results)


In [9]:
# Assuming you already have the list of dicts as `data`
api_key = "your-openrouter-api-key"
df_results = classify_all(data, api_key)
df_results.to_csv("llm_guess_strategy_classifications.csv", index=False)

0it [00:00, ?it/s]
