In [1]:
"""
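SlopRank: models answer a set of prompts, judge each other's answers, and are
ranked by PageRank over the resulting endorsement graph.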
"""

import pandas as pd
import time
from dotenv import load_dotenv
import llm
import networkx as nx
import json
import random

# Load variables from a local .env file into the process environment
# (presumably the provider API keys that `llm` needs -- confirm).
load_dotenv()

##############################################################################
# 1. Configuration
##############################################################################

# Names of the models that both answer the prompts and judge each other.
MODEL_NAMES = [
    "gemini-2.0-flash-thinking-exp-1219",
    "claude-3-5-sonnet-latest",
    "o1-preview",
    "deepseek-chat"
]

# Model handles returned by `llm.get_model`, keyed by model name.
# Built once when this module is executed.
model_objects = {m: llm.get_model(m) for m in MODEL_NAMES}

EVALUATION_METHOD = 1  # 1 (numeric 1–10) or 2 (Upvote/Downvote)
USE_SUBSET_EVALUATION = False  # Toggle to use partial evaluation
# When USE_SUBSET_EVALUATION is True, each judge rates only this many randomly
# chosen other models instead of all of them, which reduces API calls.
EVALUATORS_SUBSET_SIZE = 2

##############################################################################
# 2. Prompting functions
##############################################################################

def query_model(prompt, model_name):
    """
    Look up `model_name` in `model_objects`, send it `prompt`, and return the
    text of the reply.
    """
    model = model_objects[model_name]
    reply = model.prompt(prompt)
    return reply.text()

def query_model_all(df, model_name):
    """
    Ask one model every prompt in `df`.

    Prompts are read from the "prompt" column, stripped and lower-cased, and
    each is sent to `model_name`. The answers are written in place to a new
    column "response_{model_name}"; the elapsed wall-clock time is printed.
    Returns the same DataFrame.
    """
    started_at = time.time()

    def ask(text):
        # One API round-trip per prompt.
        return query_model(text, model_name)

    normalized = df["prompt"].str.strip().str.lower()  # Example: simple transform
    df[f"response_{model_name}"] = normalized.map(ask)

    print(f"{model_name} processing time:", time.time() - started_at)
    return df

def gather_all_model_responses(raw_prompts):
    """
    Build a DataFrame with a "prompt" column from `raw_prompts`, then let every
    model in MODEL_NAMES add its own response column via `query_model_all`.
    Returns the resulting DataFrame.
    """
    responses = pd.DataFrame({"prompt": raw_prompts})
    for name in MODEL_NAMES:
        responses = query_model_all(responses, name)
    return responses

##############################################################################
# 3. Evaluate the responses
##############################################################################

def evaluate_responses(df):
    """
    Have every model judge the other models' answers, then rank with PageRank.

    For each row of `df` (one prompt plus a "response_{model}" column per
    model), each model in MODEL_NAMES acts as judge for the remaining models.
    Its parsed verdicts are accumulated as edge weights judge -> rated model
    in a directed graph.

    EVALUATION_METHOD=1 -> numeric rating
    EVALUATION_METHOD=2 -> upvote/downvote

    With USE_SUBSET_EVALUATION=True a judge only sees a random sample of
    EVALUATORS_SUBSET_SIZE other models (when there are more than that).

    Returns (graph, ranking), where ranking is a list of
    (model_name, pagerank_score) pairs sorted by score, highest first.
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(MODEL_NAMES)

    for _, record in df.iterrows():
        question = record["prompt"]

        # Collect what each model answered for this prompt.
        answers = {}
        for name in MODEL_NAMES:
            answers[name] = record.get(f"response_{name}", "No response")

        for judge in MODEL_NAMES:
            # A model never rates its own answer.
            candidates = [name for name in MODEL_NAMES if name != judge]

            if USE_SUBSET_EVALUATION and len(candidates) > EVALUATORS_SUBSET_SIZE:
                # Randomly pick a smaller subset
                candidates = random.sample(candidates, EVALUATORS_SUBSET_SIZE)

            if candidates:
                meta_prompt = build_evaluation_prompt(
                    EVALUATION_METHOD, question, judge, answers, candidates
                )
                verdict_text = query_model(meta_prompt, judge)
                verdicts = parse_evaluation_output(
                    EVALUATION_METHOD, verdict_text, candidates
                )

                # Fold this round's endorsements into the running edge weights.
                for rated, value in verdicts.items():
                    if not graph.has_edge(judge, rated):
                        graph.add_edge(judge, rated, weight=value)
                        continue
                    graph[judge][rated]["weight"] += value

    # Compute PageRank
    scores = nx.pagerank(graph, weight="weight")
    ranking = list(scores.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return graph, ranking

def build_evaluation_prompt(method, prompt, judge_model, model_responses, other_models):
    """
    Build a meta-prompt that instructs 'judge_model' to evaluate the other models' answers.

    Parameters:
        method: 1 asks for a numeric 1-10 rating per model; any other value
            asks for "Upvote"/"Downvote" per model.
        prompt: the original question the models answered.
        judge_model: name of the judging model, inserted into the prompt.
        model_responses: mapping of model name -> that model's answer text.
        other_models: names of the models whose answers are shown to the judge.

    Returns:
        The prompt text. Both variants ask for a JSON object and tell the
        judge to use each model's name as the key.

    Raises:
        KeyError: if a name in other_models is missing from model_responses.
    """
    answers_section = "".join(
        f"**Model {om}**: {model_responses[om]}\n\n" for om in other_models
    )

    # Part shared by both evaluation methods.
    header = (
        f"You are {judge_model}. You see the following question:\n"
        f'PROMPT: "{prompt}"\n'
        "\n"
        "Here are other models' answers:\n"
        "\n"
        f"{answers_section}\n"
        "\n"
    )

    if method == 1:
        # Numeric rating approach
        task = (
            "Give each model a rating from 1 to 10, strictly in JSON format with no extra text.\n"
            "Example:\n"
            "{\n"
            '    "modelA": 8,\n'
            '    "modelB": 5\n'
            "}\n"
        )
    else:
        # Upvote/Downvote approach
        task = (
            'Simply say "Upvote" or "Downvote" for each model, in JSON format, no extra text.\n'
            "Example:\n"
            "{\n"
            '    "modelA": "Upvote",\n'
            '    "modelB": "Downvote"\n'
            "}\n"
        )

    # The key instruction applies to both methods: the reply is looked up by
    # model name when parsed, so the judge must be told which keys to use.
    return f"{header}{task}Use each model's name as the key."

def _extract_json_object(raw_text):
    """Return the JSON object found in raw_text as a dict, or None if there is none."""
    try:
        data = json.loads(raw_text)
    except (TypeError, ValueError):
        data = None

    if not isinstance(data, dict) and isinstance(raw_text, str):
        # Judges often wrap the JSON in a code fence or add a sentence around
        # it; retry on the outermost {...} span before giving up.
        start, end = raw_text.find("{"), raw_text.rfind("}")
        if 0 <= start < end:
            try:
                data = json.loads(raw_text[start:end + 1])
            except ValueError:
                data = None

    return data if isinstance(data, dict) else None


def parse_evaluation_output(method, raw_judgment, other_models):
    """
    Convert the raw text from the judge model into a dict of endorsements.

    Parameters:
        method: 1 -> numeric 1–10 rating; anything else -> upvote/downvote.
        raw_judgment: the judge's reply text, expected to contain a JSON
            object keyed by model name.
        other_models: names of the models that were rated.

    Returns:
        Dict with one entry per name in other_models.
        method 1: a float clamped to [1, 10]; 5.0 when the reply or the
            individual rating cannot be read.
        otherwise: 1 for "upvote" (case-insensitive, surrounding whitespace
            ignored), 0 for everything else, including unreadable replies.
    """
    import math

    neutral = 5.0 if method == 1 else 0

    data = _extract_json_object(raw_judgment)
    if data is None:
        # Nothing usable in the reply: every model gets the fallback value.
        return {m: neutral for m in other_models}

    endorsement_map = {}
    for m in other_models:
        val = data.get(m)
        if method == 1:
            try:
                score = float(val)
            except (TypeError, ValueError, OverflowError):
                endorsement_map[m] = neutral
                continue
            if math.isnan(score):
                # NaN slips through both clamps and would end up as a NaN
                # edge weight; treat it like an unreadable rating.
                endorsement_map[m] = neutral
            else:
                endorsement_map[m] = min(max(score, 1.0), 10.0)
        else:
            # Upvote/Downvote: only an explicit upvote counts as an endorsement.
            is_upvote = isinstance(val, str) and val.lower().strip() == "upvote"
            endorsement_map[m] = 1 if is_upvote else 0

    return endorsement_map

##############################################################################
# 4. Main
##############################################################################

# Script entry point: collect every model's answers to a few sample prompts,
# let the models judge each other, and print the resulting PageRank ranking.
# Each model is queried once per prompt for answers and once more per prompt
# as a judge, so running this makes real API calls.
if __name__ == "__main__":
    # Sample prompts; replace with your own list to evaluate other tasks.
    raw_prompts = [
        "Name the state capitals of states starting with 'C'. Then tell me what's bigger, 9.11 or 9.9?",
        "What is the meaning of life?",
        "Write a poem about Shakespeare's impact on modern economics."
    ]

    # 1) Gather responses for each model
    df_responses = gather_all_model_responses(raw_prompts)

    # 2) Evaluate (endorse) each other's answers and compute PageRank
    G, ranked = evaluate_responses(df_responses)

    # `ranked` is already sorted by score, highest first.
    print("\n=== PageRank Scores ===")
    for model, score in ranked:
        print(f"{model}: {score:.4f}")

    # Inspect edges if you want:
    # for edge in G.edges(data=True):
    #     print(edge)


gemini-2.0-flash-thinking-exp-1219 processing time: 25.186904668807983
claude-3-5-sonnet-latest processing time: 9.704782009124756
o1-preview processing time: 36.931774854660034
deepseek-chat processing time: 15.889774799346924

=== PageRank Scores ===
gemini-2.0-flash-thinking-exp-1219: 0.2563
deepseek-chat: 0.2508
o1-preview: 0.2474
claude-3-5-sonnet-latest: 0.2455
