In [5]:
# 1. Setup config

import pandas as pd
import json
import random
import time
from typing import List, Dict, Tuple
from dataclasses import dataclass
from pathlib import Path
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(
    level=logging.WARNING,  # default to WARNING
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("SlopRankLogger")
logger.setLevel(logging.INFO)  # Our SlopRank logs at INFO


In [6]:
# 2. Configuration (EvalConfig)

@dataclass
class EvalConfig:
    """Configuration for the evaluation system."""
    model_names: List[str]
    evaluation_method: int  # e.g., 1 => numeric rating
    use_subset_evaluation: bool
    evaluators_subset_size: int
    output_dir: Path
    request_delay: float = 0.0  # adjustable delay between requests if needed
    
    def __post_init__(self):
        self.output_dir.mkdir(parents=True, exist_ok=True)
        if self.evaluation_method not in {1, 2}:
            raise ValueError("evaluation_method must be 1 or 2")
        if self.evaluators_subset_size >= len(self.model_names):
            raise ValueError("evaluators_subset_size must be < number of models")

DEFAULT_CONFIG = EvalConfig(
    model_names=[
        "gemini-2.0-flash-thinking-exp-1219",
        "gemini-exp-1206",
        "claude-3-5-sonnet-latest",
        "o1-preview",
        "gpt-4o",
        "deepseek-chat"
    ],
    evaluation_method=1,  # numeric
    use_subset_evaluation=False,
    evaluators_subset_size=3,
    output_dir=Path("results"),  # folder for CSV outputs
    request_delay=0.0
)


In [8]:
# 3. Read prompts
# We assume you have a local "prompts.xlsx" file with columns ["Questions", "Answer_key"].

from dotenv import load_dotenv
load_dotenv()  # if you have .env credentials

logger.info("Reading prompts from prompts.xlsx ...")
prompts_df = pd.read_excel("prompts.xlsx", sheet_name=0)
prompts = prompts_df["Questions"].tolist()

# If "Answer_key" column exists, read it; otherwise fallback to None
if "Answer_key" in prompts_df.columns:
    answer_keys = prompts_df["Answer_key"].tolist()
else:
    logger.warning("No Answer_key column found; using None.")
    answer_keys = [None]*len(prompts_df)

prompt_pairs = list(zip(prompts, answer_keys))
logger.info(f"Loaded {len(prompt_pairs)} prompts from Excel.")


2025-01-09 14:22:29,563 - INFO - Reading prompts from prompts.xlsx ...
2025-01-09 14:22:29,612 - INFO - Loaded 5 prompts from Excel.


In [9]:
# 4. Collecting the responses

def collect_responses(prompt_pairs: List[Tuple[str, str]], config: EvalConfig, llm_module) -> pd.DataFrame:
    """
    Query each model with each prompt, saving the raw answers.
    Return a DataFrame: (prompt, model, response, is_valid, response_time, Answer_key).
    """
    logger.info("Collecting responses...")
    responses = []
    total_start = time.time()

    # For each (prompt, answer_key) pair
    for i, (prompt, answer_key) in enumerate(prompt_pairs, 1):
        logger.info(f"Processing prompt {i}/{len(prompt_pairs)}")
        for model_name in config.model_names:
            start_time = time.time()
            logger.info(f"Querying {model_name}...")
            try:
                model = llm_module.get_model(model_name)
                raw_response = model.prompt(prompt).text()
                # Simple validation: ensure at least 10 chars
                valid = isinstance(raw_response, str) and len(raw_response.strip()) >= 10
                elapsed = time.time() - start_time
                tokens_used = len(raw_response.split())

                responses.append({
                    'prompt': prompt,
                    'model': model_name,
                    'response': raw_response if valid else None,
                    'is_valid': valid,
                    'response_time': elapsed,
                    'Answer_key': answer_key,
                    'token_count': tokens_used 
                })
                logger.info(
                    f"{model_name} responded in {elapsed:.2f}s - {'Valid' if valid else 'Invalid'}"
                )

            except Exception as e:
                elapsed = time.time() - start_time
                logger.error(f"Error from {model_name} after {elapsed:.2f}s: {str(e)}")
                responses.append({
                    'prompt': prompt,
                    'model': model_name,
                    'response': None,
                    'is_valid': False,
                    'response_time': elapsed,
                    'Answer_key': answer_key,
                    'token_count': 0
                })

            if config.request_delay > 0.0:
                time.sleep(config.request_delay)

    total_time = time.time() - total_start
    logger.info(f"All responses collected in {total_time:.2f}s")

    return pd.DataFrame(responses)


In [10]:
# 5. Collecting Raw Evaluations (Unparsed)

def collect_raw_evaluations(responses_df: pd.DataFrame, config: EvalConfig, llm_module) -> pd.DataFrame:
    """
    Each model in config.model_names evaluates (rates) the others' responses.
    We do NOT parse the JSON here.
    Instead, we store the raw text in a DataFrame so we can debug if parsing fails.
    """
    logger.info("Collecting raw evaluations (unparsed)...")
    raw_judgment_log = []

    unique_prompts = responses_df['prompt'].unique()
    for prompt in unique_prompts:
        # Subset for that prompt
        prompt_subset = responses_df[responses_df['prompt'] == prompt]
        answer_key = prompt_subset['Answer_key'].iloc[0] if 'Answer_key' in prompt_subset.columns else None

        # model_name -> text response
        prompt_responses = prompt_subset.set_index('model')['response'].to_dict()

        # Each model is the judge
        for judge_model in config.model_names:
            # Exclude judge's own or missing responses
            other_models = [
                m for m in config.model_names
                if m != judge_model and prompt_responses.get(m) is not None
            ]

            if config.use_subset_evaluation and other_models:
                other_models = random.sample(
                    other_models,
                    min(config.evaluators_subset_size, len(other_models))
                )

            if not other_models:
                continue

            # Create a prompt that each judge will see
            model_to_anon = {m: f"Model_{i+1}" for i, m in enumerate(other_models)}
            answers_section = "\n".join([
                f"{model_to_anon[m]}:\n{prompt_responses[m]}\n---"
                for m in other_models
            ])
            if answer_key:
                answer_section = f"The Answer Key here is:\n{answer_key}\n---\n"
            else:
                answer_section = ""

            instructions = f"""
IMPORTANT: Your job is to evaluate the given problem and return a complete and syntactically perfect JSON object with ratings. Format should be: {{"Model_1": 8, "Model_2": 7}}

Rate these responses to: "{prompt}"
{answer_section}

The answers to rank from other LLMs:
{answers_section}

Rate each 1-10 based on: accuracy, completeness, clarity, relevance.
10: Only for truly exceptional, world leading class
8-9: Excellent, like a top professional in the field,
6-7: Good enough, like a mediocre undergraduate student, 
4-5: Fair, like an okay enough high schooler,
1-3: Poor

Again, the format to follow diligently: {{"Model_1": 8, "Model_2": 7}}
""".strip()

            try:
                judge_llm = llm_module.get_model(judge_model)
                judge_result_obj = judge_llm.prompt(instructions)
                raw_judgment = judge_result_obj.text()

                raw_judgment_tokens = len(raw_judgment.split())
                raw_judgment_log.append({
                    "prompt": prompt,
                    "judge_model": judge_model,
                    "raw_judgment": raw_judgment,
                    "model_mapping": json.dumps(model_to_anon),
                    "raw_judgment_token_count": raw_judgment_tokens   # store as JSON
                })

            except Exception as e:
                logger.error(f"Error collecting raw eval from judge={judge_model}: {str(e)}")
                # Save partial record so we know which judge/prompt failed
                raw_judgment_log.append({
                    "prompt": prompt,
                    "judge_model": judge_model,
                    "raw_judgment": None,
                    "model_mapping": json.dumps(model_to_anon),
                    "raw_judgment_token_count": 0,
                    "error": str(e)
                })

    raw_eval_df = pd.DataFrame(raw_judgment_log)
    logger.info("Finished collecting raw evaluation outputs.")
    return raw_eval_df


In [11]:
# 6. Parsing the raw evaluations

def parse_evaluation_rows(raw_eval_df: pd.DataFrame, config: EvalConfig) -> pd.DataFrame:
    """
    Parse each row of the raw_eval_df, which contains judge's raw JSON-like output.
    If parsing fails, fallback to a default rating (4.1) for each rated model.
    
    Returns a DataFrame: (prompt, judge_model, rated_model, score).
    """
    evaluations = []

    for _, row in raw_eval_df.iterrows():
        prompt = row["prompt"]
        judge_model = row["judge_model"]
        raw_judgment = row["raw_judgment"]
        raw_judgment_tokens = row.get("raw_judgment_token_count", 0)

        # Convert model_mapping from JSON string back to dict
        try:
            model_mapping = json.loads(row["model_mapping"])  # e.g. {"gemini-exp-1206":"Model_1"}
        except:
            model_mapping = {}

        if not raw_judgment:
            # If there's no raw judgment at all, we might skip or fallback
            for real_model in model_mapping.keys():
                evaluations.append({
                    "prompt": prompt,
                    "judge_model": judge_model,
                    "rated_model": real_model,
                    "score": 4.1,           # << changed fallback
                    "parse_failed": True
                })
            logger.warning(f"No raw_judgment for prompt={prompt}, judge={judge_model}; skipping parse.")
            continue

        # Try to parse a JSON object from the raw_judgment
        try:
            start = raw_judgment.find("{")
            end = raw_judgment.rfind("}") + 1

            if start == -1 or end == 0:
                raise ValueError("No JSON object found in raw_judgment")

            data = json.loads(raw_judgment[start:end])
            # Reverse mapping: "Model_1" => "gemini-exp-1206"
            anon_to_real = {v: k for k, v in model_mapping.items()}

            for anon_id, score in data.items():
                real_model = anon_to_real.get(anon_id)
                if not real_model:
                    # If we can't find the real model name, skip
                    continue
                numeric_score = float(score)
                numeric_score = max(1.0, min(10.0, numeric_score))  # clamp 1..10

                evaluations.append({
                    "prompt": prompt,
                    "judge_model": judge_model,
                    "rated_model": real_model,
                    "score": numeric_score,
                    "parse_failed": False,
                    "raw_judgment_token_count": raw_judgment_tokens
                })

        except Exception as e:
            logger.error(f"Parsing error for judge={judge_model}, prompt={prompt}: {str(e)}")
            # If parse fails, assign a default rating
            for real_model in model_mapping.keys():
                evaluations.append({
                    "prompt": prompt,
                    "judge_model": judge_model,
                    "rated_model": real_model,
                    "score": 4.1,
                    "parse_failed": True,
                    "raw_judgment_token_count": raw_judgment_tokens
                })

    evals_df = pd.DataFrame(evaluations)
    return evals_df


In [12]:
# 7. Full workflow
import llm  # custom LLM module

# 1) Create a config
config = DEFAULT_CONFIG
logger.info(f"Using config: {config}")

# 2) Collect or load responses
resp_path = config.output_dir / "responses.csv"

if resp_path.exists():
    logger.info(f"Loading existing responses from {resp_path}")
    responses_df = pd.read_csv(resp_path)
else:
    logger.info("No responses.csv found; collecting now from each model.")
    responses_df = collect_responses(prompt_pairs, config, llm)
    responses_df.to_csv(resp_path, index=False)
    logger.info(f"Saved new responses to {resp_path}")



2025-01-09 14:22:57,267 - INFO - Using config: EvalConfig(model_names=['gemini-2.0-flash-thinking-exp-1219', 'gemini-exp-1206', 'claude-3-5-sonnet-latest', 'o1-preview', 'gpt-4o', 'deepseek-chat'], evaluation_method=1, use_subset_evaluation=False, evaluators_subset_size=3, output_dir=PosixPath('results'), request_delay=0.0)
2025-01-09 14:22:57,269 - INFO - Loading existing responses from results/responses.csv
2025-01-09 14:22:57,294 - INFO - No raw_evaluations.csv found; collecting now (unparsed).
2025-01-09 14:22:57,294 - INFO - Collecting raw evaluations (unparsed)...
2025-01-09 14:49:01,107 - INFO - Finished collecting raw evaluation outputs.
2025-01-09 14:49:01,115 - INFO - Saved raw evaluations to results/raw_evaluations.csv
2025-01-09 14:49:01,116 - INFO - Loading parsed evaluations from results/evaluations.csv
2025-01-09 14:49:01,122 - INFO - Here are the first few rows of the parsed evaluations:


Unnamed: 0,prompt,judge_model,rated_model,score
0,"Provide a summary of the gene TM2D2, including...",gemini-2.0-flash-thinking-exp-1219,deepseek-chat,5.0
1,"Provide a summary of the gene TM2D2, including...",gemini-2.0-flash-thinking-exp-1219,o1-preview,5.0
2,"Provide a summary of the gene TM2D2, including...",gemini-2.0-flash-thinking-exp-1219,gemini-exp-1206,5.0
3,"Provide a summary of the gene TM2D2, including...",gemini-exp-1206,o1-preview,8.0
4,"Provide a summary of the gene TM2D2, including...",gemini-exp-1206,gemini-2.0-flash-thinking-exp-1219,7.0


In [None]:
# 8. Collect or load raw evaluations
raw_eval_path = config.output_dir / "raw_evaluations.csv"

if raw_eval_path.exists():
    logger.info(f"Loading existing raw evaluations from {raw_eval_path}")
    raw_eval_df = pd.read_csv(raw_eval_path)
else:
    logger.info("No raw_evaluations.csv found; collecting now (unparsed).")
    raw_eval_df = collect_raw_evaluations(responses_df, config, llm)
    raw_eval_df.to_csv(raw_eval_path, index=False)
    logger.info(f"Saved raw evaluations to {raw_eval_path}")


In [None]:
# 9. Parse or load final evaluations
eval_path = config.output_dir / "evaluations.csv"

if eval_path.exists():
    logger.info(f"Loading parsed evaluations from {eval_path}")
    evaluations_df = pd.read_csv(eval_path)
else:
    logger.info("No evaluations.csv found; parsing raw evaluations now.")
    evaluations_df = parse_evaluation_rows(raw_eval_df, config)
    evaluations_df.to_csv(eval_path, index=False)
    logger.info(f"Saved parsed evaluations to {eval_path}")


# 10. Inspect or analyze the final numeric scores
logger.info("Here are the first few rows of the parsed evaluations:")
display(evaluations_df.head())
