# In [None]:
"""
SlopRank
"""
import json
import logging
import math
import random
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Tuple

import networkx as nx
import pandas as pd

# Logging setup: the root logger is configured at WARNING, while the
# dedicated "SlopRankLogger" is lowered to INFO so that this module's
# progress messages are emitted.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.WARNING,
)
logger = logging.getLogger("SlopRankLogger")
logger.setLevel(logging.INFO)

@dataclass
class EvalConfig:
    """Configuration for the evaluation system.

    Attributes:
        model_names: Names of the models that both answer and judge.
        evaluation_method: 1 for numeric scoring, 2 for ranking.
        use_subset_evaluation: If True, each judge rates only a random
            subset of the other models instead of all of them.
        evaluators_subset_size: Number of models each judge rates when
            subset evaluation is enabled.
        output_dir: Directory for result files; created (with parents) if
            missing. A plain string is accepted and converted to ``Path``.
        request_delay: Seconds to sleep between model requests.

    Raises:
        ValueError: If ``evaluation_method`` is not 1 or 2, if
            ``evaluators_subset_size`` is not smaller than the number of
            models, or if subset evaluation is enabled with a subset size
            below 1.
    """
    model_names: List[str]
    evaluation_method: int  # 1 for numeric, 2 for ranking
    use_subset_evaluation: bool
    evaluators_subset_size: int
    output_dir: Path
    request_delay: float = 0.0  # adjustable delay between requests if needed

    def __post_init__(self):
        # Validate first so that an invalid configuration does not leave an
        # output directory behind on disk.
        if self.evaluation_method not in {1, 2}:
            raise ValueError("evaluation_method must be 1 or 2")
        if self.evaluators_subset_size >= len(self.model_names):
            raise ValueError("evaluators_subset_size must be less than number of models")
        # A subset size of 0 would produce no evaluations at all, and a
        # negative one would make random.sample() raise much later.
        if self.use_subset_evaluation and self.evaluators_subset_size < 1:
            raise ValueError(
                "evaluators_subset_size must be at least 1 when "
                "use_subset_evaluation is enabled"
            )

        # Accept str paths as well as Path objects.
        self.output_dir = Path(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

# Default setup: six models, numeric (1-10) scoring, each judge rating a
# random subset of three other models, results written under ./results,
# and a half-second pause between requests.
DEFAULT_CONFIG = EvalConfig(
    output_dir=Path("results"),
    evaluation_method=1,
    use_subset_evaluation=True,
    evaluators_subset_size=3,
    request_delay=0.5,
    model_names=[
        "gemini-2.0-flash-thinking-exp-1219",
        "gemini-exp-1206",
        "claude-3-5-sonnet-latest",
        "o1-preview",
        "gpt-4o",
        "deepseek-chat",
    ],
)

class SlopRank:
    """
    Peer-evaluation pipeline that ranks language models with PageRank.

    Stages (driven by :meth:`run`):
    1. Collect a response from every configured model for every prompt.
    2. Let each model judge other models' responses (all of them, or a
       random subset).
    3. Build a weighted directed graph with edges judge -> rated model.
    4. Run PageRank on that graph to rank the models.

    The ``llm_module`` argument used throughout must provide
    ``get_model(name)`` returning an object whose ``prompt(text)`` result
    has a ``text()`` method.

    Attributes:
        config: The EvalConfig in use.
        responses_df: DataFrame of collected responses (None until collected).
        evaluations_df: DataFrame of judgments (None until evaluated).
    """
    def __init__(self, config: EvalConfig = DEFAULT_CONFIG):
        self.config = config
        self.responses_df = None
        self.evaluations_df = None

    def _validate_response(self, response: str) -> bool:
        """Return True if *response* is a string with at least 10 non-padding characters."""
        if not isinstance(response, str):
            return False
        return len(response.strip()) >= 10

    def collect_responses(self, prompts: List[str], llm_module) -> pd.DataFrame:
        """
        Query every configured model with every prompt.

        Failures and invalid answers are recorded (``response`` = None,
        ``is_valid`` = False) rather than raised, so one bad model does not
        stop the run.

        Returns:
            DataFrame with columns ``prompt``, ``model``, ``response``,
            ``is_valid`` and ``response_time``; also stored on
            ``self.responses_df``.
        """
        logger.info("Collecting responses...")
        responses = []
        total_start = time.time()

        for i, prompt in enumerate(prompts, 1):
            logger.info(f"Processing prompt {i}/{len(prompts)}")
            for model_name in self.config.model_names:
                start_time = time.time()
                logger.info(f"Querying {model_name}...")
                response = None
                valid = False
                try:
                    model = llm_module.get_model(model_name)
                    response = model.prompt(prompt).text()
                    valid = self._validate_response(response)
                except Exception as e:
                    # Any provider/network error is recorded as an invalid response.
                    elapsed = time.time() - start_time
                    logger.error(f"Error from {model_name} after {elapsed:.2f}s: {str(e)}")
                    response = None
                    valid = False
                else:
                    elapsed = time.time() - start_time
                    logger.info(f"{model_name} responded in {elapsed:.2f}s - "
                                f"{'Valid' if valid else 'Invalid'}")

                responses.append({
                    'prompt': prompt,
                    'model': model_name,
                    'response': response if valid else None,
                    'is_valid': valid,
                    'response_time': elapsed
                })

                # Respect optional delay to avoid rate limits
                if self.config.request_delay > 0.0:
                    time.sleep(self.config.request_delay)

        total_time = time.time() - total_start
        logger.info(f"All responses collected in {total_time:.2f}s")
        self.responses_df = pd.DataFrame(responses)
        return self.responses_df

    def _create_evaluation_prompt(self, prompt: str, responses: Dict[str, str]) -> Tuple[str, Dict[str, str]]:
        """
        Build the prompt shown to a judge model.

        Model names are replaced with anonymous labels ("Model_1",
        "Model_2", ...) in the order of *responses*.

        Returns:
            A tuple ``(instructions, model_to_anon)`` where ``model_to_anon``
            maps each real model name to its anonymous label.
        """
        model_to_anon = {m: f"Model_{i+1}" for i, m in enumerate(responses.keys())}
        answers_section = "\n".join([
            f"{model_to_anon[m]}:\n{resp}\n---"
            for m, resp in responses.items()
        ])

        if self.config.evaluation_method == 1:
            instructions = f"""IMPORTANT: Return only a JSON object with ratings.

            Rate these responses to: "{prompt}"

            {answers_section}

            Rate 1-10 based on: accuracy, completeness, clarity, relevance, depth, usefulness
            10: Exceptional, 8-9: Excellent, 6-7: Good, 4-5: Fair, 1-3: Poor

            Format: {{"Model_1": 8, "Model_2": 7}}
            """
        else:
            instructions = f"""IMPORTANT: Return only a JSON object with rankings.

            Rank these responses to: "{prompt}"

            {answers_section}

            Rank from best (1) to worst. No ties allowed.
            Consider: accuracy, completeness, clarity, relevance, depth, usefulness

            Format: {{"Model_1": 1, "Model_2": 2}}
            """

        return instructions.strip(), model_to_anon

    def _parse_evaluation(self, raw_judgment: str, model_mapping: Dict[str, str]) -> Dict[str, float]:
        """
        Extract per-model scores from a judge's raw text answer.

        The text between the first "{" and the last "}" is parsed as JSON.
        Keys that are not known anonymous labels are ignored. Numeric scores
        are clamped to [1.0, 10.0]; rankings are converted to ``int``.

        If the answer cannot be parsed (no JSON object, malformed JSON, a
        non-numeric or non-finite value), every model in *model_mapping*
        receives a default: 5.0 in numeric mode, or the number of rated
        models (i.e. the worst rank) in ranking mode.

        Args:
            raw_judgment: Text returned by the judge model.
            model_mapping: Real model name -> anonymous label.

        Returns:
            Dict mapping real model name to its score or rank.
        """
        try:
            start = raw_judgment.find("{")
            end = raw_judgment.rfind("}") + 1
            if start == -1 or end == 0:
                raise ValueError("No JSON object found.")

            data = json.loads(raw_judgment[start:end])
            anon_to_model = {v: k for k, v in model_mapping.items()}

            results = {}
            for anon_id, score in data.items():
                model_name = anon_to_model.get(anon_id)
                if not model_name:
                    continue

                if self.config.evaluation_method == 1:
                    numeric_score = float(score)
                    # json.loads accepts NaN/Infinity; NaN would slip through
                    # the min/max clamp below and come out as 10.0.
                    if not math.isfinite(numeric_score):
                        raise ValueError(f"Non-finite score for {anon_id}: {score!r}")
                    # Numeric scores, clamp between 1.0 and 10.0
                    results[model_name] = max(1.0, min(10.0, numeric_score))
                else:
                    # Rankings must be integers
                    results[model_name] = int(score)

            return results

        except (AttributeError, TypeError, ValueError, OverflowError) as e:
            # json.JSONDecodeError is a ValueError subclass, so it lands here too.
            logger.error(f"Error parsing evaluation: {str(e)}")
            # Fallback to default
            if self.config.evaluation_method == 1:
                # default score 5.0 for numeric
                return {m: 5.0 for m in model_mapping}
            # default rank = "number of items" for ranking
            # so everything is equally "worst"
            return {m: len(model_mapping) for m in model_mapping}

    def _endorsement_weight(self, score: float, num_rated: int) -> float:
        """
        Convert a parsed score into a graph edge weight (higher = better).

        Numeric scores are used as-is. Ranks are inverted so that rank 1
        (best) among ``num_rated`` models yields the largest weight and the
        worst rank yields 1; out-of-range ranks are clamped to
        [1, num_rated] so weights are never zero or negative.
        """
        if self.config.evaluation_method == 1:
            return score
        rank = max(1, min(num_rated, int(score)))
        return float(num_rated - rank + 1)

    def evaluate_responses(self, responses_df: pd.DataFrame, llm_module) -> Tuple[nx.DiGraph, pd.DataFrame]:
        """
        Have each model judge the others' responses and build the graph.

        For every prompt and every judge, the judge rates the other models
        that produced a string response (optionally a random subset). Each
        judgment adds weight to the edge judge -> rated model.

        Args:
            responses_df: DataFrame with ``prompt``, ``model`` and
                ``response`` columns, as returned by
                :meth:`collect_responses`.
            llm_module: Module used to query the judge models.

        Returns:
            A tuple ``(graph, evaluations_df)``. ``evaluations_df`` has the
            columns ``prompt``, ``judge_model``, ``rated_model`` and
            ``score`` (the raw parsed score or rank, not the edge weight).
        """
        logger.info("Evaluating responses...")
        G = nx.DiGraph()
        G.add_nodes_from(self.config.model_names)
        evaluations = []

        # For each unique prompt
        for prompt in responses_df['prompt'].unique():
            prompt_subset = responses_df[responses_df['prompt'] == prompt]
            prompt_responses = prompt_subset.set_index('model')['response'].to_dict()

            # For each model that will serve as the judge
            for judge_model in self.config.model_names:
                # Exclude the judge itself and anything that is not a real
                # text response (None, or NaN if the frame was reloaded from CSV).
                other_models = [
                    m for m in self.config.model_names
                    if m != judge_model and isinstance(prompt_responses.get(m), str)
                ]

                if self.config.use_subset_evaluation:
                    sample_size = min(self.config.evaluators_subset_size, len(other_models))
                    # Guard against a non-positive subset size, which would
                    # make random.sample() raise.
                    other_models = random.sample(other_models, max(sample_size, 0))

                if not other_models:
                    continue

                # Build the evaluation prompt
                eval_prompt, model_mapping = self._create_evaluation_prompt(
                    prompt,
                    {m: prompt_responses[m] for m in other_models}
                )

                try:
                    raw_judgment = llm_module.get_model(judge_model).prompt(eval_prompt).text()
                    parsed_judgments = self._parse_evaluation(raw_judgment, model_mapping)

                    # Record judgments
                    for rated_model, score in parsed_judgments.items():
                        evaluations.append({
                            'prompt': prompt,
                            'judge_model': judge_model,
                            'rated_model': rated_model,
                            'score': score
                        })
                        # Update graph; ranks are inverted so "best" carries
                        # the most endorsement.
                        weight = self._endorsement_weight(score, len(model_mapping))
                        if G.has_edge(judge_model, rated_model):
                            G[judge_model][rated_model]['weight'] += weight
                        else:
                            G.add_edge(judge_model, rated_model, weight=weight)

                except Exception as e:
                    # A failing judge is logged and skipped; other judges continue.
                    logger.error(f"Error during evaluation by {judge_model}: {str(e)}")

                # Same optional rate-limit delay as in collect_responses.
                if self.config.request_delay > 0.0:
                    time.sleep(self.config.request_delay)

        return G, pd.DataFrame(evaluations)

    def run(self, prompts: List[str], llm_module) -> Dict:
        """
        Run the full SlopRank evaluation:
         - Collect responses
         - Evaluate them
         - Compute PageRank
         - Save results

        Files written to ``config.output_dir``: ``responses.csv``,
        ``evaluations.csv``, ``endorsement_graph.gml`` and ``rankings.json``.

        Returns:
            Dict with ``rankings`` (list of ``(model, pagerank_score)``
            sorted best first) and ``metadata``; an empty dict if no
            responses, evaluations or graph edges were produced.
        """
        # 1. Collect responses
        self.collect_responses(prompts, llm_module)
        if self.responses_df is None or self.responses_df.empty:
            logger.error("No responses collected, aborting.")
            return {}

        # 2. Save responses
        resp_path = self.config.output_dir / "responses.csv"
        self.responses_df.to_csv(resp_path, index=False)
        logger.info(f"Saved responses to {resp_path}")

        # 3. Evaluate
        G, self.evaluations_df = self.evaluate_responses(self.responses_df, llm_module)
        if self.evaluations_df.empty:
            logger.error("No evaluations produced, aborting.")
            return {}

        # 4. Save evaluations
        eval_path = self.config.output_dir / "evaluations.csv"
        self.evaluations_df.to_csv(eval_path, index=False)
        logger.info(f"Saved evaluations to {eval_path}")

        # 5. Compute PageRank
        if G.number_of_edges() == 0:
            logger.error("No valid edges to compute PageRank, aborting.")
            return {}

        pagerank_scores = nx.pagerank(G, weight="weight")
        ranked_models = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)

        # 6. Save graph
        gml_path = self.config.output_dir / "endorsement_graph.gml"
        nx.write_gml(G, gml_path)
        logger.info(f"Saved graph to {gml_path}")

        # 7. Save final results
        results = {
            "rankings": ranked_models,
            "metadata": {
                "evaluation_method": self.config.evaluation_method,
                "use_subset_evaluation": self.config.use_subset_evaluation,
                "evaluators_subset_size": self.config.evaluators_subset_size,
                "timestamp": datetime.now().isoformat()
            }
        }
        rankings_path = self.config.output_dir / "rankings.json"
        # Explicit encoding so the output does not depend on the platform default.
        with open(rankings_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)
        logger.info(f"Saved rankings to {rankings_path}")

        return results

def main():
    """
    Command-line entry point.

    Loads environment variables via dotenv, reads prompts from the
    "Questions" column of ``prompts.csv``, runs SlopRank with
    ``DEFAULT_CONFIG`` and prints the resulting rankings. Any exception is
    logged (with traceback) instead of being propagated.
    """
    import llm  # Your LLM module import
    from dotenv import load_dotenv
    load_dotenv()

    slop_rank_logger = logging.getLogger("SlopRankLogger")
    slop_rank_logger.info("Starting SlopRank evaluation")
    start_time = time.time()

    try:
        # Read prompts
        slop_rank_logger.info("Reading prompts from prompts.csv")
        prompts_df = pd.read_csv("prompts.csv")
        # Blank rows are read as NaN; drop them so they are not sent to the
        # models as prompts.
        prompts = prompts_df["Questions"].dropna().tolist()
        slop_rank_logger.info(f"Loaded {len(prompts)} prompts")

        # Initialize evaluation
        config = DEFAULT_CONFIG
        slop_rank_logger.info(f"Using configuration: {config}")
        evaluator = SlopRank(config)

        # Run evaluation
        results = evaluator.run(prompts, llm)

        if results:
            print("\n=== Model Rankings ===")
            max_score = max(score for _, score in results["rankings"])
            for model, score in results["rankings"]:
                normalized_score = (score / max_score) * 10  # Normalize to 0-10
                print(f"{model:30} {score:.4f} (normalized: {normalized_score:.2f})")

            total_time = time.time() - start_time
            slop_rank_logger.info(f"Evaluation completed in {total_time:.2f}s")
        else:
            slop_rank_logger.error("No results generated")
    except Exception as e:
        # Top-level boundary: keep the same message but include the
        # traceback so failures can actually be diagnosed.
        slop_rank_logger.exception(f"Fatal error in main: {str(e)}")

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
