# Software mapping

In [67]:
import openai
import anthropic
import json
import time
import configparser
import tiktoken
from typing import List, Dict, Tuple, Optional
from collections import Counter
import numpy as np
from dataclasses import dataclass, asdict
import google.generativeai as genai

In [68]:
# ============================================================================
# CONFIGURATION
# ============================================================================

def initialize_openai():
    """Create the OpenAI client described by ``config_LLM.txt``.

    Reads the ``[LLM]`` section of the config file and returns a
    ``(client, model_type)`` tuple, where ``model_type`` is the value stored
    under ``MODEL_TYPE_adv`` (``None`` when that option is absent).
    """
    parser = configparser.ConfigParser()
    parser.read('config_LLM.txt')
    llm_section = parser['LLM']

    key = llm_section.get('OPENAI_API_KEY')
    configured_model = llm_section.get('MODEL_TYPE_adv')
    return openai.OpenAI(api_key=key), configured_model

def initialize_anthropic():
    """Create the Anthropic client described by ``config_LLM.txt``.

    Returns ``None`` when the ``[LLM]`` section has no (or an empty)
    ``ANTHROPIC_API_KEY`` entry, otherwise an ``anthropic.Anthropic`` client.
    """
    parser = configparser.ConfigParser()
    parser.read('config_LLM.txt')
    key = parser['LLM'].get('ANTHROPIC_API_KEY')

    # Without a key there is nothing to connect with
    if not key:
        return None
    return anthropic.Anthropic(api_key=key)

def initialize_google():
    """Configure the Google Gemini SDK from ``config_LLM.txt``.

    Returns ``True`` after calling ``genai.configure`` when the ``[LLM]``
    section provides a non-empty ``GOOGLE_API_KEY``; returns ``None``
    otherwise.
    """
    parser = configparser.ConfigParser()
    parser.read('config_LLM.txt')
    key = parser['LLM'].get('GOOGLE_API_KEY')

    if not key:
        return None

    genai.configure(api_key=key)
    return True

In [69]:

# ============================================================================
# TOKEN COUNTING UTILITIES
# ============================================================================

def num_tokens_from_string(string: str, model_name: str) -> int:
    """
    Count (or estimate) the tokens in ``string`` for ``model_name``.

    tiktoken's own model lookup is tried first. When it raises ``KeyError``
    the count falls back, by model-name prefix, to a named tiktoken encoding
    (``gpt-5*`` and ``gpt-4*``) or to a characters-per-token approximation
    (``claude*`` and everything else).
    """
    try:
        return len(tiktoken.encoding_for_model(model_name).encode(string))
    except KeyError:
        # Model unknown to tiktoken: use the prefix-based fallbacks below
        pass

    # Prefix -> tiktoken encoding used when the model lookup failed
    fallback_encodings = (
        ('gpt-5', "o200k_base"),
        ('gpt-4', "cl100k_base"),
    )
    for prefix, encoding_name in fallback_encodings:
        if model_name.startswith(prefix):
            return len(tiktoken.get_encoding(encoding_name).encode(string))

    if model_name.startswith('claude'):
        # Approximation for Claude: roughly 3.5 characters per token
        return int(len(string) / 3.5)

    # Generic approximation: 4 characters per token
    return len(string) // 4

def count_tokens_in_messages(messages: List[Dict], model: str) -> int:
    """Estimate the token count of a chat-style message list.

    Each message contributes a fixed overhead of 4 tokens plus the token
    count of its ``content`` when that content is a string; 3 more tokens
    are added once for prompt formatting.
    """
    running_total = 3  # one-off overhead for prompt formatting
    for entry in messages:
        running_total += 4  # approximate per-message structure overhead (role, etc.)
        body = entry.get('content')
        if isinstance(body, str):
            running_total += num_tokens_from_string(body, model)
    return running_total




In [70]:


# ============================================================================
# CREDIT TRACKING SYSTEM
# ============================================================================

class CreditTracker:
    """
    Enhanced credit tracker supporting multiple LLM providers with their specific pricing.
    Updated with October 2025 pricing.

    Token counts and cost are accumulated both globally and per model name.
    ``cached_tokens`` is treated as the part of ``input_tokens`` that was
    served from the provider's prompt cache, so those tokens are billed once,
    at the cached rate, rather than at both the regular and the cached rate.
    """

    # Pricing per 1M tokens (as of October 2025)
    PRICING = {
        # OpenAI models
        'gpt-5': {'input': 0.625, 'output': 5.00, 'cached_input': 0.0625},
        'gpt-5-mini': {'input': 0.125, 'output': 1.00, 'cached_input': 0.0125},
        'gpt-5-nano': {'input': 0.025, 'output': 0.20, 'cached_input': 0.0025},
        'gpt-4o': {'input': 1.25, 'output': 5.00},
        'gpt-4o-mini': {'input': 0.075, 'output': 0.30},
        'gpt-4.1': {'input': 1.00, 'output': 4.00},
        'gpt-4.1-mini': {'input': 0.20, 'output': 0.80},
        'o1': {'input': 7.50, 'output': 30.00},
        'o1-mini': {'input': 0.55, 'output': 2.20},

        # Claude models (Anthropic)
        'claude-3-haiku': {'input': 0.25, 'output': 1.25},
        'claude-3.5-haiku': {'input': 0.25, 'output': 1.25},
        'claude-3.5-sonnet': {'input': 3.00, 'output': 15.00},
        'claude-sonnet-4-20250514': {'input': 3.00, 'output': 15.00},
        'claude-3-opus': {'input': 15.00, 'output': 75.00},

        # Google Gemini models
        'gemini-1.5-pro': {'input': 1.25, 'output': 5.00},
        'gemini-1.5-flash': {'input': 0.075, 'output': 0.30},
        'gemini-2.0-flash-exp': {'input': 0.0, 'output': 0.0},
    }

    def __init__(self):
        """Start with all counters and costs at zero."""
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.total_cached_tokens = 0
        self.total_cost = 0
        self.model_usage = {}  # Track usage per model
        self.call_count = 0

    def update(self,
               model: str,
               input_tokens: int,
               output_tokens: int,
               cached_tokens: int = 0):
        """
        Update tracker with token usage for a specific model.

        Args:
            model: Model name (e.g., 'gpt-4o', 'claude-3.5-sonnet'). A leading
                'models/' prefix is ignored when looking up the price, but the
                name is recorded as given in the per-model breakdown.
            input_tokens: Number of input tokens, including any cached ones
            output_tokens: Number of output tokens
            cached_tokens: Number of input tokens served from cache (if applicable)
        """
        # Update totals
        self.total_input_tokens += input_tokens
        self.total_output_tokens += output_tokens
        self.total_cached_tokens += cached_tokens
        self.call_count += 1

        # Gemini callers pass names like 'models/gemini-1.5-pro', whereas the
        # pricing table is keyed by the bare model name.
        prefix = 'models/'
        pricing_key = model[len(prefix):] if model.startswith(prefix) else model

        # NOTE(review): the fallback rates look like per-1K-token figures while
        # the table above is per 1M tokens - confirm the intended unit.
        pricing = self.PRICING.get(pricing_key, {'input': 0.00015, 'output': 0.0006})  # Default fallback

        # Cached tokens are a subset of the input tokens: bill only the
        # remainder at the regular input rate so they are not charged twice.
        uncached_tokens = max(input_tokens - cached_tokens, 0)
        input_cost = (uncached_tokens / 1_000_000) * pricing['input']
        output_cost = (output_tokens / 1_000_000) * pricing['output']
        # Models without a dedicated cached rate bill cached tokens as regular input
        cached_cost = (cached_tokens / 1_000_000) * pricing.get('cached_input', pricing['input'])

        call_cost = input_cost + output_cost + cached_cost
        self.total_cost += call_cost

        # Track per-model usage
        usage = self.model_usage.setdefault(model, {
            'calls': 0,
            'input_tokens': 0,
            'output_tokens': 0,
            'cached_tokens': 0,
            'cost': 0
        })
        usage['calls'] += 1
        usage['input_tokens'] += input_tokens
        usage['output_tokens'] += output_tokens
        usage['cached_tokens'] += cached_tokens
        usage['cost'] += call_cost

    def get_stats(self):
        """Get comprehensive statistics about API usage and costs.

        Returns:
            A dict with overall call/token/cost totals (costs rounded to four
            decimals) and a ``model_breakdown`` dict keyed by model name.
        """
        return {
            "total_calls": self.call_count,
            "total_input_tokens": self.total_input_tokens,
            "total_output_tokens": self.total_output_tokens,
            "total_cached_tokens": self.total_cached_tokens,
            "total_tokens": self.total_input_tokens + self.total_output_tokens,
            "total_cost": round(self.total_cost, 4),
            # max(..., 1) avoids a division by zero before the first call
            "average_cost_per_call": round(self.total_cost / max(self.call_count, 1), 4),
            "model_breakdown": {
                model: {
                    'calls': stats['calls'],
                    'total_tokens': stats['input_tokens'] + stats['output_tokens'],
                    'cost': round(stats['cost'], 4)
                }
                for model, stats in self.model_usage.items()
            }
        }

    def print_summary(self):
        """Print a formatted summary of usage"""
        stats = self.get_stats()
        print("\n" + "="*60)
        print("API USAGE SUMMARY")
        print("="*60)
        print(f"Total API Calls: {stats['total_calls']}")
        print(f"Total Tokens: {stats['total_tokens']:,}")
        print(f"  - Input: {stats['total_input_tokens']:,}")
        print(f"  - Output: {stats['total_output_tokens']:,}")
        print(f"  - Cached: {stats['total_cached_tokens']:,}")
        print(f"\nTotal Cost: ${stats['total_cost']:.4f}")
        print(f"Average Cost per Call: ${stats['average_cost_per_call']:.4f}")

        if self.model_usage:
            print("\nBreakdown by Model:")
            print("-" * 60)
            for model, breakdown in stats['model_breakdown'].items():
                print(f"  {model}:")
                print(f"    Calls: {breakdown['calls']}")
                print(f"    Tokens: {breakdown['total_tokens']:,}")
                print(f"    Cost: ${breakdown['cost']:.4f}")
        print("="*60)







## The method assessor

In [71]:
# ============================================================================
# ASSESSMENT DATA STRUCTURES
# ============================================================================

@dataclass
class AssessmentResult:
    """Store assessment results from a single LLM.

    One instance describes what one provider/model answered for one
    software-method pair, together with the token usage of that call.
    """
    software: str  # name of the assessed software
    method: str  # method whose support was assessed
    rank: int  # support rank assigned by the LLM
    reasoning: str  # explanation returned by the LLM (or an error message)
    sources: List[str]  # references listed by the LLM
    llm_provider: str  # provider/model label of the LLM that produced the result
    input_tokens: int = 0  # input tokens consumed by the call (0 if unknown)
    output_tokens: int = 0  # output tokens produced by the call (0 if unknown)

@dataclass
class ConsensusResult:
    """Store consensus results across multiple LLMs.

    The ``individual_*`` dictionaries are keyed by the provider label of the
    LLM that contributed the value.
    """
    software: str  # name of the assessed software
    method: str  # method whose support was assessed
    final_rank: int  # consensus rank across the individual assessments
    confidence: float  # confidence score for the consensus
    individual_ranks: Dict[str, int]  # rank per provider
    individual_reasoning: Dict[str, str]  # reasoning text per provider
    individual_sources: Dict[str, List[str]]  # source list per provider
    agreement_level: str  # textual agreement label
    total_tokens: int = 0  # tokens used across all contributing calls
    total_cost: float = 0.0  # estimated cost across all contributing calls


In [None]:
# ============================================================================
# SOFTWARE METHOD ASSESSOR WITH CREDIT TRACKING
# ============================================================================
class SoftwareMethodAssessor:
    """
    Assess software implementation support for specific methods using multiple LLM APIs.
    Includes comprehensive credit tracking across all providers.

    Ranking scale: 
    0=no support, 
    1=limited possibility, 
    2=via APIs/extensions, 
    3=directly implemented
    """

    def __init__(self, 
                 openai_api_key: str = None, 
                 anthropic_api_key: str = None,
                 google_api_key: str = None,
                 use_config: bool = True):
        """
        Set up the provider clients, the credit tracker and the system prompt.

        Args:
            openai_api_key: OpenAI API key (only used when use_config=False)
            anthropic_api_key: Anthropic API key (only used when use_config=False)
            google_api_key: Google API key (only used when use_config=False)
            use_config: If True, load the clients from the config_LLM.txt file
                via the initialize_* helpers and ignore the key arguments
        """
        if use_config:
            # Clients (and the OpenAI model name) come from config_LLM.txt
            self.openai_client, self.default_model = initialize_openai()
            self.anthropic_client = initialize_anthropic()
            self.google_enabled = initialize_google()
        else:
            # Build each client only when its key was supplied; missing
            # providers are left as None / False.
            self.openai_client = openai.OpenAI(api_key=openai_api_key) if openai_api_key else None
            self.anthropic_client = anthropic.Anthropic(api_key=anthropic_api_key) if anthropic_api_key else None
            self.default_model = "gpt-4o"
            if google_api_key:
                genai.configure(api_key=google_api_key)
                self.google_enabled = True
            else:
                self.google_enabled = False
        # NOTE(review): this unconditional assignment replaces the model name
        # loaded from the config file above, so MODEL_TYPE_adv has no effect -
        # confirm whether that override is intended.
        self.default_model = "gpt-4o"

        # Initialize credit tracker
        self.credit_tracker = CreditTracker()

        # System prompt for assessment: defines the 0-3 ranking scale, demands
        # sources, and fixes the JSON structure the assess_* methods parse.


        self.system_prompt = """You are a technical software assessment expert specialized in power systems analysis software. Your task is to evaluate the implementation support level for specific methods.

Use this ranking scale:
0 = No support (method cannot be implemented at all)
1 = Limited possibility for implementation or extension (requires significant workarounds)
2 = Indirectly supported through APIs or extensions (requires external tools/plugins)
3 = Directly implemented (native feature in the software)

CRITICAL: You MUST search for and provide actual references. Your assessment must be based on real, verifiable sources.

For each assessment:
1. Search for scientific papers demonstrating implementation (IEEE Xplore, ScienceDirect, arXiv, Google Scholar)
2. Find official documentation from the software vendor or project website
3. Look for GitHub repositories with code examples or open-source implementations
4. Check API documentation or extension/plugin capabilities
5. Review user forums, technical blogs, Stack Overflow, or case studies

IMPORTANT: The "sources" field is MANDATORY and must contain:
- Full URLs to documentation pages
- Paper titles with DOIs or direct links
- GitHub repository URLs with specific file paths if relevant
- Forum discussion URLs
- Technical blog post URLs

If you cannot find ANY sources, you must state this explicitly in the reasoning and set rank to 0.

Return your response in VALID JSON format with this exact structure:
{
    "rank": <0-3>,
    "reasoning": "<detailed explanation citing specific sources by number, e.g., 'According to [1], PSS/E supports...''>",
    "sources": [
        "https://example.com/documentation - Official PSS/E Manual on OPF",
        "https://doi.org/10.1109/... - Paper title by Author et al.",
        "https://github.com/org/repo/file.py - Implementation example"
    ]
}

Each source must include both the URL and a brief description separated by ' - '.
Minimum 2 sources required for ranks 2-3, minimum 1 source for rank 1."""

# Assessing functions
    def create_assessment_prompt(self, software: str, method: str) -> str:
        """Build the user prompt asking for an assessment of one software-method pair.

        The software and method names are substituted into a fixed template
        that lists the kinds of sources to search and the expected content of
        the answer.
        """
        # The template is assembled line by line; the lines are joined with
        # newlines, so every entry below is exactly one line of the prompt.
        prompt_lines = [
            'Assess the implementation support for:',
            '',
            f'    Software: {software}',
            f'    Method: {method}',
            '',
            '    You MUST search comprehensively and provide actual URLs/references for:',
            '',
            # This line intentionally ends with a single trailing space
            '    1. Scientific papers (IEEE Xplore, ScienceDirect, Google Scholar, arXiv) ',
            '    Example: "https://doi.org/10.1109/TPWRS.2020.1234567 - Optimal Power Flow Implementation in PSS/E by Smith et al."',
            '',
            f'    2. Official {software} documentation (user manuals, technical references, API docs)',
            '    Example: "https://www.siemens-energy.com/psse/docs/opf-manual.pdf - PSS/E OPF User Manual Chapter 5"',
            '',
            '    3. GitHub repositories with code examples',
            f'    Example: "https://github.com/username/project/blob/main/opf.py - OPF implementation using {software} API"',
            '',
            '    4. Technical forums (Stack Overflow, vendor forums, Reddit)',
            f'    Example: "https://stackoverflow.com/questions/12345678 - Discussion on {method} in {software}"',
            '',
            '    5. Technical blogs, white papers, or case studies',
            '    Example: "https://blog.example.com/psse-opf-tutorial - Tutorial on OPF implementation"',
            '',
            '    Based on your findings, determine the implementation rank (0-3) with:',
            '    - Detailed reasoning that cites sources by number [1], [2], etc.',
            '    - Complete list of sources with URLs and descriptions',
            '    - Specific examples from the sources',
            '    - Version information if available',
            '',
            '    REMEMBER: The sources list is mandatory. If you cannot find sources, state this clearly and rank as 0.',
        ]
        return '\n'.join(prompt_lines)

    def assess_with_openai(self,
                          software: str,
                          method: str,
                          model: str = None,
                          debug: bool = False) -> AssessmentResult:
        """
        Assess using OpenAI's model with credit tracking

        The model is asked for a JSON object; its token usage is recorded in
        the credit tracker and a rank above 0 without any sources is lowered
        to 0. Any exception raised while calling the API or parsing the answer
        is printed and turned into a rank-0 result whose reasoning carries the
        error message.

        Args:
            software: Software name
            method: Method to assess
            model: OpenAI model to use (defaults to self.default_model)
            debug: If True, print a warning when a rank is lowered for lack
                of sources

        Returns:
            AssessmentResult for this software-method pair
        """
        if model is None:
            model = self.default_model

        try:
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": self.create_assessment_prompt(software, method)}
            ]

            response = self.openai_client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.3,
                response_format={"type": "json_object"}
            )
            usage = response.usage

            # The cached-token detail is optional and may be missing or None
            details = getattr(usage, 'prompt_tokens_details', None)
            cached_tokens = (getattr(details, 'cached_tokens', 0) or 0) if details else 0

            # Track actual usage as reported by the API
            self.credit_tracker.update(
                model=model,
                input_tokens=usage.prompt_tokens,
                output_tokens=usage.completion_tokens,
                cached_tokens=cached_tokens
            )

            result = json.loads(response.choices[0].message.content)

            rank = int(result.get("rank", 0))
            # "or" also covers an explicit JSON null for these fields
            sources = result.get("sources") or []
            reasoning = result.get("reasoning") or ""

            # A positive rank must be backed by at least one source
            if rank > 0 and not sources:
                if debug:
                    print(f"WARNING: Rank {rank} assigned but no sources provided. Lowering rank to 0.")
                rank = 0
                reasoning += " [Note: Rank lowered to 0 due to lack of verifiable sources]"

            return AssessmentResult(
                software=software,
                method=method,
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"openai_{model}",
                input_tokens=usage.prompt_tokens,
                output_tokens=usage.completion_tokens
            )

        except Exception as e:
            # Best effort: one failed call must not abort a whole batch run
            print(f"Error with OpenAI assessment: {e}")
            return AssessmentResult(
                software=software,
                method=method,
                rank=0,
                reasoning=f"Error: {str(e)}",
                sources=[],
                llm_provider=f"openai_{model}"
            )

    def assess_with_claude(self,
                          software: str,
                          method: str,
                          model: str = "claude-sonnet-4-20250514",
                          debug: bool = False) -> AssessmentResult:
        """
        Assess using Anthropic's Claude with credit tracking

        The answer text is stripped of an optional Markdown code fence and
        parsed as JSON (with a regex fallback for a flat object containing
        "rank"). Token usage is recorded in the credit tracker and a rank
        above 0 without any sources is lowered to 0. Any exception raised
        while calling the API or parsing the answer is printed and turned into
        a rank-0 result whose reasoning carries the error message.

        Args:
            software: Software name
            method: Method to assess
            model: Claude model to use
            debug: If True, print warnings about missing or very short sources

        Returns:
            AssessmentResult for this software-method pair

        Raises:
            ValueError: If no Anthropic client is configured
        """
        if not self.anthropic_client:
            raise ValueError("Anthropic API key not provided")

        try:
            prompt = self.create_assessment_prompt(software, method)

            response = self.anthropic_client.messages.create(
                model=model,
                max_tokens=4096,
                temperature=0.3,
                system=self.system_prompt,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            # Track usage
            self.credit_tracker.update(
                model=model,
                input_tokens=response.usage.input_tokens,
                output_tokens=response.usage.output_tokens
            )

            # Extract JSON from response
            content = response.content[0].text.strip()
            if content.startswith('```'):
                # Drop the opening fence line and everything from the closing
                # fence onwards (keep all remaining lines if it never closes).
                fenced_lines = content.split('\n')[1:]
                closing = next(
                    (i for i, line in enumerate(fenced_lines) if line.strip().startswith('```')),
                    len(fenced_lines)
                )
                content = '\n'.join(fenced_lines[:closing])

            # Try to parse JSON with fallback
            try:
                result = json.loads(content)
            except json.JSONDecodeError:
                import re
                # Fallback: first brace-free {...} span that mentions "rank"
                json_match = re.search(r'\{[^{}]*"rank"[^{}]*\}', content, re.DOTALL)
                if json_match:
                    result = json.loads(json_match.group())
                else:
                    raise ValueError(f"Could not parse JSON from response: {content[:200]}")

            rank = int(result.get("rank", 0))
            # "or" also covers an explicit JSON null for these fields
            sources = result.get("sources") or []
            reasoning = result.get("reasoning") or ""

            # Validate that sources are provided for non-zero ranks
            if rank > 0 and not sources:
                if debug:
                    print(f"WARNING: Rank {rank} assigned but no sources provided. Lowering rank to 0.")
                rank = 0
                reasoning += " [Note: Rank lowered to 0 due to lack of verifiable sources]"

            # Warn if sources look generic or empty
            if debug and sources:
                # str() keeps the check from failing on non-string entries
                empty_sources = [s for s in sources if len(str(s).strip()) < 10]
                if empty_sources:
                    print(f"WARNING: {len(empty_sources)} sources appear to be empty or too short")

            return AssessmentResult(
                software=software,
                method=method,
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"claude_{model}",
                input_tokens=response.usage.input_tokens,
                output_tokens=response.usage.output_tokens
            )

        except Exception as e:
            # Best effort: one failed call must not abort a whole batch run
            print(f"Error with Claude assessment: {e}")
            return AssessmentResult(
                software=software,
                method=method,
                rank=0,
                reasoning=f"Error: {str(e)}",
                sources=[],
                llm_provider=f"claude_{model}"
            )

    def assess_with_google(self,
                        software: str,
                        method: str,
                        model: str = "models/gemini-2.5-flash",
                        debug: bool = False) -> AssessmentResult:
        """
        Assess using Google Gemini with credit tracking

        The answer text is stripped of an optional Markdown code fence and
        parsed as JSON (with a regex fallback for a flat object containing
        "rank"). Token usage is recorded in the credit tracker and a rank
        above 0 without any sources is lowered to 0. Any exception raised
        while calling the API or parsing the answer is printed (with
        traceback) and turned into a rank-0 result whose reasoning carries the
        error message.

        Args:
            software: Software name
            method: Method to assess
            model: Gemini model to use (default: models/gemini-2.5-flash);
                a missing 'models/' prefix is added automatically
            debug: If True, print detailed debugging information

        Returns:
            AssessmentResult for this software-method pair

        Raises:
            ValueError: If Google access is not configured
        """
        if not self.google_enabled:
            raise ValueError("Google API key not provided")

        try:
            prompt = self.create_assessment_prompt(software, method)

            if debug:
                print(f"\n{'='*60}")
                print(f"DEBUG: Google Gemini Request for {software} - {method}")
                print(f"{'='*60}")
                print(f"Model: {model}")

            # Ensure model has correct prefix
            if not model.startswith('models/'):
                model = f"models/{model}"

            # Same settings for the model default and for the request itself
            generation_config = {
                "temperature": 0.3,
                "max_output_tokens": 2048,
            }

            # Create the model
            gemini_model = genai.GenerativeModel(
                model_name=model,
                generation_config=dict(generation_config),
                system_instruction=self.system_prompt
            )

            # Generate response
            response = gemini_model.generate_content(
                prompt,
                generation_config=dict(generation_config)
            )

            if debug:
                print(f"DEBUG: Successfully used model: {model}")

            # Extract token counts
            try:
                input_tokens = response.usage_metadata.prompt_token_count
                output_tokens = response.usage_metadata.candidates_token_count
            except AttributeError:
                # Fallback if usage_metadata is not available: rough estimate
                # of 1.3 tokens per whitespace-separated word
                input_tokens = int(len(prompt.split()) * 1.3)
                output_tokens = int(len(response.text.split()) * 1.3)
                if debug:
                    print("DEBUG: Token usage not available, using approximation")

            if debug:
                print(f"DEBUG: Tokens - Input: {input_tokens}, Output: {output_tokens}")

            # Track usage
            self.credit_tracker.update(
                model=model,
                input_tokens=int(input_tokens),
                output_tokens=int(output_tokens)
            )

            content = response.text

            if debug:
                print("\nDEBUG: Raw Gemini Response:")
                print(f"{'-'*60}")
                print(content[:500])  # Print first 500 chars
                print(f"{'-'*60}")

            # Clean and parse JSON
            content = content.strip()
            if content.startswith('```'):
                if debug:
                    print("\nDEBUG: Detected markdown code block, cleaning...")
                # Drop the opening fence line and everything from the closing
                # fence onwards (keep all remaining lines if it never closes).
                fenced_lines = content.split('\n')[1:]
                closing = next(
                    (i for i, line in enumerate(fenced_lines) if line.strip().startswith('```')),
                    len(fenced_lines)
                )
                content = '\n'.join(fenced_lines[:closing])

            # Try to parse JSON
            try:
                result = json.loads(content)
                if debug:
                    print("\nDEBUG: Successfully parsed JSON")
                    print(f"DEBUG: Rank: {result.get('rank', 'NOT FOUND')}")
            except json.JSONDecodeError as je:
                if debug:
                    print(f"\nDEBUG: JSON parsing failed: {je}")
                    print("DEBUG: Attempting regex extraction...")

                import re
                # Fallback: first brace-free {...} span that mentions "rank"
                json_match = re.search(r'\{[^{}]*"rank"[^{}]*\}', content, re.DOTALL)
                if json_match:
                    result = json.loads(json_match.group())
                else:
                    raise ValueError(f"Could not parse JSON from response: {content[:200]}")

            # Validate sources
            rank = int(result.get("rank", 0))
            # "or" also covers an explicit JSON null for these fields
            sources = result.get("sources") or []
            reasoning = result.get("reasoning") or ""

            if rank > 0 and not sources:
                if debug:
                    print(f"WARNING: Rank {rank} assigned but no sources provided. Lowering rank to 0.")
                rank = 0
                reasoning += " [Note: Rank lowered to 0 due to lack of verifiable sources]"

            if debug:
                print(f"\nDEBUG: Final rank: {rank}")
                print(f"DEBUG: Number of sources: {len(sources)}")
                print(f"{'='*60}\n")

            return AssessmentResult(
                software=software,
                method=method,
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"google_{model.replace('models/', '')}",
                input_tokens=int(input_tokens),
                output_tokens=int(output_tokens)
            )

        except Exception as e:
            # Best effort: one failed call must not abort a whole batch run
            print(f"Error with Google assessment: {e}")
            import traceback
            traceback.print_exc()
            return AssessmentResult(
                software=software,
                method=method,
                rank=0,
                reasoning=f"Error: {str(e)}",
                sources=[],
                llm_provider=f"google_{model.replace('models/', '')}"
            )

# Compare the results returned by the different providers within the same run
    def calculate_confidence(self, ranks: List[int]) -> Tuple[float, str]:
        """
        Calculate confidence based on agreement between LLM assessments

        Agreement is the share of assessments that gave the most common rank.
        The confidence multiplies that share by ``1 - variance / 4``, so
        widely spread ranks lower the score.

        Agreement levels:
            perfect_agreement  - all assessments gave the same rank
            strong_agreement   - at least two thirds agree
            moderate_agreement - at least half agree
            low_agreement      - anything below that
            single_assessment  - fewer than two ranks were supplied

        Args:
            ranks: Ranks assigned by the individual LLMs

        Returns:
            confidence score (0-1) and agreement level description
        """
        if len(ranks) < 2:
            return 0.5, "single_assessment"

        top_votes = Counter(ranks).most_common(1)[0][1]
        total_assessments = len(ranks)
        agreement = top_votes / total_assessments

        # Combined confidence metric; float() turns the NumPy scalar produced
        # by np.var into a plain Python float.
        confidence = float(agreement * (1 - np.var(ranks) / 4))

        # Integer comparisons avoid rounding issues: exactly 2 of 3 agreeing
        # is 0.666..., which a ">= 0.67" test would not count as two thirds.
        if top_votes == total_assessments:
            level = "perfect_agreement"
        elif 3 * top_votes >= 2 * total_assessments:
            level = "strong_agreement"
        elif 2 * top_votes >= total_assessments:
            level = "moderate_agreement"
        else:
            level = "low_agreement"

        return confidence, level

    def assess_software_method(self,
                               software: str,
                               method: str,
                               use_openai: bool = True,
                               use_claude: bool = True,
                               use_google: bool = True,
                               openai_model: str = None,
                               claude_model: str = "claude-sonnet-4-20250514",
                               google_model: str = "models/gemini-2.5-flash") -> ConsensusResult:
        """
        Assess a single software-method combination using multiple LLMs

        Every provider that is both requested and configured is queried in
        turn (with a one-second pause after each call). The consensus rank is
        the most common individual rank.

        Args:
            software: Software name
            method: Method to assess
            use_openai: Query OpenAI if a client is configured
            use_claude: Query Claude if a client is configured
            use_google: Query Gemini if Google access is configured
            openai_model: OpenAI model (None uses the assessor's default)
            claude_model: Claude model name
            google_model: Gemini model name

        Returns:
            ConsensusResult combining the individual assessments. When no
            provider could be queried, a result with rank 0, confidence 0.0
            and agreement level "no_assessment" is returned.
        """
        results = []

        if use_openai and self.openai_client:
            print(f"Assessing {software} - {method} with OpenAI...")
            results.append(self.assess_with_openai(software, method, openai_model))
            time.sleep(1)

        if use_claude and self.anthropic_client:
            print(f"Assessing {software} - {method} with Claude...")
            results.append(self.assess_with_claude(software, method, claude_model))
            time.sleep(1)

        if use_google and self.google_enabled:
            print(f"Assessing {software} - {method} with Google Gemini...")
            results.append(self.assess_with_google(software, method, google_model))
            time.sleep(1)

        if not results:
            # Nothing was queried: report that explicitly instead of failing
            # on the empty rank list further down.
            print(f"WARNING: No LLM provider available for {software} - {method}; no assessment performed.")
            return ConsensusResult(
                software=software,
                method=method,
                final_rank=0,
                confidence=0.0,
                individual_ranks={},
                individual_reasoning={},
                individual_sources={},
                agreement_level="no_assessment"
            )

        ranks = [r.rank for r in results]
        confidence, agreement_level = self.calculate_confidence(ranks)
        final_rank = Counter(ranks).most_common(1)[0][0]

        total_tokens = sum(r.input_tokens + r.output_tokens for r in results)

        # Calculate total cost from the per-1M-token pricing table; models
        # without a pricing entry contribute nothing.
        pricing_table = self.credit_tracker.PRICING
        total_cost = 0
        for r in results:
            # The provider label is '<provider>_<model>'; strip the provider part
            model_key = r.llm_provider.replace('openai_', '').replace('claude_', '').replace('google_', '')
            rates = pricing_table.get(model_key, {'input': 0, 'output': 0})
            total_cost += (rates['input'] * r.input_tokens / 1_000_000 +
                           rates['output'] * r.output_tokens / 1_000_000)

        return ConsensusResult(
            software=software,
            method=method,
            final_rank=final_rank,
            confidence=confidence,
            individual_ranks={r.llm_provider: r.rank for r in results},
            individual_reasoning={r.llm_provider: r.reasoning for r in results},
            individual_sources={r.llm_provider: r.sources for r in results},
            agreement_level=agreement_level,
            total_tokens=total_tokens,
            total_cost=total_cost
        )

    def assess_multiple(self,
                       software_list: List[str],
                       method_list: List[str],
                       use_openai: bool = True,
                       use_claude: bool = True,
                       use_google: bool = True,
                       openai_model: Optional[str] = None,
                       claude_model: str = "claude-sonnet-4-20250514",
                       google_model: str = "models/gemini-2.5-flash") -> List[ConsensusResult]:
        """
        Assess multiple software-method combinations with progress tracking.

        Every software in ``software_list`` is paired with every method in
        ``method_list``; each pair is handed to ``assess_software_method``.

        Args:
            software_list: Software names to assess.
            method_list: Method names to assess against each software.
            use_openai: Forwarded to ``assess_software_method``.
            use_claude: Forwarded to ``assess_software_method``.
            use_google: Forwarded to ``assess_software_method``.
            openai_model: OpenAI model name, forwarded unchanged (may be None).
            claude_model: Claude model name, forwarded unchanged.
            google_model: Google model name, forwarded unchanged.

        Returns:
            The values returned by ``assess_software_method``, one per
            combination, in iteration order (software outer, method inner).
        """
        results = []
        total = len(software_list) * len(method_list)
        current = 0

        print(f"\nStarting assessment of {total} combinations...")
        print(f"Using OpenAI: {use_openai}, Using Claude: {use_claude}, Using Google: {use_google}")
        print("-" * 60)

        for software in software_list:
            for method in method_list:
                current += 1
                print(f"\n[{current}/{total}] Processing: {software} - {method}")

                # use_google must be forwarded like the other provider switches;
                # otherwise the callee falls back to its own default and the
                # caller's choice is silently ignored.
                result = self.assess_software_method(
                    software=software,
                    method=method,
                    use_openai=use_openai,
                    use_claude=use_claude,
                    use_google=use_google,
                    openai_model=openai_model,
                    claude_model=claude_model,
                    google_model=google_model
                )
                results.append(result)

                # Show interim cost
                stats = self.credit_tracker.get_stats()
                print(f"  Rank: {result.final_rank}, Confidence: {result.confidence:.2%}")
                print(f"  Running cost: ${stats['total_cost']:.4f} ({stats['total_tokens']:,} tokens)")

                time.sleep(2)  # Rate limiting

        return results

    def print_detailed_results(self, results: List[ConsensusResult], min_confidence: float = 0.0):
        """
        Print detailed results, including per-provider reasoning and sources.

        Args:
            results: List of consensus results
            min_confidence: Minimum confidence threshold to display (0.0 to 1.0)

        Returns:
            None. This method only prints; aggregate statistics are produced
            by create_summary_report().
        """
        print("\n" + "="*80)
        print("DETAILED ASSESSMENT RESULTS")
        print("="*80)

        for result in results:
            # Only show results that reach the requested confidence threshold.
            if not result.confidence >= min_confidence:
                continue

            print(f"\n{'─'*80}")
            print(f"Software: {result.software}")
            print(f"Method: {result.method}")
            print(f"Final Rank: {result.final_rank}")
            print(f"Confidence: {result.confidence:.2%} ({result.agreement_level})")
            print(f"Cost: ${result.total_cost:.4f}")

            print(f"\n{'─'*40}")
            print("Individual Assessments:")
            print(f"{'─'*40}")

            for provider, rank in result.individual_ranks.items():
                print(f"\n{provider}:")
                print(f"  Rank: {rank}")

                # Print reasoning, truncated to the first 200 characters
                reasoning = result.individual_reasoning.get(provider, "N/A")
                if len(reasoning) > 200:
                    print(f"  Reasoning: {reasoning[:200]}...")
                else:
                    print(f"  Reasoning: {reasoning}")

                # Print sources as a numbered list (1-based)
                sources = result.individual_sources.get(provider, [])
                if sources:
                    print(f"  Sources ({len(sources)}):")
                    for i, source in enumerate(sources, 1):
                        print(f"    [{i}] {source}")
                else:
                    print("  Sources: None provided")

        print("\n" + "="*80)

    def create_sources_summary(self, results: List[ConsensusResult]) -> Dict:
        """
        Tally the sources cited across all assessments.

        Counts are kept overall and per provider. An assessment counts as
        "with sources" when at least one of its providers cited something.

        Args:
            results: Consensus results whose ``individual_sources`` maps a
                provider name to the sources that provider returned.

        Returns:
            Dict with the overall totals, the average number of sources per
            assessment (0 when ``results`` is empty) and a ``by_provider``
            breakdown.
        """
        per_provider = {}
        grand_total = 0
        with_any = 0
        without_any = 0

        for entry in results:
            cited_something = False

            for name, cited in entry.individual_sources.items():
                bucket = per_provider.setdefault(name, {
                    'total_sources': 0,
                    'assessments_with_sources': 0,
                    'assessments_without_sources': 0
                })

                # A falsy value (None, empty list) counts as "no sources".
                n_cited = len(cited) if cited else 0
                if n_cited > 0:
                    grand_total += n_cited
                    bucket['total_sources'] += n_cited
                    bucket['assessments_with_sources'] += 1
                    cited_something = True
                    continue

                bucket['assessments_without_sources'] += 1

            if cited_something:
                with_any += 1
            else:
                without_any += 1

        # max(..., 1) guards the division when there are no results at all.
        denominator = max(len(results), 1)
        return {
            'total_sources': grand_total,
            'assessments_with_sources': with_any,
            'assessments_without_sources': without_any,
            'average_sources_per_assessment': grand_total / denominator,
            'by_provider': per_provider
        }

# export and save
    def export_results(self, results: List[ConsensusResult], filename: str):
        """
        Write the results to ``filename`` as an indented JSON array.

        Args:
            results: Dataclass instances; each one is converted with
                ``dataclasses.asdict`` before serialisation.
            filename: Path of the JSON file to (over)write.
        """
        # Convert everything first so a conversion error cannot leave a
        # truncated file behind.
        payload = list(map(asdict, results))

        with open(filename, 'w') as handle:
            json.dump(payload, handle, indent=2)

        print(f"\nResults exported to {filename}")
# create the summary report for the results
    def create_summary_report(self, results: List[ConsensusResult]) -> Dict:
        """
        Create summary statistics from results.

        Args:
            results: Consensus results to summarise.

        Returns:
            Dict with the number of assessments, the confidence buckets
            (high >= 0.8, medium 0.5 to < 0.8, low < 0.5), the agreement-level
            and final-rank distributions, the average confidence, and the
            summed cost and token counts. For empty ``results`` the average
            confidence is 0.0.
        """
        confidences = [r.confidence for r in results]

        confidence_levels = {
            "high_confidence": sum(1 for c in confidences if c >= 0.8),
            "medium_confidence": sum(1 for c in confidences if 0.5 <= c < 0.8),
            "low_confidence": sum(1 for c in confidences if c < 0.5)
        }

        agreement_levels = Counter(r.agreement_level for r in results)
        rank_distribution = Counter(r.final_rank for r in results)

        # np.mean([]) warns and returns NaN, which json.dumps would emit as the
        # non-standard token NaN; report 0.0 for an empty result set instead.
        average_confidence = float(np.mean(confidences)) if confidences else 0.0

        return {
            "total_assessments": len(results),
            "confidence_distribution": confidence_levels,
            "agreement_distribution": dict(agreement_levels),
            "rank_distribution": dict(rank_distribution),
            "average_confidence": average_confidence,
            "total_cost": sum(r.total_cost for r in results),
            "total_tokens": sum(r.total_tokens for r in results)
        }

    def merge_assessment_results(self,
                                 *result_files: str,
                                 output_file: str = "merged_assessment.json",
                                 merge_strategy: str = "union",
                                 prefer_higher_confidence: bool = True) -> List[ConsensusResult]:
        """
        Merge multiple assessment result JSON files

        Entries are keyed by (software, method). After merging, final rank,
        confidence and agreement level are recalculated from the individual
        ranks, the merged data is written to ``output_file`` and returned as
        ConsensusResult objects.

        Args:
            *result_files: Variable number of JSON file paths to merge
            output_file: Output file for merged results
            merge_strategy: How to handle duplicates:
                - "union": Keep all assessments from all LLMs (default)
                - "replace": Later files replace earlier ones completely
                - "best": Keep only the highest confidence assessment
            prefer_higher_confidence: If True, prefer assessments with more LLM opinions
                (only used by the "best" strategy)

        Returns:
            List of merged ConsensusResult objects

        Raises:
            ValueError: If ``merge_strategy`` is not one of the values above.
        """
        # Validate before touching any file. The per-file handler below catches
        # Exception, so a ValueError raised inside the loop would be reported as
        # a load error and the invalid strategy silently ignored.
        if merge_strategy not in ("union", "replace", "best"):
            raise ValueError(f"Unknown merge strategy: {merge_strategy}")

        print(f"\n{'='*70}")
        print("MERGING ASSESSMENT RESULTS")
        print(f"{'='*70}")
        print(f"Strategy: {merge_strategy}")
        print(f"Input files: {len(result_files)}")

        # Dictionary to store merged results: (software, method) -> result data
        merged_data = {}

        # Track which LLMs assessed each combination: (software, method) -> set
        providers_by_key = {}

        # Load and process each file
        for file_idx, file_path in enumerate(result_files, 1):
            print(f"\nProcessing file {file_idx}/{len(result_files)}: {file_path}")

            # Best effort: a file that cannot be read or parsed is reported and
            # skipped so the remaining files are still merged.
            try:
                with open(file_path, 'r') as f:
                    results = json.load(f)

                print(f"  Loaded {len(results)} assessments")

                for result in results:
                    key = (result['software'], result['method'])
                    providers_by_key.setdefault(key, set()).update(result['individual_ranks'])

                    if key not in merged_data:
                        # First time seeing this combination
                        merged_data[key] = result
                    elif merge_strategy == "union":
                        merged_data[key] = self._merge_union(merged_data[key], result)
                    elif merge_strategy == "replace":
                        merged_data[key] = result  # Later file wins
                    else:
                        # "best" - the only remaining validated strategy
                        merged_data[key] = self._merge_best(merged_data[key], result, prefer_higher_confidence)

            except Exception as e:
                print(f"  ERROR loading {file_path}: {e}")
                continue

        # Print merge summary
        print(f"\n{'='*70}")
        print("MERGE SUMMARY")
        print(f"{'='*70}")
        print(f"Total unique combinations: {len(merged_data)}")

        # Show combinations by number of LLMs
        llm_count_distribution = Counter(len(providers) for providers in providers_by_key.values())

        print("\nAssessments by number of LLMs:")
        for num_llms in sorted(llm_count_distribution, reverse=True):
            print(f"  {num_llms} LLMs: {llm_count_distribution[num_llms]} combinations")

        # Convert back to list of results
        merged_results = list(merged_data.values())

        # Recalculate consensus for merged results
        print("\nRecalculating consensus for merged results...")
        merged_results = self._recalculate_consensus(merged_results)

        # Export merged results
        with open(output_file, 'w') as f:
            json.dump(merged_results, f, indent=2)

        print(f"\n✓ Merged results saved to: {output_file}")
        print(f"{'='*70}\n")

        # Convert to ConsensusResult objects
        return [
            ConsensusResult(
                software=result['software'],
                method=result['method'],
                final_rank=result['final_rank'],
                confidence=result['confidence'],
                individual_ranks=result['individual_ranks'],
                individual_reasoning=result['individual_reasoning'],
                individual_sources=result['individual_sources'],
                agreement_level=result['agreement_level'],
                total_tokens=result['total_tokens'],
                total_cost=result['total_cost']
            )
            for result in merged_results
        ]
    
    
    def _merge_union(self, existing: Dict, new: Dict) -> Dict:
        """
        Merge strategy: Union - Combine all LLM assessments
        """
        # Combine individual assessments from all LLMs
        merged = existing.copy()
        
        # Merge individual ranks
        merged['individual_ranks'].update(new['individual_ranks'])
        
        # Merge individual reasoning
        merged['individual_reasoning'].update(new['individual_reasoning'])
        
        # Merge individual sources
        merged['individual_sources'].update(new['individual_sources'])
        
        # Sum tokens and costs
        merged['total_tokens'] = existing['total_tokens'] + new['total_tokens']
        merged['total_cost'] = existing['total_cost'] + new['total_cost']
        
        # Note: final_rank, confidence, and agreement_level will be recalculated
        
        return merged
    
    
    def _merge_best(self, existing: Dict, new: Dict, prefer_higher_confidence: bool) -> Dict:
        """
        Merge strategy: Best - Keep the assessment with best confidence/most LLMs
        """
        existing_llm_count = len(existing['individual_ranks'])
        new_llm_count = len(new['individual_ranks'])
        
        if prefer_higher_confidence:
            # Prefer more LLM assessments (higher confidence)
            if new_llm_count > existing_llm_count:
                return new
            elif new_llm_count < existing_llm_count:
                return existing
            else:
                # Same number of LLMs, use confidence score
                if new['confidence'] > existing['confidence']:
                    return new
                else:
                    return existing
        else:
            # Just use confidence score
            if new['confidence'] > existing['confidence']:
                return new
            else:
                return existing
    
    
    def _recalculate_consensus(self, results: List[Dict]) -> List[Dict]:
        """
        Recalculate final_rank, confidence, and agreement_level for merged results
        """
        for result in results:
            # Get all ranks
            ranks = list(result['individual_ranks'].values())
            
            if len(ranks) == 0:
                continue
            
            # Calculate new consensus
            confidence, agreement_level = self.calculate_confidence(ranks)
            
            # Determine final rank (most common)
            rank_counts = Counter(ranks)
            final_rank = rank_counts.most_common(1)[0][0]
            
            # Update result
            result['final_rank'] = final_rank
            result['confidence'] = confidence
            result['agreement_level'] = agreement_level
        
        return results
    
    
    def compare_assessment_files(self, file1: str, file2: str):
        """
        Print a comparison of two assessment JSON files.

        Entries are matched on (software, method). The report lists the
        combinations found in only one file and, for combinations present in
        both, those assessed by a different set of LLMs or - when the LLM sets
        match - those with a different final rank.

        Args:
            file1: First JSON file path
            file2: Second JSON file path
        """
        print(f"\n{'='*70}")
        print("COMPARING ASSESSMENT FILES")
        print(f"{'='*70}")
        print(f"File 1: {file1}")
        print(f"File 2: {file2}")

        # Read both files before indexing either of them.
        loaded = []
        for path in (file1, file2):
            with open(path, 'r') as handle:
                loaded.append(json.load(handle))

        # Index each file by its (software, method) combination.
        first, second = (
            {(item['software'], item['method']): item for item in batch}
            for batch in loaded
        )

        keys_first = set(first)
        keys_second = set(second)
        exclusive_first = keys_first - keys_second
        exclusive_second = keys_second - keys_first
        common = keys_first & keys_second

        print(f"\nCombinations only in File 1: {len(exclusive_first)}")
        for software, method in sorted(exclusive_first):
            print(f"  - {software} / {method}")

        print(f"\nCombinations only in File 2: {len(exclusive_second)}")
        for software, method in sorted(exclusive_second):
            print(f"  - {software} / {method}")

        print(f"\nCombinations in both files: {len(common)}")

        # Collect (key, kind, value in file 1, value in file 2) per mismatch.
        mismatches = []
        for key in sorted(common):
            left, right = first[key], second[key]
            left_llms = set(left['individual_ranks'].keys())
            right_llms = set(right['individual_ranks'].keys())

            if left_llms != right_llms:
                mismatches.append((key, 'different_llms', left_llms, right_llms))
            elif left['final_rank'] != right['final_rank']:
                # Ranks are only compared when the same LLMs took part.
                mismatches.append((key, 'different_rank', left['final_rank'], right['final_rank']))

        if not mismatches:
            print("\n✓ No differences found in overlapping assessments")
        else:
            print(f"\nDifferences found: {len(mismatches)}")
            for (software, method), kind, in_first, in_second in mismatches:
                print(f"\n  {software} / {method}")
                if kind == 'different_llms':
                    print(f"    File 1 LLMs: {in_first}")
                    print(f"    File 2 LLMs: {in_second}")
                else:
                    print(f"    File 1 Rank: {in_first}")
                    print(f"    File 2 Rank: {in_second}")

        print(f"{'='*70}\n")


    def filter_results(self,
                      results: List[ConsensusResult],
                      min_confidence: float = None,
                      min_llms: int = None,
                      specific_software: List[str] = None,
                      specific_methods: List[str] = None,
                      output_file: str = None) -> List[ConsensusResult]:
        """
        Narrow a list of results down by the criteria that were supplied.

        Criteria left at None are skipped. The active ones are applied in the
        order confidence, LLM count, software, method, and the remaining count
        is printed after each step.

        Args:
            results: List of ConsensusResult objects
            min_confidence: Keep results whose confidence is at least this value
            min_llms: Keep results assessed by at least this many LLMs
            specific_software: Keep results whose software is in this list
            specific_methods: Keep results whose method is in this list
            output_file: If given, the filtered results are also exported there

        Returns:
            The filtered results; the original list object when no criterion
            was supplied.
        """
        selected = results

        if min_confidence is not None:
            selected = list(filter(lambda item: item.confidence >= min_confidence, selected))
            print(f"After confidence filter (>={min_confidence}): {len(selected)} results")

        if min_llms is not None:
            selected = list(filter(lambda item: len(item.individual_ranks) >= min_llms, selected))
            print(f"After LLM count filter (>={min_llms}): {len(selected)} results")

        if specific_software is not None:
            selected = list(filter(lambda item: item.software in specific_software, selected))
            print(f"After software filter: {len(selected)} results")

        if specific_methods is not None:
            selected = list(filter(lambda item: item.method in specific_methods, selected))
            print(f"After method filter: {len(selected)} results")

        if output_file:
            self.export_results(selected, output_file)

        return selected


In [None]:

# Driver cell: run the assessment for every software/method pair, export the
# results to JSON and print the summary, sources and cost reports.

# Initialize assessor
assessor = SoftwareMethodAssessor(use_config=True)

# Define software and methods to assess
software_list_all = [
    "PSS/E",
    "PowerWorld",
]

method_list_all = [
    "Optimal Power Flow (OPF)",
    "State Estimation",
]

# Run assessments: every software is paired with every method.
# The OpenAI and Claude switches are off for this run; only Google is requested.
results = assessor.assess_multiple(
software_list=software_list_all,
method_list=method_list_all,
use_openai=False,
use_claude=False,
use_google=True,  # Google Gemini is the only provider requested here
openai_model="gpt-4o",
claude_model="claude-sonnet-4-20250514",
google_model="models/gemini-2.0-flash-exp"  # Use flash for cost savings (overrides the method's default model)
)

# Export results (sources will be included automatically via asdict)
assessor.export_results(results, "software_method_assessment.json")

# Create and print summary report
summary = assessor.create_summary_report(results)
print("\n" + "="*60)
print("ASSESSMENT SUMMARY")
print("="*60)
print(json.dumps(summary, indent=2))

# Print sources summary
sources_summary = assessor.create_sources_summary(results)
print("\n" + "="*60)
print("SOURCES SUMMARY")
print("="*60)
print(json.dumps(sources_summary, indent=2))

# Print credit tracker summary
assessor.credit_tracker.print_summary()

# Print detailed results with sources (only results with confidence >= 0.8)
assessor.print_detailed_results(results, min_confidence=0.8)

# Or print all results with sources:
# assessor.print_detailed_results(results, min_confidence=0.0)



Starting assessment of 4 combinations...
Using OpenAI: False, Using Claude: False, Using Google: True
------------------------------------------------------------

[1/4] Processing: PSS/E - Optimal Power Flow (OPF)
Assessing PSS/E - Optimal Power Flow (OPF) with Google Gemini...
  Rank: 3, Confidence: 50.00%
  Running cost: $0.0000 (1,225 tokens)

[2/4] Processing: PSS/E - State Estimation
Assessing PSS/E - State Estimation with Google Gemini...
  Rank: 3, Confidence: 50.00%
  Running cost: $0.0000 (2,489 tokens)

[3/4] Processing: PowerWorld - Optimal Power Flow (OPF)
Assessing PowerWorld - Optimal Power Flow (OPF) with Google Gemini...
  Rank: 3, Confidence: 50.00%
  Running cost: $0.0000 (3,631 tokens)

[4/4] Processing: PowerWorld - State Estimation
Assessing PowerWorld - State Estimation with Google Gemini...
  Rank: 3, Confidence: 50.00%
  Running cost: $0.0000 (4,829 tokens)

Results exported to software_method_assessment.json

ASSESSMENT SUMMARY
{
  "total_assessments": 4,
  "