# Software mapping

In [2]:
# =============================================================================
# IMPORTS
# =============================================================================
import openai
import anthropic
import json
import time
import configparser
import tiktoken
from typing import List, Dict, Tuple, Optional, Callable
from collections import Counter
import numpy as np
from dataclasses import dataclass, asdict
import google.generativeai as genai
from itertools import combinations
import random
from datetime import datetime
from pathlib import Path
import pickle
import traceback
import pandas as pd
from difflib import get_close_matches, SequenceMatcher

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# %%
# =============================================================================
# CONFIGURATION & INITIALIZATION
# =============================================================================

def initialize_openai():
    """Initialize OpenAI client from config file"""
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE_adv', 'gpt-4o-mini')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

def initialize_anthropic():
    """Initialize Anthropic client from config file"""
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('ANTHROPIC_API_KEY')
    client = anthropic.Anthropic(api_key=api_key) if api_key else None
    return client

def initialize_google():
    """Initialize Google Gemini client from config file"""
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('GOOGLE_API_KEY')
    if api_key:
        genai.configure(api_key=api_key)
        return True
    return False

print("✓ Initialization functions defined")


✓ Initialization functions defined


In [8]:
# %%
# =============================================================================
# TOKEN COUNTING UTILITIES
# =============================================================================

def num_tokens_from_string(string: str, model_name: str) -> int:
    """Get token count with fallback for unsupported models"""
    try:
        encoding = tiktoken.encoding_for_model(model_name)
        return len(encoding.encode(string))
    except KeyError:
        if model_name.startswith('gpt-5'):
            encoding = tiktoken.get_encoding("o200k_base")
            return len(encoding.encode(string))
        elif model_name.startswith('gpt-4'):
            encoding = tiktoken.get_encoding("cl100k_base")
            return len(encoding.encode(string))
        elif model_name.startswith('claude'):
            return int(len(string) / 3.5)
        elif model_name.startswith('models/gemini') or model_name.startswith('gemini'):
            return int(len(string) / 4)
        else:
            return len(string) // 4

def count_tokens_in_messages(messages: List[Dict], model: str) -> int:
    """Count tokens in a list of messages"""
    total_tokens = 0
    for message in messages:
        if isinstance(message.get('content'), str):
            total_tokens += num_tokens_from_string(message['content'], model)
        total_tokens += 4  # Message overhead
    total_tokens += 3  # Completion overhead
    return total_tokens

print("✓ Token counting utilities defined")


✓ Token counting utilities defined


In [9]:
# %%
# =============================================================================
# CREDIT TRACKER
# =============================================================================

class CreditTracker:
    """Track API usage and costs across all models"""
    
    PRICING = {
        # OpenAI
        'gpt-4o': {'input': 1.25, 'output': 5.00},
        'gpt-4o-mini': {'input': 0.075, 'output': 0.30},
        
        # Claude
        'claude-3-haiku-20240307': {'input': 0.25, 'output': 1.25},
        'claude-3-5-haiku-20241022': {'input': 0.80, 'output': 4.00},
        'claude-3-5-sonnet-20241022': {'input': 3.00, 'output': 15.00},
        'claude-sonnet-4-20250514': {'input': 3.00, 'output': 15.00},
        
        # Google
        'models/gemini-2.5-flash': {'input': 0.075, 'output': 0.30},
        'models/gemini-2.0-flash': {'input': 0.075, 'output': 0.30},
        'models/gemini-2.0-flash-001': {'input': 0.075, 'output': 0.30},
        'gemini-2.0-flash': {'input': 0.075, 'output': 0.30},
    }

    def __init__(self):
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.total_cached_tokens = 0
        self.total_cost = 0
        self.model_usage = {}
        self.call_count = 0

    def update(self, model: str, input_tokens: int, output_tokens: int, cached_tokens: int = 0):
        """Update usage statistics"""
        self.total_input_tokens += input_tokens
        self.total_output_tokens += output_tokens
        self.total_cached_tokens += cached_tokens
        self.call_count += 1

        pricing = self.PRICING.get(model, {'input': 0.00015, 'output': 0.0006})
        
        input_cost = (input_tokens / 1_000_000) * pricing['input']
        output_cost = (output_tokens / 1_000_000) * pricing['output']
        call_cost = input_cost + output_cost
        self.total_cost += call_cost

        if model not in self.model_usage:
            self.model_usage[model] = {
                'calls': 0, 'input_tokens': 0, 'output_tokens': 0,
                'cached_tokens': 0, 'cost': 0
            }

        self.model_usage[model]['calls'] += 1
        self.model_usage[model]['input_tokens'] += input_tokens
        self.model_usage[model]['output_tokens'] += output_tokens
        self.model_usage[model]['cached_tokens'] += cached_tokens
        self.model_usage[model]['cost'] += call_cost

    def get_stats(self):
        """Get current statistics"""
        return {
            "total_calls": self.call_count,
            "total_input_tokens": self.total_input_tokens,
            "total_output_tokens": self.total_output_tokens,
            "total_tokens": self.total_input_tokens + self.total_output_tokens,
            "total_cost": round(self.total_cost, 4),
            "average_cost_per_call": round(self.total_cost / max(self.call_count, 1), 4),
            "model_breakdown": {
                model: {
                    'calls': stats['calls'],
                    'total_tokens': stats['input_tokens'] + stats['output_tokens'],
                    'cost': round(stats['cost'], 4)
                }
                for model, stats in self.model_usage.items()
            }
        }

    def print_summary(self):
        """Print formatted summary"""
        stats = self.get_stats()
        print("\n" + "="*60)
        print("API USAGE SUMMARY")
        print("="*60)
        print(f"Total API Calls: {stats['total_calls']}")
        print(f"Total Tokens: {stats['total_tokens']:,}")
        print(f"  - Input: {stats['total_input_tokens']:,}")
        print(f"  - Output: {stats['total_output_tokens']:,}")
        if self.total_cached_tokens > 0:
            print(f"  - Cached: {self.total_cached_tokens:,}")
        print(f"\nTotal Cost: ${stats['total_cost']:.4f}")
        print(f"Average Cost per Call: ${stats['average_cost_per_call']:.4f}")

        if self.model_usage:
            print("\nBreakdown by Model:")
            print("-" * 60)
            for model, breakdown in stats['model_breakdown'].items():
                print(f"  {model}:")
                print(f"    Calls: {breakdown['calls']}")
                print(f"    Tokens: {breakdown['total_tokens']:,}")
                print(f"    Cost: ${breakdown['cost']:.4f}")
        print("="*60 + "\n")

print("✓ CreditTracker class defined")


✓ CreditTracker class defined


## The method assessor

In [10]:
# %%
# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class AssessmentResult:
    """Single LLM assessment result"""
    software: str
    method: str
    rank: int
    reasoning: str
    sources: List[str]
    llm_provider: str
    input_tokens: int = 0
    output_tokens: int = 0

@dataclass
class ConsensusResult:
    """Consensus across multiple LLMs"""
    software: str
    method: str
    final_rank: int
    confidence: float
    individual_ranks: Dict[str, int]
    individual_reasoning: Dict[str, str]
    individual_sources: Dict[str, List[str]]
    agreement_level: str
    total_tokens: int = 0
    total_cost: float = 0.0

print("✓ Data structures defined")


✓ Data structures defined


In [11]:
# %%
# =============================================================================
# MAIN ASSESSOR CLASS - PART 1: Core Functions
# =============================================================================

class SoftwareMethodAssessor:
    """Main class for software-method assessment using multiple LLMs"""
    
    def __init__(self, use_config: bool = True, timeout: int = 180, max_retries: int = 3):
        """Initialize assessor with API clients"""
        if use_config:
            self.openai_client, self.default_model = initialize_openai()
            self.anthropic_client = initialize_anthropic()
            self.google_enabled = initialize_google()
        else:
            self.openai_client = None
            self.anthropic_client = None
            self.google_enabled = False
            self.default_model = "gpt-4o-mini"
        
        self.credit_tracker = CreditTracker()
        self.timeout = timeout
        self.max_retries = max_retries
        
        # System prompt for assessments
        self.system_prompt = """You are a technical software assessment expert specialized in power systems analysis software.

Use this ranking scale:
0 = No support (method cannot be implemented at all)
1 = Limited possibility for implementation or extension (requires significant workarounds)
2 = Indirectly supported through APIs or extensions (requires external tools/plugins)
3 = Directly implemented (native feature in the software)

CRITICAL: You MUST search for and provide actual references. Your assessment must be based on real, verifiable sources.

For each assessment:
1. Search for scientific papers demonstrating implementation (IEEE Xplore, ScienceDirect, arXiv, Google Scholar)
2. Find official documentation from the software vendor or project website
3. Look for GitHub repositories with code examples or open-source implementations
4. Check API documentation or extension/plugin capabilities
5. Review user forums, technical blogs, Stack Overflow, or case studies

Return your response in VALID JSON format with this exact structure:
{
    "rank": <0-3>,
    "reasoning": "<detailed explanation citing specific sources by number, e.g., 'According to [1], PSS/E supports...'>",
    "sources": [
        "https://example.com/documentation - Official PSS/E Manual on OPF",
        "https://doi.org/10.1109/... - Paper title by Author et al.",
        "https://github.com/org/repo/file.py - Implementation example"
    ]
}

Each source must include both the URL and a brief description separated by ' - '.
Minimum 2 sources required for ranks 2-3, minimum 1 source for rank 1."""

    def calculate_confidence(self, ranks: List[int]) -> Tuple[float, str]:
        """Calculate confidence score from multiple assessments"""
        if not ranks:
            return 0.0, "no_data"
        
        rank_counts = Counter(ranks)
        most_common_count = rank_counts.most_common(1)[0][1]
        total_ranks = len(ranks)
        confidence = most_common_count / total_ranks
        
        if total_ranks == 1:
            agreement_level = "single_assessment"
        elif confidence == 1.0:
            agreement_level = "perfect_agreement"
        elif confidence >= 0.75:
            agreement_level = "strong_agreement"
        elif confidence >= 0.5:
            agreement_level = "moderate_agreement"
        else:
            agreement_level = "weak_agreement"
        
        return confidence, agreement_level

    def export_results(self, results: List[ConsensusResult], filename: str):
        """Export results to JSON file"""
        output_data = [asdict(result) for result in results]
        with open(filename, 'w') as f:
            json.dump(output_data, f, indent=2)
        print(f"\nResults exported to {filename}")

print("✓ SoftwareMethodAssessor class initialized (Part 1)")


✓ SoftwareMethodAssessor class initialized (Part 1)


In [12]:
# %%
# =============================================================================
# MAIN ASSESSOR CLASS - PART 2: Batch Assessment Methods
# =============================================================================

def create_batch_assessment_prompt(self, batch_items: List[Tuple[str, str]], batch_size: int = None) -> str:
    """Create a structured prompt for batch assessment"""
    batch_size = batch_size or len(batch_items)
    
    items_text = ""
    for idx, (software, method) in enumerate(batch_items, 1):
        items_text += f"\n{idx}. Software: {software}\n   Method: {method}\n"
    
    prompt = f"""You must assess {len(batch_items)} software-method combinations independently.

CRITICAL INSTRUCTIONS:
- Treat each pair as completely independent
- Do NOT let one assessment influence another
- Provide the SAME quality of research and reasoning for ALL items
- Each assessment must have its own sources

Items to assess:{items_text}

For EACH item above, perform independent research and provide sources with URLs.

Return a JSON array with exactly {len(batch_items)} objects:
[
  {{
    "software": "<software name>",
    "method": "<method name>",
    "rank": <0-3>,
    "reasoning": "<detailed explanation citing sources [1], [2], etc.>",
    "sources": [
      "https://... - Description",
      "https://... - Description"
    ]
  }},
  ...
]

IMPORTANT: Return ONLY the JSON array, no other text."""
    
    return prompt

# Add to SoftwareMethodAssessor class
SoftwareMethodAssessor.create_batch_assessment_prompt = create_batch_assessment_prompt

print("✓ Batch prompt creation added")


✓ Batch prompt creation added


In [13]:
# %%
# =============================================================================
# OPENAI ASSESSMENT METHOD
# =============================================================================

def assess_batch_with_openai(self, batch_items: List[Tuple[str, str]], 
                             model: str = None, debug: bool = False) -> List[AssessmentResult]:
    """Assess a batch of items with OpenAI"""
    if model is None:
        model = self.default_model
    
    try:
        prompt = self.create_batch_assessment_prompt(batch_items)
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": prompt}
        ]
        
        if debug:
            print(f"  DEBUG: Batch size: {len(batch_items)}")
        
        response = self.openai_client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.3,
            max_tokens=4096,
            timeout=self.timeout,
            response_format={"type": "json_object"}
        )
        
        usage = response.usage
        cached_tokens = 0
        if hasattr(usage, 'prompt_tokens_details') and usage.prompt_tokens_details:
            cached_tokens = getattr(usage.prompt_tokens_details, 'cached_tokens', 0)
        
        self.credit_tracker.update(
            model=model,
            input_tokens=usage.prompt_tokens,
            output_tokens=usage.completion_tokens,
            cached_tokens=cached_tokens
        )
        
        content = response.choices[0].message.content
        
        # Parse JSON
        try:
            parsed = json.loads(content)
            if isinstance(parsed, dict):
                if 'assessments' in parsed:
                    results_data = parsed['assessments']
                elif 'results' in parsed:
                    results_data = parsed['results']
                else:
                    results_data = next(v for v in parsed.values() if isinstance(v, list))
            else:
                results_data = parsed
        except Exception as e:
            print(f"  ERROR parsing batch response: {e}")
            return []
        
        # Convert to AssessmentResult objects
        assessment_results = []
        for item_data in results_data:
            rank = int(item_data.get("rank", 0))
            sources = item_data.get("sources", [])
            reasoning = item_data.get("reasoning", "")
            
            if rank > 0 and len(sources) == 0:
                rank = 0
                reasoning += " [Rank lowered to 0: no sources]"
            
            assessment_results.append(AssessmentResult(
                software=item_data.get("software", ""),
                method=item_data.get("method", ""),
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"openai_{model}",
                input_tokens=usage.prompt_tokens // len(batch_items),
                output_tokens=usage.completion_tokens // len(batch_items)
            ))
        
        return assessment_results
        
    except Exception as e:
        print(f"  ERROR in OpenAI batch assessment: {e}")
        return []

# Add to class
SoftwareMethodAssessor.assess_batch_with_openai = assess_batch_with_openai

print("✓ OpenAI assessment method added")


✓ OpenAI assessment method added


In [15]:
# %%
# =============================================================================
# CLAUDE ASSESSMENT METHOD
# =============================================================================

def assess_batch_with_claude(self, batch_items: List[Tuple[str, str]], 
                             model: str = "claude-3-5-haiku-20241022", 
                             debug: bool = False) -> List[AssessmentResult]:
    """Assess a batch of items with Claude"""
    if not self.anthropic_client:
        return []
    
    try:
        prompt = self.create_batch_assessment_prompt(batch_items)
        
        if debug:
            print(f"  DEBUG: Batch size: {len(batch_items)}")
        
        response = self.anthropic_client.messages.create(
            model=model,
            max_tokens=4096,
            temperature=0.3,
            timeout=self.timeout,
            system=self.system_prompt,
            messages=[{"role": "user", "content": prompt}]
        )
        
        self.credit_tracker.update(
            model=model,
            input_tokens=response.usage.input_tokens,
            output_tokens=response.usage.output_tokens
        )
        
        content = response.content[0].text
        
        # Clean markdown code blocks
        content = content.strip()
        if content.startswith('```'):
            lines = content.split('\n')
            start_idx = 0
            end_idx = len(lines)
            for i, line in enumerate(lines):
                if line.strip().startswith('```'):
                    if start_idx == 0:
                        start_idx = i + 1
                    else:
                        end_idx = i
                        break
            content = '\n'.join(lines[start_idx:end_idx])
        
        # Parse JSON
        try:
            parsed = json.loads(content)
            if isinstance(parsed, dict):
                if 'assessments' in parsed:
                    results_data = parsed['assessments']
                elif 'results' in parsed:
                    results_data = parsed['results']
                else:
                    results_data = next(v for v in parsed.values() if isinstance(v, list))
            else:
                results_data = parsed
        except Exception as e:
            print(f"  ERROR parsing Claude batch response: {e}")
            return []
        
        # Convert to AssessmentResult objects
        assessment_results = []
        for item_data in results_data:
            rank = int(item_data.get("rank", 0))
            sources = item_data.get("sources", [])
            reasoning = item_data.get("reasoning", "")
            
            if rank > 0 and len(sources) == 0:
                rank = 0
                reasoning += " [Rank lowered to 0: no sources]"
            
            assessment_results.append(AssessmentResult(
                software=item_data.get("software", ""),
                method=item_data.get("method", ""),
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"claude_{model}",
                input_tokens=response.usage.input_tokens // len(batch_items),
                output_tokens=response.usage.output_tokens // len(batch_items)
            ))
        
        return assessment_results
        
    except Exception as e:
        print(f"  ERROR in Claude batch assessment: {e}")
        return []

# Add to class
SoftwareMethodAssessor.assess_batch_with_claude = assess_batch_with_claude

print("✓ Claude assessment method added")


✓ Claude assessment method added


In [17]:
# %%
# =============================================================================
# GOOGLE ASSESSMENT METHOD
# =============================================================================

def assess_batch_with_google(self, batch_items: List[Tuple[str, str]], 
                             model: str = "models/gemini-2.0-flash", 
                             debug: bool = False) -> List[AssessmentResult]:
    """Assess a batch of items with Google Gemini"""
    if not self.google_enabled:
        return []
    
    try:
        prompt = self.create_batch_assessment_prompt(batch_items)
        
        if not model.startswith('models/'):
            model = f"models/{model}"
        
        gemini_model = genai.GenerativeModel(
            model_name=model,
            generation_config={
                "temperature": 0.3,
                "max_output_tokens": 4096,
            },
            system_instruction=self.system_prompt
        )
        
        response = gemini_model.generate_content(
            prompt,
            generation_config={
                "temperature": 0.3,
                "max_output_tokens": 4096,
            },
            request_options={'timeout': self.timeout}
        )
        
        if debug:
            print(f"  DEBUG: Successfully used model: {model}")
        
        # Extract token counts
        try:
            input_tokens = response.usage_metadata.prompt_token_count
            output_tokens = response.usage_metadata.candidates_token_count
        except AttributeError:
            input_tokens = int(len(prompt.split()) * 1.3)
            output_tokens = int(len(response.text.split()) * 1.3)
        
        self.credit_tracker.update(
            model=model,
            input_tokens=int(input_tokens),
            output_tokens=int(output_tokens)
        )
        
        content = response.text.strip()
        
        # Clean markdown
        if content.startswith('```'):
            lines = content.split('\n')
            start_idx = 0
            end_idx = len(lines)
            for i, line in enumerate(lines):
                if line.strip().startswith('```'):
                    if start_idx == 0:
                        start_idx = i + 1
                    else:
                        end_idx = i
                        break
            content = '\n'.join(lines[start_idx:end_idx])
        
        # Parse JSON
        try:
            parsed = json.loads(content)
            if isinstance(parsed, dict):
                if 'assessments' in parsed:
                    results_data = parsed['assessments']
                elif 'results' in parsed:
                    results_data = parsed['results']
                else:
                    results_data = next(v for v in parsed.values() if isinstance(v, list))
            else:
                results_data = parsed
        except Exception as e:
            print(f"  ERROR parsing Google batch response: {e}")
            return []
        
        # Convert to AssessmentResult objects
        assessment_results = []
        for item_data in results_data:
            rank = int(item_data.get("rank", 0))
            sources = item_data.get("sources", [])
            reasoning = item_data.get("reasoning", "")
            
            if rank > 0 and len(sources) == 0:
                rank = 0
                reasoning += " [Rank lowered to 0: no sources]"
            
            assessment_results.append(AssessmentResult(
                software=item_data.get("software", ""),
                method=item_data.get("method", ""),
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"google_{model.replace('models/', '')}",
                input_tokens=int(input_tokens) // len(batch_items),
                output_tokens=int(output_tokens) // len(batch_items)
            ))
        
        return assessment_results
        
    except Exception as e:
        print(f"  ERROR in Google batch assessment: {e}")
        return []

# Add to class
SoftwareMethodAssessor.assess_batch_with_google = assess_batch_with_google

print("✓ Google assessment method added")


✓ Google assessment method added


In [18]:
# %%
# =============================================================================
# BATCH CREATION METHOD
# =============================================================================

def create_batches(self, software_list: List[str], method_list: List[str],
                  strategy: str = "by_software") -> List[List[Tuple[str, str]]]:
    """
    Create batches of (software, method) pairs
    
    Args:
        software_list: List of software names
        method_list: List of methods
        strategy: "by_software", "by_method", "mixed", or "fixed_size"
    
    Returns:
        List of batches
    """
    batches = []
    
    if strategy == "by_software":
        # One batch per software with all its methods
        for software in software_list:
            batch = [(software, method) for method in method_list]
            batches.append(batch)
    
    elif strategy == "by_method":
        # One batch per method with all software
        for method in method_list:
            batch = [(software, method) for software in software_list]
            batches.append(batch)
    
    elif strategy == "mixed":
        # Alternate between by_software and by_method
        for i, software in enumerate(software_list[:len(software_list)//2 + 1]):
            batch = [(software, method) for method in method_list]
            batches.append(batch)
        for method in method_list:
            batch = [(software, method) for software in software_list[len(software_list)//2 + 1:]]
            if batch:
                batches.append(batch)
    
    else:  # "fixed_size" - already handled in assess_multiple_batched
        all_items = [(sw, method) for sw in software_list for method in method_list]
        batches = [all_items]  # Return as single batch, will be split later
    
    return batches

# Add to class
SoftwareMethodAssessor.create_batches = create_batches

print("✓ Batch creation method added")


✓ Batch creation method added


In [19]:
# %%
# =============================================================================
# MAIN BATCHED ASSESSMENT METHOD
# =============================================================================

def assess_multiple_batched(self, software_list: List[str], method_list: List[str],
                           batch_strategy: str = "by_software",
                           batch_size: int = 100,
                           overlap_percentage: float = 0.0,
                           use_openai: bool = True,
                           use_claude: bool = True,
                           use_google: bool = True,
                           openai_model: str = None,
                           claude_model: str = "claude-3-5-haiku-20241022",
                           google_model: str = "models/gemini-2.0-flash",
                           debug: bool = False) -> List[ConsensusResult]:
    """
    Assess multiple software-method combinations using batch processing
    """
    total_items = len(software_list) * len(method_list)
    
    print(f"\n{'='*70}")
    print(f"BATCH ASSESSMENT MODE")
    print(f"{'='*70}")
    print(f"Total items: {total_items}")
    print(f"Strategy: {batch_strategy}")
    print(f"LLMs: OpenAI={use_openai}, Claude={use_claude}, Google={use_google}")
    
    # Create batches
    batches = self.create_batches(software_list, method_list, batch_strategy)
    
    print(f"\nCreated {len(batches)} batches")
    for i, batch in enumerate(batches, 1):
        print(f"  Batch {i}: {len(batch)} items")
    
    # Store all individual assessments
    all_assessments = {}  # (software, method) -> list of AssessmentResult
    
    print(f"\n{'-'*70}")
    print(f"Processing batches...")
    print(f"{'-'*70}")
    
    # Process each batch with each LLM
    for batch_idx, batch in enumerate(batches, 1):
        print(f"\n[Batch {batch_idx}/{len(batches)}] {len(batch)} items")
        
        batch_results = []
        
        if use_openai and self.openai_client:
            print(f"  Assessing with OpenAI...")
            results = self.assess_batch_with_openai(batch, openai_model, debug)
            batch_results.extend(results)
            time.sleep(1)
        
        if use_claude and self.anthropic_client:
            print(f"  Assessing with Claude...")
            results = self.assess_batch_with_claude(batch, claude_model, debug)
            batch_results.extend(results)
            time.sleep(1)
        
        if use_google and self.google_enabled:
            print(f"  Assessing with Google...")
            results = self.assess_batch_with_google(batch, google_model, debug)
            batch_results.extend(results)
            time.sleep(1)
        
        # Store results
        for result in batch_results:
            key = (result.software, result.method)
            if key not in all_assessments:
                all_assessments[key] = []
            all_assessments[key].append(result)
        
        # Show progress
        stats = self.credit_tracker.get_stats()
        print(f"  Running cost: ${stats['total_cost']:.4f} ({stats['total_tokens']:,} tokens)")
    
    # Create consensus results
    print(f"\n{'-'*70}")
    print(f"Creating consensus results...")
    print(f"{'-'*70}")
    
    consensus_results = []
    
    for (software, method), assessments in all_assessments.items():
        if len(assessments) == 0:
            continue
        
        # Group by LLM provider (handle duplicates)
        by_provider = {}
        for assessment in assessments:
            if assessment.llm_provider not in by_provider:
                by_provider[assessment.llm_provider] = assessment
        
        assessments = list(by_provider.values())
        
        ranks = [a.rank for a in assessments]
        confidence, agreement_level = self.calculate_confidence(ranks)
        
        rank_counts = Counter(ranks)
        final_rank = rank_counts.most_common(1)[0][0]
        
        individual_ranks = {a.llm_provider: a.rank for a in assessments}
        individual_reasoning = {a.llm_provider: a.reasoning for a in assessments}
        individual_sources = {a.llm_provider: a.sources for a in assessments}
        
        total_tokens = sum(a.input_tokens + a.output_tokens for a in assessments)
        
        # Calculate cost
        total_cost = sum([
            self.credit_tracker.PRICING.get(
                a.llm_provider.replace('openai_', '').replace('claude_', '').replace('google_', ''),
                {'input': 0, 'output': 0}
            )['input'] * a.input_tokens / 1_000_000 +
            self.credit_tracker.PRICING.get(
                a.llm_provider.replace('openai_', '').replace('claude_', '').replace('google_', ''),
                {'input': 0, 'output': 0}
            )['output'] * a.output_tokens / 1_000_000
            for a in assessments
        ])
        
        consensus_results.append(ConsensusResult(
            software=software,
            method=method,
            final_rank=final_rank,
            confidence=confidence,
            individual_ranks=individual_ranks,
            individual_reasoning=individual_reasoning,
            individual_sources=individual_sources,
            agreement_level=agreement_level,
            total_tokens=total_tokens,
            total_cost=total_cost
        ))
    
    print(f"\nCompleted {len(consensus_results)} assessments")
    print(f"{'='*70}\n")
    
    return consensus_results

# Add to class
SoftwareMethodAssessor.assess_multiple_batched = assess_multiple_batched

print("✓ Main batched assessment method added")


✓ Main batched assessment method added


In [20]:
# %%
# =============================================================================
# RESULT MERGER
# =============================================================================

def merge_assessment_results(self, *result_files: str, output_file: str = "merged_results.json",
                            merge_strategy: str = "union") -> List[ConsensusResult]:
    """
    Merge multiple assessment result JSON files
    """
    print(f"\n{'='*70}")
    print(f"MERGING ASSESSMENT RESULTS")
    print(f"{'='*70}")
    print(f"Strategy: {merge_strategy}")
    print(f"Input files: {len(result_files)}")
    
    merged_data = {}
    
    for file_idx, file_path in enumerate(result_files, 1):
        print(f"\nProcessing file {file_idx}/{len(result_files)}: {file_path}")
        
        try:
            with open(file_path, 'r') as f:
                results = json.load(f)
            
            print(f"  Loaded {len(results)} assessments")
            
            for result in results:
                software = result['software']
                method = result['method']
                key = (software, method)
                
                if key not in merged_data:
                    merged_data[key] = result
                else:
                    # Merge: combine all LLM assessments
                    merged_data[key]['individual_ranks'].update(result['individual_ranks'])
                    merged_data[key]['individual_reasoning'].update(result['individual_reasoning'])
                    merged_data[key]['individual_sources'].update(result['individual_sources'])
                    merged_data[key]['total_tokens'] += result['total_tokens']
                    merged_data[key]['total_cost'] += result['total_cost']
                
        except Exception as e:
            print(f"  ERROR loading {file_path}: {e}")
            continue
    
    # Recalculate consensus
    print(f"\nRecalculating consensus for merged results...")
    merged_results_list = list(merged_data.values())
    
    for result in merged_results_list:
        ranks = list(result['individual_ranks'].values())
        rank_counts = Counter(ranks)
        result['final_rank'] = rank_counts.most_common(1)[0][0]
        
        # Calculate confidence
        most_common_count = rank_counts.most_common(1)[0][1]
        result['confidence'] = most_common_count / len(ranks)
        
        if result['confidence'] == 1.0:
            result['agreement_level'] = "perfect_agreement"
        elif result['confidence'] >= 0.75:
            result['agreement_level'] = "strong_agreement"
        elif result['confidence'] >= 0.5:
            result['agreement_level'] = "moderate_agreement"
        else:
            result['agreement_level'] = "weak_agreement"
    
    # Save merged results
    with open(output_file, 'w') as f:
        json.dump(merged_results_list, f, indent=2)
    
    print(f"\n✓ Merged results saved to: {output_file}")
    print(f"{'='*70}\n")
    
    # Convert to ConsensusResult objects
    consensus_results = []
    for result in merged_results_list:
        consensus_results.append(ConsensusResult(
            software=result['software'],
            method=result['method'],
            final_rank=result['final_rank'],
            confidence=result['confidence'],
            individual_ranks=result['individual_ranks'],
            individual_reasoning=result['individual_reasoning'],
            individual_sources=result['individual_sources'],
            agreement_level=result['agreement_level'],
            total_tokens=result['total_tokens'],
            total_cost=result['total_cost']
        ))
    
    return consensus_results

# Add to class
SoftwareMethodAssessor.merge_assessment_results = merge_assessment_results

print("✓ Result merger added")


✓ Result merger added


In [None]:
# %%
# =============================================================================
# LOAD YOUR SOFTWARE AND METHOD LISTS
# =============================================================================

# Replace with your actual data loading
# Example:
# software_list_all = pd.read_csv('software_list.csv')['Name'].tolist()
# method_list_all = pd.read_csv('method_list.csv')['Method'].tolist()

# For testing, here's a placeholder:
software_list_all = [
    'Power Factory Digisilent','DINIS','ERACS','Distribution Network Analysis - ETAP','IPSA',
'Power World','PSS/E','PSSE/SINCAL','SKM Power Tools','OpenDSS','Matlab & Simulink','DYMOLA','MathPower',
'RelyPES','GridLAB-D','PyPSA (Python for Power System Analysis)','TARA','PyPower/Pandapower','GridCal Sk','MatDyn',
'NEPLAN','PSAT','CYMEDIST','Synergi Electric','Dynawo','OpenModellica',
'Sienna(PowerModels.jl PowerSystems.jl & PowerSimulations.jl PowerFlows.jl)','POWSYBL','Hitachi Network Manager','Spectrum Power',
'CIMPLICITY Scada','eTerra','Netbas','Trimble NIS','GAMS'
]

method_list_all = [
'power flow analysis','security-constrained optimal power flow','security constrained unit commitment',
'Non Linear Optimal Power Flow','Multi-Period  Optimisation ','unit commitment','genetic algorithm','neural network',
'kalman filter','monte-carlo','random forest','deep-learning','particle swarm optimization','fuzzy logic','time series',
'artificial bee colony','stochastic simulation','fault analysis','reinforcement learning','linear programming','mixed integer linear programming',
'support vector machine','ensemble-learning','graph-neural network','numerical solvers','global optimization','economic dispatch ED',
'probabilistic-forecasting','General Optimization','data envelopment analysis','machine learning','deep neural network','voltage stability',
'probabilistic analysis','real-time data analysis','optimal power flow','demand response','optimal capacity configuration','sensitivity analysis',
'sequential monte carlo','fuzzy logic','load forecasting','load balancing','power forecasting','state estimation','hosting capacity',
'error estimation techniques','stochastic model','failure modeling','loss of load expectancy','system identification','economic dispatch',
'time series analysis','multi-objective optimization','expected energy not served','power system flexibility','decision tree',
'contingency analysis','load frequency control','power factor correction','voltage control strategy','multi-agent system',
'system average interruption duration index','dynamic line rating','static var compensator','dynamic programming','model predictive control',
'k-means clustering','linear regression','principal component analysis','fault detection classification',
'system average interruption frequency index','stochastic optimization','cost-benefit analysis','fuzzy inference system',
'differential evolution','multi-state model','fault tree analysis','reliability economics','short-term load forecasting',
'dynamic voltage restorer','dynamic reactive power compensation','shunt active power filter','fault detection diagnosis',
'phase-locked loop','power system restoration','load carrying capability elcc','wind power prediction','discrete wavelet transform',
'dynamic resource allocation','space vector pulse width modulation','logistic regression','game theory','binary particle swarm',
'power system stabilizer','firefly algorithm','sliding mode control','modified ieee rts','heuristic optimization','partial discharge pd',
'stochastic programming','simulated annealing','support vector regression','two-stage stochastic','adaptive neuro-fuzzy inference',
'predictive modeling','short-term memory lstm network','load shifting','cuckoo search','automatic generation control agc','quantum computing',
'power quality disturbance','doubly-fed induction','convolutional neural network cnns','empirical mode decomposition','evolution algorithm',
'deep reinforcement learning drl','minimal cut set','tabu search','generative adversarial network','gated recurrent unit',
'approximate computing','demand side management dsm','frequency variation','markov chain monte carlo','ant colony optimization',
'predictive controller','multi-objective particle swarm optimization','power generation modeling','quantile regression','dynamic pricing',
'wavelet transform dwt','modal analysis','power quality assessment','reactive power sharing','quadratic programming','stochastic unit commitment',
'interior point method','process regression','second-order cone','energy resilience analysis','metaheuristics','bayesian optimization',
'clustering analysis','power transfer distribution factor','harmony search','optimization gwo','fuzzy comprehensive evaluation',
'deep deterministic','gaussian process regression','svd','bat algorithm','cumulative distribution function','deep deterministic policy gradient',
'genetic programming','sequential quadratic programming','energy demand forecasting','supply chain optimization','levelized cost of energy lcoe',
'frequency nadir','multi-output','hybrid system modeling','proton exchange membrane','hybrid acdc microgrid','multiple-input-multiple-output mimo',
'alternating direction method','hybrid optimization model','load shedding analysis','non-dominated sorting genetic','deep q-network',
'line outage distribution factor','multi-criteria decision analysis','closed-form expression','energy transition modeling','point estimate method',
'signal noise ratio','agent-based modeling','environmental impact assessment','data-driven optimization','energy consumption modeling',
'state-space modeling','quadrature pase shift keying','multi-fidelity model','stochastic geometry','quadrature amplitude modulation',
'orthogonal frequency-division multiplexing','minimum mean square','adaptive modulation','error rate ber performance']



print(f"✓ Loaded {len(software_list_all)} software")
print(f"✓ Loaded {len(method_list_all)} methods")
print(f"✓ Total pairs to assess: {len(software_list_all) * len(method_list_all)}")


In [None]:
# %%
# =============================================================================
# EXECUTION: RUN 1 - OpenAI + Google
# =============================================================================

from pathlib import Path
from datetime import datetime

# Setup output directory
output_dir = Path("software_analysis_final")
output_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print(f"Output directory: {output_dir}")
print(f"Timestamp: {timestamp}")

# Initialize assessor
assessor = SoftwareMethodAssessor(use_config=True, timeout=180)

print("\n" + "="*70)
print("RUN 1: OpenAI + Google Assessment")
print("="*70)

# Run assessment
results_run1 = assessor.assess_multiple_batched(
    software_list=software_list_all,
    method_list=method_list_all,
    batch_strategy="by_software",  # Assess all methods per software
    use_openai=True,
    use_google=True,
    use_claude=False,
    openai_model='gpt-4o-mini',
    google_model='models/gemini-2.0-flash'
)

# Save results
run1_file = output_dir / f"run1_openai_google_{timestamp}.json"
assessor.export_results(results_run1, str(run1_file))

# Save as CSV for easy viewing
results_df = pd.DataFrame([{
    'software': r.software,
    'method': r.method,
    'final_rank': r.final_rank,
    'confidence': r.confidence,
    'agreement_level': r.agreement_level,
    'num_llms': len(r.individual_ranks)
} for r in results_run1])

csv_file = output_dir / f"run1_openai_google_{timestamp}.csv"
results_df.to_csv(csv_file, index=False)

# Print summary
assessor.credit_tracker.print_summary()
print(f"\n✓ Run 1 complete!")
print(f"  JSON: {run1_file}")
print(f"  CSV: {csv_file}")
print(f"  Completed {len(results_run1)} assessments")
print("\n" + "="*70)


In [183]:
## CSV merging with existing file
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple
import json
from difflib import get_close_matches

class ResultCSVMerger:
    """
    Merge LLM assessment results into existing CSV with software/method matrix
    """
    
    def __init__(self, 
                 csv_file: str,
                 delimiter: str = ';',
                 software_name_column: str = 'Name',
                 method_start_column: str = 'Numerical-solvers'):
        """
        Initialize CSV merger
        
        Args:
            csv_file: Path to existing CSV file
            delimiter: CSV delimiter (default ';' for European format)
            software_name_column: Column name containing software names
            method_start_column: First method column (all columns after this are methods)
        """
        self.csv_file = Path(csv_file)
        self.delimiter = delimiter
        self.software_name_column = software_name_column
        self.method_start_column = method_start_column
        
        # Load CSV
        self.df = pd.read_csv(csv_file, delimiter=delimiter, encoding='utf-8-sig')
        
        # Identify method columns
        self.info_columns = []
        self.method_columns = []
        found_methods = False
        
        for col in self.df.columns:
            if col == method_start_column:
                found_methods = True
            
            if found_methods:
                self.method_columns.append(col)
            else:
                self.info_columns.append(col)
        
        print(f"CSV Merger initialized:")
        print(f"  File: {csv_file}")
        print(f"  Software rows: {len(self.df)}")
        print(f"  Info columns: {len(self.info_columns)}")
        print(f"  Method columns: {len(self.method_columns)}")
        
        # Create normalized name mappings
        self.software_name_map = self._create_software_name_map()
        self.method_name_map = self._create_method_name_map()
    
    
    def _normalize_name(self, name: str) -> str:
        """Normalize a name for matching (lowercase, remove special chars, extra spaces)"""
        if pd.isna(name):
            return ""
        
        name = str(name).lower()
        # Remove special characters but keep spaces and hyphens
        name = ''.join(c if c.isalnum() or c in ' -' else ' ' for c in name)
        # Collapse multiple spaces
        name = ' '.join(name.split())
        return name
    
    
    def _create_software_name_map(self) -> Dict[str, str]:
        """Create mapping from normalized software names to original CSV names"""
        name_map = {}
        
        for idx, row in self.df.iterrows():
            original_name = row[self.software_name_column]
            if pd.notna(original_name):
                normalized = self._normalize_name(original_name)
                name_map[normalized] = original_name
        
        return name_map
    
    
    def _create_method_name_map(self) -> Dict[str, str]:
        """Create mapping from normalized method names to CSV column names"""
        name_map = {}
        
        for method_col in self.method_columns:
            normalized = self._normalize_name(method_col)
            name_map[normalized] = method_col
        
        return name_map
    
    
    def find_matching_software(self, software_name: str, threshold: float = 0.8) -> Tuple[str, float]:
        """
        Find matching software in CSV using fuzzy matching
        
        Args:
            software_name: Software name from LLM results
            threshold: Minimum similarity score (0-1)
        
        Returns:
            (matched_csv_name, similarity_score) or (None, 0) if no match
        """
        normalized_query = self._normalize_name(software_name)
        
        # Try exact match first
        if normalized_query in self.software_name_map:
            return self.software_name_map[normalized_query], 1.0
        
        # Try fuzzy matching
        matches = get_close_matches(
            normalized_query, 
            self.software_name_map.keys(), 
            n=1, 
            cutoff=threshold
        )
        
        if matches:
            matched_normalized = matches[0]
            matched_original = self.software_name_map[matched_normalized]
            
            # Calculate similarity score
            from difflib import SequenceMatcher
            similarity = SequenceMatcher(None, normalized_query, matched_normalized).ratio()
            
            return matched_original, similarity
        
        return None, 0.0
    
    
    def find_matching_method(self, method_name: str, threshold: float = 0.7) -> Tuple[str, float]:
        """
        Find matching method column in CSV using fuzzy matching
        
        Args:
            method_name: Method name from LLM results
            threshold: Minimum similarity score (0-1)
        
        Returns:
            (matched_column_name, similarity_score) or (None, 0) if no match
        """
        normalized_query = self._normalize_name(method_name)
        
        # Try exact match first
        if normalized_query in self.method_name_map:
            return self.method_name_map[normalized_query], 1.0
        
        # Try fuzzy matching
        matches = get_close_matches(
            normalized_query, 
            self.method_name_map.keys(), 
            n=1, 
            cutoff=threshold
        )
        
        if matches:
            matched_normalized = matches[0]
            matched_column = self.method_name_map[matched_normalized]
            
            from difflib import SequenceMatcher
            similarity = SequenceMatcher(None, normalized_query, matched_normalized).ratio()
            
            return matched_column, similarity
        
        return None, 0.0
    
    
    def merge_llm_results(self,
                         llm_results_file: str,
                         output_file: str,
                         min_confidence: float = 0.5,
                         software_match_threshold: float = 0.8,
                         method_match_threshold: float = 0.7,
                         overwrite_existing: bool = True,
                         create_mapping_report: bool = True) -> pd.DataFrame:
        """
        Merge LLM assessment results into CSV
        
        Args:
            llm_results_file: Path to merged LLM results JSON
            output_file: Path for output CSV
            min_confidence: Minimum confidence to use LLM score
            software_match_threshold: Minimum similarity for software matching
            method_match_threshold: Minimum similarity for method matching
            overwrite_existing: If True, overwrite existing scores; if False, only fill empty
            create_mapping_report: Create detailed mapping report
        
        Returns:
            Updated DataFrame
        """
        print(f"\n{'='*70}")
        print(f"MERGING LLM RESULTS INTO CSV")
        print(f"{'='*70}")
        
        # Load LLM results
        with open(llm_results_file, 'r') as f:
            llm_results = json.load(f)
        
        print(f"Loaded {len(llm_results)} LLM assessments")
        
        # Create working copy of dataframe
        df_merged = self.df.copy()
        
        # Track statistics
        stats = {
            'total_llm_results': len(llm_results),
            'software_matched': 0,
            'software_not_matched': 0,
            'method_matched': 0,
            'method_not_matched': 0,
            'scores_updated': 0,
            'scores_skipped_low_confidence': 0,
            'scores_skipped_existing': 0
        }
        
        mapping_report = []
        
        # Process each LLM result
        for result in llm_results:
            software = result['software']
            method = result['method']
            rank = result['final_rank']
            confidence = result['confidence']
            
            # Skip low confidence results
            if confidence < min_confidence:
                stats['scores_skipped_low_confidence'] += 1
                mapping_report.append({
                    'software': software,
                    'method': method,
                    'status': 'skipped_low_confidence',
                    'confidence': confidence,
                    'rank': rank
                })
                continue
            
            # Find matching software
            matched_software, sw_similarity = self.find_matching_software(
                software, software_match_threshold
            )
            
            if matched_software is None:
                stats['software_not_matched'] += 1
                mapping_report.append({
                    'software': software,
                    'method': method,
                    'status': 'software_not_matched',
                    'confidence': confidence,
                    'rank': rank
                })
                continue
            
            stats['software_matched'] += 1
            
            # Find matching method
            matched_method, method_similarity = self.find_matching_method(
                method, method_match_threshold
            )
            
            if matched_method is None:
                stats['method_not_matched'] += 1
                mapping_report.append({
                    'software': software,
                    'method': method,
                    'status': 'method_not_matched',
                    'matched_software': matched_software,
                    'sw_similarity': sw_similarity,
                    'confidence': confidence,
                    'rank': rank
                })
                continue
            
            stats['method_matched'] += 1
            
            # Find row index for this software
            row_idx = self.df[self.df[self.software_name_column] == matched_software].index[0]
            
            # Check if we should update
            current_value = df_merged.at[row_idx, matched_method]
            
            if not overwrite_existing and pd.notna(current_value) and current_value != '':
                stats['scores_skipped_existing'] += 1
                mapping_report.append({
                    'software': software,
                    'method': method,
                    'status': 'skipped_existing_value',
                    'matched_software': matched_software,
                    'matched_method': matched_method,
                    'sw_similarity': sw_similarity,
                    'method_similarity': method_similarity,
                    'current_value': current_value,
                    'new_rank': rank,
                    'confidence': confidence
                })
                continue
            
            # Update the score
            df_merged.at[row_idx, matched_method] = rank
            stats['scores_updated'] += 1
            
            mapping_report.append({
                'software': software,
                'method': method,
                'status': 'updated',
                'matched_software': matched_software,
                'matched_method': matched_method,
                'sw_similarity': sw_similarity,
                'method_similarity': method_similarity,
                'previous_value': current_value,
                'new_rank': rank,
                'confidence': confidence
            })
        
        # Print statistics
        print(f"\n{'-'*70}")
        print(f"MERGE STATISTICS")
        print(f"{'-'*70}")
        print(f"Total LLM results: {stats['total_llm_results']}")
        print(f"\nSoftware Matching:")
        print(f"  Matched: {stats['software_matched']}")
        print(f"  Not matched: {stats['software_not_matched']}")
        print(f"\nMethod Matching:")
        print(f"  Matched: {stats['method_matched']}")
        print(f"  Not matched: {stats['method_not_matched']}")
        print(f"\nScore Updates:")
        print(f"  Updated: {stats['scores_updated']}")
        print(f"  Skipped (low confidence): {stats['scores_skipped_low_confidence']}")
        print(f"  Skipped (existing value): {stats['scores_skipped_existing']}")
        
        # Save merged CSV
        df_merged.to_csv(output_file, sep=self.delimiter, index=False, encoding='utf-8-sig')
        print(f"\n✓ Merged CSV saved to: {output_file}")
        
        # Save mapping report if requested
        if create_mapping_report:
            report_file = Path(output_file).parent / f"{Path(output_file).stem}_mapping_report.json"
            with open(report_file, 'w') as f:
                json.dump({
                    'statistics': stats,
                    'mappings': mapping_report
                }, f, indent=2)
            print(f"✓ Mapping report saved to: {report_file}")
        
        print(f"{'='*70}\n")
        
        return df_merged
    
    
    def preview_unmatched(self, llm_results_file: str, 
                         software_threshold: float = 0.8,
                         method_threshold: float = 0.7):
        """
        Preview which software and methods from LLM results won't match
        
        Args:
            llm_results_file: Path to LLM results JSON
            software_threshold: Minimum similarity for software
            method_threshold: Minimum similarity for method
        """
        print(f"\n{'='*70}")
        print(f"PREVIEW: UNMATCHED ITEMS")
        print(f"{'='*70}")
        
        with open(llm_results_file, 'r') as f:
            llm_results = json.load(f)
        
        unmatched_software = set()
        unmatched_methods = set()
        
        for result in llm_results:
            software = result['software']
            method = result['method']
            
            matched_sw, sw_sim = self.find_matching_software(software, software_threshold)
            if matched_sw is None:
                unmatched_software.add(software)
            
            matched_method, method_sim = self.find_matching_method(method, method_threshold)
            if matched_method is None:
                unmatched_methods.add(method)
        
        if unmatched_software:
            print(f"\nUnmatched Software ({len(unmatched_software)}):")
            for sw in sorted(unmatched_software):
                # Find closest match for reference
                matched, sim = self.find_matching_software(sw, 0.0)
                print(f"  '{sw}'")
                if matched:
                    print(f"    Closest: '{matched}' (similarity: {sim:.2f})")
        
        if unmatched_methods:
            print(f"\nUnmatched Methods ({len(unmatched_methods)}):")
            for method in sorted(unmatched_methods):
                matched, sim = self.find_matching_method(method, 0.0)
                print(f"  '{method}'")
                if matched:
                    print(f"    Closest: '{matched}' (similarity: {sim:.2f})")
        
        if not unmatched_software and not unmatched_methods:
            print("\n✓ All items will match!")
        
        print(f"{'='*70}\n")
    
    
    def create_name_mapping_file(self, llm_results_file: str, output_file: str = "name_mappings.json"):
        """
        Create a manual name mapping file for unmatched items
        
        Args:
            llm_results_file: Path to LLM results JSON
            output_file: Path for mapping file
        """
        with open(llm_results_file, 'r') as f:
            llm_results = json.load(f)
        
        # Collect all unique software and methods
        software_set = set(r['software'] for r in llm_results)
        method_set = set(r['method'] for r in llm_results)
        
        mappings = {
            'software_mappings': {},
            'method_mappings': {}
        }
        
        # Create suggested mappings for software
        for sw in sorted(software_set):
            matched, sim = self.find_matching_software(sw, 0.0)
            mappings['software_mappings'][sw] = {
                'suggested_match': matched,
                'similarity': round(sim, 2),
                'manual_override': None  # User can fill this in
            }
        
        # Create suggested mappings for methods
        for method in sorted(method_set):
            matched, sim = self.find_matching_method(method, 0.0)
            mappings['method_mappings'][method] = {
                'suggested_match': matched,
                'similarity': round(sim, 2),
                'manual_override': None  # User can fill this in
            }
        
        with open(output_file, 'w') as f:
            json.dump(mappings, f, indent=2)
        
        print(f"✓ Name mapping template saved to: {output_file}")
        print(f"  Edit 'manual_override' fields for custom mappings")


In [201]:
software_list_all = [
    'Power Factory Digisilent','DINIS','ERACS','IPSA','Netbas','Trimble NIS','GAMS'
]


method_list_all = [
'power flow analysis','security-constrained optimal power flow','security constrained unit commitment']

## Batch strategies

In [186]:
# Example usage script
def run_scheduled_assessment_workflow():
    """
    Complete workflow: Schedule -> Execute -> Merge -> Validate
    """
    
    # Step 1: Create schedule
    print("STEP 1: Creating Assessment Schedule")
    print("="*70)
    
    scheduler = AssessmentScheduler(
        software_list=software_list_all[:10],  # Use subset for example
        method_list=method_list_all[:20],
        min_assessments_per_pair=2,  # Each pair assessed twice
        batch_size=6,
        output_dir="scheduled_assessments"
    )
    
    # Define LLM combinations (using cheap models)
    llm_combinations = [
        {
            'openai': True, 
            'claude': False, 
            'google': True,
            'openai_model': 'gpt-4o-mini',  # Cheap
            'google_model': 'models/gemini-2.0-flash'  # Cheap
        },
        {
            'openai': False, 
            'claude': True, 
            'google': True,
            'claude_model': 'claude-3.5-haiku',  # Cheap
            'google_model': 'models/gemini-2.0-flash'
        },
        # Optional: Add batch API runs for overnight processing
        {
            'openai': True,
            'claude': False,
            'google': False,
            'openai_model': 'gpt-4o-mini-batch',  # 50% cheaper, overnight
        }
    ]
    
    schedule = scheduler.create_schedule(
        llm_combinations=llm_combinations,
        overlap_strategy="stratified"
    )
    
    scheduler.print_schedule_summary(schedule)
    
    # Step 2: Execute runs (you would run this in a loop or script)
    print("\nSTEP 2: Execute Scheduled Runs")
    print("="*70)
    print("Now run each scheduled assessment:")
    print("  for run in schedule['runs']:")
    print("      execute_run(run)")
    print("\nSee execute_scheduled_run() function below for implementation")
    
    # Step 3: Merge and validate (after all runs complete)
    print("\nSTEP 3: Merge and Validate Results")
    print("="*70)
    
    merger = ResultMergerValidator(
        schedule_file="scheduled_assessments/assessment_schedule.pkl",
        results_dir="scheduled_assessments"
    )
    
    # Check completion
    completion_status = merger.check_completion()
    
    # Merge if enough runs completed
    if completion_status['completed_runs'] > 0:
        merged_results, validation_report = merger.merge_all_results(
            output_file="scheduled_assessments/final_merged_results.json",
            validate_coverage=True
        )
        
        print(f"\n✓ Workflow complete!")
        print(f"  Merged {len(merged_results)} unique assessments")
        print(f"  Results saved to: scheduled_assessments/final_merged_results.json")


def execute_scheduled_run(run_config: Dict, assessor: SoftwareMethodAssessor, output_dir: str = "."):
    """
    Execute a single scheduled run
    
    Args:
        run_config: Run configuration from schedule
        assessor: SoftwareMethodAssessor instance
        output_dir: Directory to save results (should match scheduler output_dir)
    """
    print(f"\n{'='*70}")
    print(f"EXECUTING RUN {run_config['run_id']}")
    print(f"{'='*70}")
    
    llm_config = run_config['llm_config']
    all_pairs = []
    for batch in run_config['batches']:
        all_pairs.extend(batch)
    
    # Extract software and method lists
    software_list = list(set([pair[0] for pair in all_pairs]))
    method_list = list(set([pair[1] for pair in all_pairs]))
    
    # Run batched assessment
    results = assessor.assess_multiple_batched(
        software_list=software_list,
        method_list=method_list,
        batch_strategy="fixed_size",
        batch_size=len(run_config['batches'][0]) if run_config['batches'] else 5,
        overlap_percentage=0.0,  # No overlap within single run
        use_openai=llm_config.get('openai', False),
        use_claude=llm_config.get('claude', False),
        use_google=llm_config.get('google', False),
        openai_model=llm_config.get('openai_model'),
        claude_model=llm_config.get('claude_model'),
        google_model=llm_config.get('google_model')
    )
    
    # Save results to correct directory - FIX HERE
    from pathlib import Path
    output_path = Path(output_dir) / run_config['output_file']
    output_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure directory exists
    
    assessor.export_results(results, str(output_path))
    
    print(f"\n✓ Run {run_config['run_id']} complete")
    print(f"  Results saved to: {output_path}")
    
    return results

In [None]:
# ============================================================================
# COMPLETE EXAMPLE: Cheap overnight assessment of all your data
# ============================================================================

# Initialize
assessor = SoftwareMethodAssessor(use_config=True)

# Create schedule for ALL your software and methods
scheduler = AssessmentScheduler(
    software_list=software_list_all,  # All 35 software
    method_list=method_list_all,  # All 157 methods
    min_assessments_per_pair=2,  # Each pair assessed by 2 different LLM combinations
    batch_size=80,  # Optimal from research
    output_dir="large_scale_assessment"
)

# Define cheap LLM combinations
llm_combos = [
    # Run 1: OpenAI mini + Google flash
    {
        'openai': True, 'claude': False, 'google': True,
        'openai_model': 'gpt-4o-mini',  # $0.075/$0.30
        'google_model': 'models/gemini-2.0-flash'  # $0.075/$0.30
    },
    # Run 2: Claude Haiku + Google flash  
    {
        'openai': False, 'claude': True, 'google': True,
        'claude_model': 'claude-3-5-haiku-20241022',  # $0.25/$1.25
        'google_model': 'models/gemini-2.0-flash'
    },
    # Run 3: Claude Haiku + Google flash  
    #{
    #    'openai': True, 'claude': True, 'google': False,
    #    'openai_model': 'gpt-4o-mini',
    #    'claude_model': 'claude-3-5-haiku-20241022'  # $0.25/$1.25
    #}
]

# Create schedule
schedule = scheduler.create_schedule(
    llm_combinations=llm_combos,
    overlap_strategy="stratified"
)

scheduler.print_schedule_summary(schedule)

# Execute all runs (can be done overnight or in batches)
for run in schedule['runs']:
    execute_scheduled_run(run, assessor,output_dir="software_analysis_output")
    
    # Optional: Save progress after each run
    assessor.credit_tracker.print_summary()

# After all runs complete: Merge and validate
merger = ResultMergerValidator(
    schedule_file="large_scale_assessment/assessment_schedule.pkl",
    results_dir="software_analysis_output"
)

completion_status = merger.check_completion()
merged_results, validation_report = merger.merge_all_results()

print("\n✓ Complete assessment finished!")

Scheduler initialized:
  Software: 35
  Methods: 189
  Total pairs: 6615
  Minimum assessments per pair: 2
  Required total assessments: 13230

Creating schedule with 3 LLM combinations...
  Scheduled Run 1: 60 pairs, 10 batches
    Coverage: min=0, avg=0.01, max=1
    Pairs at minimum (2): 0/6615
  Scheduled Run 2: 60 pairs, 10 batches
    Coverage: min=0, avg=0.02, max=2
    Pairs at minimum (2): 14/6615
  Scheduled Run 3: 60 pairs, 10 batches
    Coverage: min=0, avg=0.03, max=2
    Pairs at minimum (2): 41/6615
  Scheduled Run 4: 60 pairs, 10 batches
    Coverage: min=0, avg=0.04, max=2
    Pairs at minimum (2): 59/6615
  Scheduled Run 5: 60 pairs, 10 batches
    Coverage: min=0, avg=0.05, max=2
    Pairs at minimum (2): 89/6615
  Scheduled Run 6: 60 pairs, 10 batches
    Coverage: min=0, avg=0.05, max=2
    Pairs at minimum (2): 114/6615
  Scheduled Run 7: 60 pairs, 10 batches
    Coverage: min=0, avg=0.06, max=2
    Pairs at minimum (2): 144/6615
  Scheduled Run 8: 60 pairs, 10 b

KeyboardInterrupt: 

In [202]:
results_all = assessor.assess_multiple_batched(
    software_list=software_list_all,
    method_list=method_list_all,
    batch_strategy="by_software",  # ← KEY: Group by software
    # No batch_size needed - automatically does all methods per software
    overlap_percentage=0.0,
    use_openai=True,
    use_google=True,
    use_claude=False,
    openai_model='gpt-4o-mini',
    google_model='models/gemini-2.0-flash'
)


BATCH ASSESSMENT MODE
Total items: 21
Strategy: by_software
Batch size: varies
Overlap: 0%
LLMs: OpenAI=True, Claude=False, Google=True

Created 7 batches
  Batch 1: 3 items
  Batch 2: 3 items
  Batch 3: 3 items
  Batch 4: 3 items
  Batch 5: 3 items
  Batch 6: 3 items
  Batch 7: 3 items

----------------------------------------------------------------------
Processing batches...
----------------------------------------------------------------------

[Batch 1/7] 3 items
  Assessing with OpenAI...
  Assessing with Google...
  Running cost: $1.0725 (7,931,827 tokens)

[Batch 2/7] 3 items
  Assessing with OpenAI...
  Assessing with Google...
  Running cost: $1.0728 (7,934,243 tokens)

[Batch 3/7] 3 items
  Assessing with OpenAI...
  Assessing with Google...
  Running cost: $1.0733 (7,936,996 tokens)

[Batch 4/7] 3 items
  Assessing with OpenAI...
  Assessing with Google...
  Running cost: $1.0738 (7,939,732 tokens)

[Batch 5/7] 3 items
  Assessing with OpenAI...
  Assessing with Google...

In [206]:
assessor.export_results(results_all,"software_analysis_output/results_all.json")


Results exported to software_analysis_output/results_all.json


In [None]:
# ============================================================================
# USAGE EXAMPLE
# ============================================================================

# 1. Initialize the merger with your CSV
merger = ResultCSVMerger(
    csv_file="input_data\Software_method_implementation_score.csv",
    delimiter=';',
    software_name_column='Name',
    method_start_column='Numerical-solvers'
)

# 2. Preview what won't match (optional but recommended)
merger.preview_unmatched(
    llm_results_file="software_analysis_output/merged_complete_results.json",
    software_threshold=0.8,
    method_threshold=0.7
)

# 3. Create a name mapping file for manual adjustment (if needed)
merger.create_name_mapping_file(
    llm_results_file="software_analysis_output/merged_complete_results.json",
    output_file="software_analysis_output/name_mappings.json"
)

# 4. Merge the results
df_updated = merger.merge_llm_results(
    llm_results_file="software_analysis_output/merged_complete_results.json",
    output_file="software_analysis_output/software_methods_updated.csv",
    min_confidence=0.6,  # Only use results with >=60% confidence
    software_match_threshold=0.8,
    method_match_threshold=0.7,
    overwrite_existing=True,  # Overwrite existing scores
    create_mapping_report=True  # Create detailed report
)

# 5. Review the mapping report
import json
with open("software_methods_updated_mapping_report.json", 'r') as f:
    report = json.load(f)
    
print(f"\nUpdated {report['statistics']['scores_updated']} scores")
print(f"Check mapping report for details")


CSV Merger initialized:
  File: input_data\Software_method_implementation_score.csv
  Software rows: 39
  Info columns: 21
  Method columns: 190

PREVIEW: UNMATCHED ITEMS

✓ All items will match!

✓ Name mapping template saved to: software_analysis_output/name_mappings.json
  Edit 'manual_override' fields for custom mappings

MERGING LLM RESULTS INTO CSV
Loaded 12 LLM assessments

----------------------------------------------------------------------
MERGE STATISTICS
----------------------------------------------------------------------
Total LLM results: 12

Software Matching:
  Matched: 9
  Not matched: 0

Method Matching:
  Matched: 9
  Not matched: 0

Score Updates:
  Updated: 9
  Skipped (low confidence): 3
  Skipped (existing value): 0

✓ Merged CSV saved to: software_analysis_output/software_methods_updated.csv
✓ Mapping report saved to: software_analysis_output\software_methods_updated_mapping_report.json



FileNotFoundError: [Errno 2] No such file or directory: 'software_methods_updated_mapping_report.json'