# Software mapping

In [21]:
# =============================================================================
# IMPORTS
# =============================================================================
import openai
import anthropic
import json
import time
import configparser
import tiktoken
from typing import List, Dict, Tuple, Optional, Callable
from collections import Counter
import numpy as np
from dataclasses import dataclass, asdict
import google.generativeai as genai
from itertools import combinations
import random
from datetime import datetime
from pathlib import Path
import pickle
import traceback
import pandas as pd
from difflib import get_close_matches, SequenceMatcher

In [22]:
# %%
# =============================================================================
# CONFIGURATION & INITIALIZATION
# =============================================================================

def initialize_openai():
    """Initialize OpenAI client from config file"""
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE_adv', 'gpt-4o-mini')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

def initialize_anthropic():
    """Initialize Anthropic client from config file"""
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('ANTHROPIC_API_KEY')
    client = anthropic.Anthropic(api_key=api_key) if api_key else None
    return client

def initialize_google():
    """Initialize Google Gemini client from config file"""
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('GOOGLE_API_KEY')
    if api_key:
        genai.configure(api_key=api_key)
        return True
    return False

print("✓ Initialization functions defined")


✓ Initialization functions defined


In [23]:
# %%
# =============================================================================
# TOKEN COUNTING UTILITIES
# =============================================================================

def num_tokens_from_string(string: str, model_name: str) -> int:
    """Get token count with fallback for unsupported models"""
    try:
        encoding = tiktoken.encoding_for_model(model_name)
        return len(encoding.encode(string))
    except KeyError:
        if model_name.startswith('gpt-5'):
            encoding = tiktoken.get_encoding("o200k_base")
            return len(encoding.encode(string))
        elif model_name.startswith('gpt-4'):
            encoding = tiktoken.get_encoding("cl100k_base")
            return len(encoding.encode(string))
        elif model_name.startswith('claude'):
            return int(len(string) / 3.5)
        elif model_name.startswith('models/gemini') or model_name.startswith('gemini'):
            return int(len(string) / 4)
        else:
            return len(string) // 4

def count_tokens_in_messages(messages: List[Dict], model: str) -> int:
    """Count tokens in a list of messages"""
    total_tokens = 0
    for message in messages:
        if isinstance(message.get('content'), str):
            total_tokens += num_tokens_from_string(message['content'], model)
        total_tokens += 4  # Message overhead
    total_tokens += 3  # Completion overhead
    return total_tokens

print("✓ Token counting utilities defined")


✓ Token counting utilities defined


In [24]:
# %%
# =============================================================================
# CREDIT TRACKER
# =============================================================================

class CreditTracker:
    """Track API usage and costs across all models"""
    
    PRICING = {
        # OpenAI
        'gpt-4o': {'input': 1.25, 'output': 5.00},
        'gpt-4o-mini': {'input': 0.075, 'output': 0.30},
        
        # Claude
        'claude-3-haiku-20240307': {'input': 0.25, 'output': 1.25},
        'claude-3-5-haiku-20241022': {'input': 0.80, 'output': 4.00},
        'claude-3-5-sonnet-20241022': {'input': 3.00, 'output': 15.00},
        'claude-sonnet-4-20250514': {'input': 3.00, 'output': 15.00},
        
        # Google
        'models/gemini-2.5-flash': {'input': 0.075, 'output': 0.30},
        'models/gemini-2.0-flash': {'input': 0.075, 'output': 0.30},
        'models/gemini-2.0-flash-001': {'input': 0.075, 'output': 0.30},
        'gemini-2.0-flash': {'input': 0.075, 'output': 0.30},
    }

    def __init__(self):
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.total_cached_tokens = 0
        self.total_cost = 0
        self.model_usage = {}
        self.call_count = 0

    def update(self, model: str, input_tokens: int, output_tokens: int, cached_tokens: int = 0):
        """Update usage statistics"""
        self.total_input_tokens += input_tokens
        self.total_output_tokens += output_tokens
        self.total_cached_tokens += cached_tokens
        self.call_count += 1

        pricing = self.PRICING.get(model, {'input': 0.00015, 'output': 0.0006})
        
        input_cost = (input_tokens / 1_000_000) * pricing['input']
        output_cost = (output_tokens / 1_000_000) * pricing['output']
        call_cost = input_cost + output_cost
        self.total_cost += call_cost

        if model not in self.model_usage:
            self.model_usage[model] = {
                'calls': 0, 'input_tokens': 0, 'output_tokens': 0,
                'cached_tokens': 0, 'cost': 0
            }

        self.model_usage[model]['calls'] += 1
        self.model_usage[model]['input_tokens'] += input_tokens
        self.model_usage[model]['output_tokens'] += output_tokens
        self.model_usage[model]['cached_tokens'] += cached_tokens
        self.model_usage[model]['cost'] += call_cost

    def get_stats(self):
        """Get current statistics"""
        return {
            "total_calls": self.call_count,
            "total_input_tokens": self.total_input_tokens,
            "total_output_tokens": self.total_output_tokens,
            "total_tokens": self.total_input_tokens + self.total_output_tokens,
            "total_cost": round(self.total_cost, 4),
            "average_cost_per_call": round(self.total_cost / max(self.call_count, 1), 4),
            "model_breakdown": {
                model: {
                    'calls': stats['calls'],
                    'total_tokens': stats['input_tokens'] + stats['output_tokens'],
                    'cost': round(stats['cost'], 4)
                }
                for model, stats in self.model_usage.items()
            }
        }

    def print_summary(self):
        """Print formatted summary"""
        stats = self.get_stats()
        print("\n" + "="*60)
        print("API USAGE SUMMARY")
        print("="*60)
        print(f"Total API Calls: {stats['total_calls']}")
        print(f"Total Tokens: {stats['total_tokens']:,}")
        print(f"  - Input: {stats['total_input_tokens']:,}")
        print(f"  - Output: {stats['total_output_tokens']:,}")
        if self.total_cached_tokens > 0:
            print(f"  - Cached: {self.total_cached_tokens:,}")
        print(f"\nTotal Cost: ${stats['total_cost']:.4f}")
        print(f"Average Cost per Call: ${stats['average_cost_per_call']:.4f}")

        if self.model_usage:
            print("\nBreakdown by Model:")
            print("-" * 60)
            for model, breakdown in stats['model_breakdown'].items():
                print(f"  {model}:")
                print(f"    Calls: {breakdown['calls']}")
                print(f"    Tokens: {breakdown['total_tokens']:,}")
                print(f"    Cost: ${breakdown['cost']:.4f}")
        print("="*60 + "\n")

print("✓ CreditTracker class defined")


✓ CreditTracker class defined


## The method assessor

In [25]:
# %%
# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class AssessmentResult:
    """Single LLM assessment result"""
    software: str
    method: str
    rank: int
    reasoning: str
    sources: List[str]
    llm_provider: str
    input_tokens: int = 0
    output_tokens: int = 0

@dataclass
class ConsensusResult:
    """Consensus across multiple LLMs"""
    software: str
    method: str
    final_rank: int
    confidence: float
    individual_ranks: Dict[str, int]
    individual_reasoning: Dict[str, str]
    individual_sources: Dict[str, List[str]]
    agreement_level: str
    total_tokens: int = 0
    total_cost: float = 0.0

print("✓ Data structures defined")


✓ Data structures defined


In [26]:
# %%
# =============================================================================
# MAIN ASSESSOR CLASS - PART 1: Core Functions
# =============================================================================

class SoftwareMethodAssessor:
    """Main class for software-method assessment using multiple LLMs"""
    
    def __init__(self, use_config: bool = True, timeout: int = 180, max_retries: int = 3):
        """Initialize assessor with API clients"""
        if use_config:
            self.openai_client, self.default_model = initialize_openai()
            self.anthropic_client = initialize_anthropic()
            self.google_enabled = initialize_google()
        else:
            self.openai_client = None
            self.anthropic_client = None
            self.google_enabled = False
            self.default_model = "gpt-4o-mini"
        
        self.credit_tracker = CreditTracker()
        self.timeout = timeout
        self.max_retries = max_retries
        
        # System prompt for assessments
        self.system_prompt = """You are a technical software assessment expert specialized in power systems analysis software.

Use this ranking scale:
0 = No support (method cannot be implemented at all)
1 = Limited possibility for implementation or extension (requires significant workarounds)
2 = Indirectly supported through APIs or extensions (requires external tools/plugins)
3 = Directly implemented (native feature in the software)

CRITICAL: You MUST search for and provide actual references. Your assessment must be based on real, verifiable sources.

For each assessment:
1. Search for scientific papers demonstrating implementation (IEEE Xplore, ScienceDirect, arXiv, Google Scholar)
2. Find official documentation from the software vendor or project website
3. Look for GitHub repositories with code examples or open-source implementations
4. Check API documentation or extension/plugin capabilities
5. Review user forums, technical blogs, Stack Overflow, or case studies

Return your response in VALID JSON format with this exact structure:
{
    "rank": <0-3>,
    "reasoning": "<detailed explanation citing specific sources by number, e.g., 'According to [1], PSS/E supports...'>",
    "sources": [
        "https://example.com/documentation - Official PSS/E Manual on OPF",
        "https://doi.org/10.1109/... - Paper title by Author et al.",
        "https://github.com/org/repo/file.py - Implementation example"
    ]
}

Each source must include both the URL and a brief description separated by ' - '.
Minimum 2 sources required for ranks 2-3, minimum 1 source for rank 1."""

    def calculate_confidence(self, ranks: List[int]) -> Tuple[float, str]:
        """Calculate confidence score from multiple assessments"""
        if not ranks:
            return 0.0, "no_data"
        
        rank_counts = Counter(ranks)
        most_common_count = rank_counts.most_common(1)[0][1]
        total_ranks = len(ranks)
        confidence = most_common_count / total_ranks
        
        if total_ranks == 1:
            agreement_level = "single_assessment"
        elif confidence == 1.0:
            agreement_level = "perfect_agreement"
        elif confidence >= 0.75:
            agreement_level = "strong_agreement"
        elif confidence >= 0.5:
            agreement_level = "moderate_agreement"
        else:
            agreement_level = "weak_agreement"
        
        return confidence, agreement_level

    def export_results(self, results: List[ConsensusResult], filename: str):
        """Export results to JSON file"""
        output_data = [asdict(result) for result in results]
        with open(filename, 'w') as f:
            json.dump(output_data, f, indent=2)
        print(f"\nResults exported to {filename}")

print("✓ SoftwareMethodAssessor class initialized (Part 1)")


✓ SoftwareMethodAssessor class initialized (Part 1)


In [27]:
# %%
# =============================================================================
# MAIN ASSESSOR CLASS - PART 2: Batch Assessment Methods
# =============================================================================

def create_batch_assessment_prompt(self, batch_items: List[Tuple[str, str]], batch_size: int = None) -> str:
    """Create a structured prompt for batch assessment"""
    batch_size = batch_size or len(batch_items)
    
    items_text = ""
    for idx, (software, method) in enumerate(batch_items, 1):
        items_text += f"\n{idx}. Software: {software}\n   Method: {method}\n"
    
    prompt = f"""You must assess {len(batch_items)} software-method combinations independently.

CRITICAL INSTRUCTIONS:
- Treat each pair as completely independent
- Do NOT let one assessment influence another
- Provide the SAME quality of research and reasoning for ALL items
- Each assessment must have its own sources

Items to assess:{items_text}

For EACH item above, perform independent research and provide sources with URLs.

Return a JSON array with exactly {len(batch_items)} objects:
[
  {{
    "software": "<software name>",
    "method": "<method name>",
    "rank": <0-3>,
    "reasoning": "<detailed explanation citing sources [1], [2], etc.>",
    "sources": [
      "https://... - Description",
      "https://... - Description"
    ]
  }},
  ...
]

IMPORTANT: Return ONLY the JSON array, no other text."""
    
    return prompt

# Add to SoftwareMethodAssessor class
SoftwareMethodAssessor.create_batch_assessment_prompt = create_batch_assessment_prompt

print("✓ Batch prompt creation added")


✓ Batch prompt creation added


In [28]:
# %%
# =============================================================================
# OPENAI ASSESSMENT METHOD
# =============================================================================

def assess_batch_with_openai(self, batch_items: List[Tuple[str, str]], 
                             model: str = None, debug: bool = False) -> List[AssessmentResult]:
    """Assess a batch of items with OpenAI"""
    if model is None:
        model = self.default_model
    
    try:
        prompt = self.create_batch_assessment_prompt(batch_items)
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": prompt}
        ]
        
        if debug:
            print(f"  DEBUG: Batch size: {len(batch_items)}")
        
        response = self.openai_client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.3,
            max_tokens=4096,
            timeout=self.timeout,
            response_format={"type": "json_object"}
        )
        
        usage = response.usage
        cached_tokens = 0
        if hasattr(usage, 'prompt_tokens_details') and usage.prompt_tokens_details:
            cached_tokens = getattr(usage.prompt_tokens_details, 'cached_tokens', 0)
        
        self.credit_tracker.update(
            model=model,
            input_tokens=usage.prompt_tokens,
            output_tokens=usage.completion_tokens,
            cached_tokens=cached_tokens
        )
        
        content = response.choices[0].message.content
        
        # Parse JSON
        try:
            parsed = json.loads(content)
            if isinstance(parsed, dict):
                if 'assessments' in parsed:
                    results_data = parsed['assessments']
                elif 'results' in parsed:
                    results_data = parsed['results']
                else:
                    results_data = next(v for v in parsed.values() if isinstance(v, list))
            else:
                results_data = parsed
        except Exception as e:
            print(f"  ERROR parsing batch response: {e}")
            return []
        
        # Convert to AssessmentResult objects
        assessment_results = []
        for item_data in results_data:
            rank = int(item_data.get("rank", 0))
            sources = item_data.get("sources", [])
            reasoning = item_data.get("reasoning", "")
            
            if rank > 0 and len(sources) == 0:
                rank = 0
                reasoning += " [Rank lowered to 0: no sources]"
            
            assessment_results.append(AssessmentResult(
                software=item_data.get("software", ""),
                method=item_data.get("method", ""),
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"openai_{model}",
                input_tokens=usage.prompt_tokens // len(batch_items),
                output_tokens=usage.completion_tokens // len(batch_items)
            ))
        
        return assessment_results
        
    except Exception as e:
        print(f"  ERROR in OpenAI batch assessment: {e}")
        return []

# Add to class
SoftwareMethodAssessor.assess_batch_with_openai = assess_batch_with_openai

print("✓ OpenAI assessment method added")


✓ OpenAI assessment method added


In [29]:
# %%
# =============================================================================
# CLAUDE ASSESSMENT METHOD
# =============================================================================

def assess_batch_with_claude(self, batch_items: List[Tuple[str, str]], 
                             model: str = "claude-3-5-haiku-20241022", 
                             debug: bool = False) -> List[AssessmentResult]:
    """Assess a batch of items with Claude"""
    if not self.anthropic_client:
        return []
    
    try:
        prompt = self.create_batch_assessment_prompt(batch_items)
        
        if debug:
            print(f"  DEBUG: Batch size: {len(batch_items)}")
        
        response = self.anthropic_client.messages.create(
            model=model,
            max_tokens=4096,
            temperature=0.3,
            timeout=self.timeout,
            system=self.system_prompt,
            messages=[{"role": "user", "content": prompt}]
        )
        
        self.credit_tracker.update(
            model=model,
            input_tokens=response.usage.input_tokens,
            output_tokens=response.usage.output_tokens
        )
        
        content = response.content[0].text
        
        # Clean markdown code blocks
        content = content.strip()
        if content.startswith('```'):
            lines = content.split('\n')
            start_idx = 0
            end_idx = len(lines)
            for i, line in enumerate(lines):
                if line.strip().startswith('```'):
                    if start_idx == 0:
                        start_idx = i + 1
                    else:
                        end_idx = i
                        break
            content = '\n'.join(lines[start_idx:end_idx])
        
        # Parse JSON
        try:
            parsed = json.loads(content)
            if isinstance(parsed, dict):
                if 'assessments' in parsed:
                    results_data = parsed['assessments']
                elif 'results' in parsed:
                    results_data = parsed['results']
                else:
                    results_data = next(v for v in parsed.values() if isinstance(v, list))
            else:
                results_data = parsed
        except Exception as e:
            print(f"  ERROR parsing Claude batch response: {e}")
            return []
        
        # Convert to AssessmentResult objects
        assessment_results = []
        for item_data in results_data:
            rank = int(item_data.get("rank", 0))
            sources = item_data.get("sources", [])
            reasoning = item_data.get("reasoning", "")
            
            if rank > 0 and len(sources) == 0:
                rank = 0
                reasoning += " [Rank lowered to 0: no sources]"
            
            assessment_results.append(AssessmentResult(
                software=item_data.get("software", ""),
                method=item_data.get("method", ""),
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"claude_{model}",
                input_tokens=response.usage.input_tokens // len(batch_items),
                output_tokens=response.usage.output_tokens // len(batch_items)
            ))
        
        return assessment_results
        
    except Exception as e:
        print(f"  ERROR in Claude batch assessment: {e}")
        return []

# Add to class
SoftwareMethodAssessor.assess_batch_with_claude = assess_batch_with_claude

print("✓ Claude assessment method added")


✓ Claude assessment method added


In [30]:
# %%
# =============================================================================
# GOOGLE ASSESSMENT METHOD
# =============================================================================

def assess_batch_with_google(self, batch_items: List[Tuple[str, str]], 
                             model: str = "models/gemini-2.0-flash", 
                             debug: bool = False) -> List[AssessmentResult]:
    """Assess a batch of items with Google Gemini"""
    if not self.google_enabled:
        return []
    
    try:
        prompt = self.create_batch_assessment_prompt(batch_items)
        
        if not model.startswith('models/'):
            model = f"models/{model}"
        
        gemini_model = genai.GenerativeModel(
            model_name=model,
            generation_config={
                "temperature": 0.3,
                "max_output_tokens": 4096,
            },
            system_instruction=self.system_prompt
        )
        
        response = gemini_model.generate_content(
            prompt,
            generation_config={
                "temperature": 0.3,
                "max_output_tokens": 4096,
            },
            request_options={'timeout': self.timeout}
        )
        
        if debug:
            print(f"  DEBUG: Successfully used model: {model}")
        
        # Extract token counts
        try:
            input_tokens = response.usage_metadata.prompt_token_count
            output_tokens = response.usage_metadata.candidates_token_count
        except AttributeError:
            input_tokens = int(len(prompt.split()) * 1.3)
            output_tokens = int(len(response.text.split()) * 1.3)
        
        self.credit_tracker.update(
            model=model,
            input_tokens=int(input_tokens),
            output_tokens=int(output_tokens)
        )
        
        content = response.text.strip()
        
        # Clean markdown
        if content.startswith('```'):
            lines = content.split('\n')
            start_idx = 0
            end_idx = len(lines)
            for i, line in enumerate(lines):
                if line.strip().startswith('```'):
                    if start_idx == 0:
                        start_idx = i + 1
                    else:
                        end_idx = i
                        break
            content = '\n'.join(lines[start_idx:end_idx])
        
        # Parse JSON
        try:
            parsed = json.loads(content)
            if isinstance(parsed, dict):
                if 'assessments' in parsed:
                    results_data = parsed['assessments']
                elif 'results' in parsed:
                    results_data = parsed['results']
                else:
                    results_data = next(v for v in parsed.values() if isinstance(v, list))
            else:
                results_data = parsed
        except Exception as e:
            print(f"  ERROR parsing Google batch response: {e}")
            return []
        
        # Convert to AssessmentResult objects
        assessment_results = []
        for item_data in results_data:
            rank = int(item_data.get("rank", 0))
            sources = item_data.get("sources", [])
            reasoning = item_data.get("reasoning", "")
            
            if rank > 0 and len(sources) == 0:
                rank = 0
                reasoning += " [Rank lowered to 0: no sources]"
            
            assessment_results.append(AssessmentResult(
                software=item_data.get("software", ""),
                method=item_data.get("method", ""),
                rank=rank,
                reasoning=reasoning,
                sources=sources,
                llm_provider=f"google_{model.replace('models/', '')}",
                input_tokens=int(input_tokens) // len(batch_items),
                output_tokens=int(output_tokens) // len(batch_items)
            ))
        
        return assessment_results
        
    except Exception as e:
        print(f"  ERROR in Google batch assessment: {e}")
        return []

# Add to class
SoftwareMethodAssessor.assess_batch_with_google = assess_batch_with_google

print("✓ Google assessment method added")


✓ Google assessment method added


In [37]:
# %%
# =============================================================================
# BATCH CREATION METHOD (CORRECTED)
# =============================================================================

def create_batches(self, software_list: List[str], method_list: List[str],
                  strategy: str = "by_software", batch_size: int = 50) -> List[List[Tuple[str, str]]]:
    """
    Create batches of (software, method) pairs
    
    Args:
        software_list: List of software names
        method_list: List of methods
        strategy: Batching strategy
        batch_size: Size for fixed_size batching
    
    Returns:
        List of batches
    """
    batches = []
    
    if strategy == "by_software":
        # One batch per software with all its methods
        for software in software_list:
            batch = [(software, method) for method in method_list]
            batches.append(batch)
    
    elif strategy == "by_method":
        # One batch per method with all software
        for method in method_list:
            batch = [(software, method) for software in software_list]
            batches.append(batch)
    
    elif strategy == "mixed":
        # Alternate between by_software and by_method
        for i, software in enumerate(software_list[:len(software_list)//2 + 1]):
            batch = [(software, method) for method in method_list]
            batches.append(batch)
        for method in method_list:
            batch = [(software, method) for software in software_list[len(software_list)//2 + 1:]]
            if batch:
                batches.append(batch)
    
    elif strategy == "fixed_size":
        # Create fixed-size batches across all pairs
        all_items = [(sw, method) for sw in software_list for method in method_list]
        # ↓↓↓ THIS IS THE FIX ↓↓↓
        for i in range(0, len(all_items), batch_size):
            batch = all_items[i:i + batch_size]
            batches.append(batch)
        # ↑↑↑ THIS IS THE FIX ↑↑↑
    
    return batches

# Add to class
SoftwareMethodAssessor.create_batches = create_batches

print("✓ Batch creation method corrected")


✓ Batch creation method corrected


In [32]:
# %%
# =============================================================================
# MAIN BATCHED ASSESSMENT METHOD
# =============================================================================

def assess_multiple_batched(self, software_list: List[str], method_list: List[str],
                           batch_strategy: str = "by_software",
                           batch_size: int = 100,
                           overlap_percentage: float = 0.0,
                           use_openai: bool = True,
                           use_claude: bool = True,
                           use_google: bool = True,
                           openai_model: str = None,
                           claude_model: str = "claude-3-5-haiku-20241022",
                           google_model: str = "models/gemini-2.0-flash",
                           debug: bool = False) -> List[ConsensusResult]:
    """
    Assess multiple software-method combinations using batch processing
    """
    total_items = len(software_list) * len(method_list)
    
    print(f"\n{'='*70}")
    print(f"BATCH ASSESSMENT MODE")
    print(f"{'='*70}")
    print(f"Total items: {total_items}")
    print(f"Strategy: {batch_strategy}")
    print(f"LLMs: OpenAI={use_openai}, Claude={use_claude}, Google={use_google}")
    
    # Create batches
    batches = self.create_batches(software_list, method_list, batch_strategy)
    
    print(f"\nCreated {len(batches)} batches")
    for i, batch in enumerate(batches, 1):
        print(f"  Batch {i}: {len(batch)} items")
    
    # Store all individual assessments
    all_assessments = {}  # (software, method) -> list of AssessmentResult
    
    print(f"\n{'-'*70}")
    print(f"Processing batches...")
    print(f"{'-'*70}")
    
    # Process each batch with each LLM
    for batch_idx, batch in enumerate(batches, 1):
        print(f"\n[Batch {batch_idx}/{len(batches)}] {len(batch)} items")
        
        batch_results = []
        
        if use_openai and self.openai_client:
            print(f"  Assessing with OpenAI...")
            results = self.assess_batch_with_openai(batch, openai_model, debug)
            batch_results.extend(results)
            time.sleep(1)
        
        if use_claude and self.anthropic_client:
            print(f"  Assessing with Claude...")
            results = self.assess_batch_with_claude(batch, claude_model, debug)
            batch_results.extend(results)
            time.sleep(1)
        
        if use_google and self.google_enabled:
            print(f"  Assessing with Google...")
            results = self.assess_batch_with_google(batch, google_model, debug)
            batch_results.extend(results)
            time.sleep(1)
        
        # Store results
        for result in batch_results:
            key = (result.software, result.method)
            if key not in all_assessments:
                all_assessments[key] = []
            all_assessments[key].append(result)
        
        # Show progress
        stats = self.credit_tracker.get_stats()
        print(f"  Running cost: ${stats['total_cost']:.4f} ({stats['total_tokens']:,} tokens)")
    
    # Create consensus results
    print(f"\n{'-'*70}")
    print(f"Creating consensus results...")
    print(f"{'-'*70}")
    
    consensus_results = []
    
    for (software, method), assessments in all_assessments.items():
        if len(assessments) == 0:
            continue
        
        # Group by LLM provider (handle duplicates)
        by_provider = {}
        for assessment in assessments:
            if assessment.llm_provider not in by_provider:
                by_provider[assessment.llm_provider] = assessment
        
        assessments = list(by_provider.values())
        
        ranks = [a.rank for a in assessments]
        confidence, agreement_level = self.calculate_confidence(ranks)
        
        rank_counts = Counter(ranks)
        final_rank = rank_counts.most_common(1)[0][0]
        
        individual_ranks = {a.llm_provider: a.rank for a in assessments}
        individual_reasoning = {a.llm_provider: a.reasoning for a in assessments}
        individual_sources = {a.llm_provider: a.sources for a in assessments}
        
        total_tokens = sum(a.input_tokens + a.output_tokens for a in assessments)
        
        # Calculate cost
        total_cost = sum([
            self.credit_tracker.PRICING.get(
                a.llm_provider.replace('openai_', '').replace('claude_', '').replace('google_', ''),
                {'input': 0, 'output': 0}
            )['input'] * a.input_tokens / 1_000_000 +
            self.credit_tracker.PRICING.get(
                a.llm_provider.replace('openai_', '').replace('claude_', '').replace('google_', ''),
                {'input': 0, 'output': 0}
            )['output'] * a.output_tokens / 1_000_000
            for a in assessments
        ])
        
        consensus_results.append(ConsensusResult(
            software=software,
            method=method,
            final_rank=final_rank,
            confidence=confidence,
            individual_ranks=individual_ranks,
            individual_reasoning=individual_reasoning,
            individual_sources=individual_sources,
            agreement_level=agreement_level,
            total_tokens=total_tokens,
            total_cost=total_cost
        ))
    
    print(f"\nCompleted {len(consensus_results)} assessments")
    print(f"{'='*70}\n")
    
    return consensus_results

# Add to class
SoftwareMethodAssessor.assess_multiple_batched = assess_multiple_batched

print("✓ Main batched assessment method added")


✓ Main batched assessment method added


In [33]:
# %%
# =============================================================================
# RESULT MERGER
# =============================================================================

def merge_assessment_results(self, *result_files: str, output_file: str = "merged_results.json",
                            merge_strategy: str = "union") -> List[ConsensusResult]:
    """
    Merge multiple assessment result JSON files
    """
    print(f"\n{'='*70}")
    print(f"MERGING ASSESSMENT RESULTS")
    print(f"{'='*70}")
    print(f"Strategy: {merge_strategy}")
    print(f"Input files: {len(result_files)}")
    
    merged_data = {}
    
    for file_idx, file_path in enumerate(result_files, 1):
        print(f"\nProcessing file {file_idx}/{len(result_files)}: {file_path}")
        
        try:
            with open(file_path, 'r') as f:
                results = json.load(f)
            
            print(f"  Loaded {len(results)} assessments")
            
            for result in results:
                software = result['software']
                method = result['method']
                key = (software, method)
                
                if key not in merged_data:
                    merged_data[key] = result
                else:
                    # Merge: combine all LLM assessments
                    merged_data[key]['individual_ranks'].update(result['individual_ranks'])
                    merged_data[key]['individual_reasoning'].update(result['individual_reasoning'])
                    merged_data[key]['individual_sources'].update(result['individual_sources'])
                    merged_data[key]['total_tokens'] += result['total_tokens']
                    merged_data[key]['total_cost'] += result['total_cost']
                
        except Exception as e:
            print(f"  ERROR loading {file_path}: {e}")
            continue
    
    # Recalculate consensus
    print(f"\nRecalculating consensus for merged results...")
    merged_results_list = list(merged_data.values())
    
    for result in merged_results_list:
        ranks = list(result['individual_ranks'].values())
        rank_counts = Counter(ranks)
        result['final_rank'] = rank_counts.most_common(1)[0][0]
        
        # Calculate confidence
        most_common_count = rank_counts.most_common(1)[0][1]
        result['confidence'] = most_common_count / len(ranks)
        
        if result['confidence'] == 1.0:
            result['agreement_level'] = "perfect_agreement"
        elif result['confidence'] >= 0.75:
            result['agreement_level'] = "strong_agreement"
        elif result['confidence'] >= 0.5:
            result['agreement_level'] = "moderate_agreement"
        else:
            result['agreement_level'] = "weak_agreement"
    
    # Save merged results
    with open(output_file, 'w') as f:
        json.dump(merged_results_list, f, indent=2)
    
    print(f"\n✓ Merged results saved to: {output_file}")
    print(f"{'='*70}\n")
    
    # Convert to ConsensusResult objects
    consensus_results = []
    for result in merged_results_list:
        consensus_results.append(ConsensusResult(
            software=result['software'],
            method=result['method'],
            final_rank=result['final_rank'],
            confidence=result['confidence'],
            individual_ranks=result['individual_ranks'],
            individual_reasoning=result['individual_reasoning'],
            individual_sources=result['individual_sources'],
            agreement_level=result['agreement_level'],
            total_tokens=result['total_tokens'],
            total_cost=result['total_cost']
        ))
    
    return consensus_results

# Add to class
SoftwareMethodAssessor.merge_assessment_results = merge_assessment_results

print("✓ Result merger added")


✓ Result merger added


In [34]:
# %%
# =============================================================================
# LOAD YOUR SOFTWARE AND METHOD LISTS
# =============================================================================

# Replace with your actual data loading
# Example:
# software_list_all = pd.read_csv('software_list.csv')['Name'].tolist()
# method_list_all = pd.read_csv('method_list.csv')['Method'].tolist()

# For testing, here's a placeholder:
software_list_all = [
    'Power Factory Digisilent','DINIS','ERACS','Distribution Network Analysis - ETAP','IPSA',
'Power World','PSS/E','PSSE/SINCAL','SKM Power Tools','OpenDSS','Matlab & Simulink','DYMOLA','MathPower',
'RelyPES','GridLAB-D','PyPSA (Python for Power System Analysis)','TARA','PyPower/Pandapower','GridCal Sk','MatDyn',
'NEPLAN','PSAT','CYMEDIST','Synergi Electric','Dynawo','OpenModellica',
'Sienna(PowerModels.jl PowerSystems.jl & PowerSimulations.jl PowerFlows.jl)','POWSYBL','Hitachi Network Manager','Spectrum Power',
'CIMPLICITY Scada','eTerra','Netbas','Trimble NIS','GAMS'
]

method_list_all = [
'power flow analysis','security-constrained optimal power flow','security constrained unit commitment',
'Non Linear Optimal Power Flow','Multi-Period  Optimisation ','unit commitment','genetic algorithm','neural network',
'kalman filter','monte-carlo','random forest','deep-learning','particle swarm optimization','fuzzy logic','time series',
'artificial bee colony','stochastic simulation','fault analysis','reinforcement learning','linear programming','mixed integer linear programming',
'support vector machine','ensemble-learning','graph-neural network','numerical solvers','global optimization','economic dispatch ED',
'probabilistic-forecasting','General Optimization','data envelopment analysis','machine learning','deep neural network','voltage stability',
'probabilistic analysis','real-time data analysis','optimal power flow','demand response','optimal capacity configuration','sensitivity analysis',
'sequential monte carlo','fuzzy logic','load forecasting','load balancing','power forecasting','state estimation','hosting capacity',
'error estimation techniques','stochastic model','failure modeling','loss of load expectancy','system identification','economic dispatch',
'time series analysis','multi-objective optimization','expected energy not served','power system flexibility','decision tree',
'contingency analysis','load frequency control','power factor correction','voltage control strategy','multi-agent system',
'system average interruption duration index','dynamic line rating','static var compensator','dynamic programming','model predictive control',
'k-means clustering','linear regression','principal component analysis','fault detection classification',
'system average interruption frequency index','stochastic optimization','cost-benefit analysis','fuzzy inference system',
'differential evolution','multi-state model','fault tree analysis','reliability economics','short-term load forecasting',
'dynamic voltage restorer','dynamic reactive power compensation','shunt active power filter','fault detection diagnosis',
'phase-locked loop','power system restoration','load carrying capability elcc','wind power prediction','discrete wavelet transform',
'dynamic resource allocation','space vector pulse width modulation','logistic regression','game theory','binary particle swarm',
'power system stabilizer','firefly algorithm','sliding mode control','modified ieee rts','heuristic optimization','partial discharge pd',
'stochastic programming','simulated annealing','support vector regression','two-stage stochastic','adaptive neuro-fuzzy inference',
'predictive modeling','short-term memory lstm network','load shifting','cuckoo search','automatic generation control agc','quantum computing',
'power quality disturbance','doubly-fed induction','convolutional neural network cnns','empirical mode decomposition','evolution algorithm',
'deep reinforcement learning drl','minimal cut set','tabu search','generative adversarial network','gated recurrent unit',
'approximate computing','demand side management dsm','frequency variation','markov chain monte carlo','ant colony optimization',
'predictive controller','multi-objective particle swarm optimization','power generation modeling','quantile regression','dynamic pricing',
'wavelet transform dwt','modal analysis','power quality assessment','reactive power sharing','quadratic programming','stochastic unit commitment',
'interior point method','process regression','second-order cone','energy resilience analysis','metaheuristics','bayesian optimization',
'clustering analysis','power transfer distribution factor','harmony search','optimization gwo','fuzzy comprehensive evaluation',
'deep deterministic','gaussian process regression','svd','bat algorithm','cumulative distribution function','deep deterministic policy gradient',
'genetic programming','sequential quadratic programming','energy demand forecasting','supply chain optimization','levelized cost of energy lcoe',
'frequency nadir','multi-output','hybrid system modeling','proton exchange membrane','hybrid acdc microgrid','multiple-input-multiple-output mimo',
'alternating direction method','hybrid optimization model','load shedding analysis','non-dominated sorting genetic','deep q-network',
'line outage distribution factor','multi-criteria decision analysis','closed-form expression','energy transition modeling','point estimate method',
'signal noise ratio','agent-based modeling','environmental impact assessment','data-driven optimization','energy consumption modeling',
'state-space modeling','quadrature pase shift keying','multi-fidelity model','stochastic geometry','quadrature amplitude modulation',
'orthogonal frequency-division multiplexing','minimum mean square','adaptive modulation','error rate ber performance']



print(f"✓ Loaded {len(software_list_all)} software")
print(f"✓ Loaded {len(method_list_all)} methods")
print(f"✓ Total pairs to assess: {len(software_list_all) * len(method_list_all)}")


✓ Loaded 35 software
✓ Loaded 189 methods
✓ Total pairs to assess: 6615


In [41]:
# %%
# =============================================================================
# WORKING SOLUTION: Direct batch processing (GUARANTEED TO WORK)
# =============================================================================

from pathlib import Path
from datetime import datetime
import time
import pickle

output_dir = Path("software_analysis_final")
output_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

assessor = SoftwareMethodAssessor(use_config=True, timeout=180)

# Create batches manually with batch_size=20
all_pairs = [(sw, method) for sw in software_list_all for method in method_list_all]
batch_size = 20
batches = [all_pairs[i:i + batch_size] for i in range(0, len(all_pairs), batch_size)]

print(f"\n{'='*70}")
print(f"RUN 1: OpenAI + Google (batch_size={batch_size})")
print(f"{'='*70}")
print(f"Total pairs: {len(all_pairs)}")
print(f"Total batches: {len(batches)}")
print(f"Expected time: 6-8 hours")
print(f"{'='*70}\n")

all_assessments = {}
failed_batches = []

for batch_idx, batch in enumerate(batches, 1):
    print(f"[Batch {batch_idx}/{len(batches)}] {len(batch)} items", flush=True)
    
    # OpenAI
    try:
        print(f"  OpenAI...", end='', flush=True)
        openai_results = assessor.assess_batch_with_openai(batch, 'gpt-4o-mini', debug=False)
        for result in openai_results:
            key = (result.software, result.method)
            if key not in all_assessments:
                all_assessments[key] = []
            all_assessments[key].append(result)
        print(f" ✓ {len(openai_results)}", flush=True)
    except Exception as e:
        print(f" ✗ {str(e)[:50]}", flush=True)
        failed_batches.append(('openai', batch_idx))
    
    time.sleep(2)
    
    # Google
    try:
        print(f"  Google...", end='', flush=True)
        google_results = assessor.assess_batch_with_google(batch, 'models/gemini-2.0-flash', debug=False)
        for result in google_results:
            key = (result.software, result.method)
            if key not in all_assessments:
                all_assessments[key] = []
            all_assessments[key].append(result)
        print(f" ✓ {len(google_results)}", flush=True)
    except Exception as e:
        print(f" ✗ {str(e)[:50]}", flush=True)
        failed_batches.append(('google', batch_idx))
    
    time.sleep(2)
    
    # Progress
    if batch_idx % 10 == 0:
        stats = assessor.credit_tracker.get_stats()
        print(f"  Progress: ${stats['total_cost']:.2f} | {len(all_assessments)} unique pairs")
    
    # Checkpoint every 50 batches
    if batch_idx % 50 == 0:
        checkpoint_file = output_dir / f"checkpoint_{batch_idx}.pkl"
        with open(checkpoint_file, 'wb') as f:
            pickle.dump(all_assessments, f)
        print(f"  💾 Checkpoint saved: {checkpoint_file.name}")

# Create consensus
print(f"\n{'='*70}")
print(f"Creating consensus results...")
print(f"{'='*70}")

consensus_results = []
for (software, method), assessments in all_assessments.items():
    # Remove duplicates by provider
    by_provider = {}
    for a in assessments:
        if a.llm_provider not in by_provider:
            by_provider[a.llm_provider] = a
    assessments = list(by_provider.values())
    
    if len(assessments) == 0:
        continue
    
    ranks = [a.rank for a in assessments]
    confidence, agreement_level = assessor.calculate_confidence(ranks)
    rank_counts = Counter(ranks)
    final_rank = rank_counts.most_common(1)[0][0]
    
    consensus_results.append(ConsensusResult(
        software=software,
        method=method,
        final_rank=final_rank,
        confidence=confidence,
        individual_ranks={a.llm_provider: a.rank for a in assessments},
        individual_reasoning={a.llm_provider: a.reasoning for a in assessments},
        individual_sources={a.llm_provider: a.sources for a in assessments},
        agreement_level=agreement_level,
        total_tokens=sum(a.input_tokens + a.output_tokens for a in assessments),
        total_cost=0.0
    ))

# Save results
run1_file = output_dir / f"run1_openai_google_{timestamp}.json"
assessor.export_results(consensus_results, str(run1_file))

results_df = pd.DataFrame([{
    'software': r.software,
    'method': r.method,
    'final_rank': r.final_rank,
    'confidence': r.confidence,
    'agreement_level': r.agreement_level,
    'num_llms': len(r.individual_ranks)
} for r in consensus_results])

csv_file = output_dir / f"run1_openai_google_{timestamp}.csv"
results_df.to_csv(csv_file, index=False)

# Final summary
assessor.credit_tracker.print_summary()

print(f"\n{'='*70}")
print(f"RUN 1 COMPLETE")
print(f"{'='*70}")
print(f"✓ Completed: {len(consensus_results)} unique assessments")
print(f"✗ Failed batches: {len(failed_batches)}")
if failed_batches:
    print(f"  Failed: {failed_batches[:5]}{'...' if len(failed_batches) > 5 else ''}")
print(f"\nFiles:")
print(f"  JSON: {run1_file}")
print(f"  CSV: {csv_file}")
print(f"{'='*70}\n")



RUN 1: OpenAI + Google (batch_size=20)
Total pairs: 6615
Total batches: 331
Expected time: 6-8 hours

[Batch 1/331] 20 items
  OpenAI... ✓ 20
  Google...  ERROR parsing Google batch response: Unterminated string starting at: line 190 column 18 (char 17028)
 ✓ 0
[Batch 2/331] 20 items
  OpenAI... ✓ 20
  Google...  ERROR parsing Google batch response: Unterminated string starting at: line 192 column 18 (char 18012)
 ✓ 0
[Batch 3/331] 20 items
  OpenAI... ✓ 20
  Google... ✓ 20
[Batch 4/331] 20 items
  OpenAI... ✓ 20
  Google...  ERROR parsing Google batch response: Unterminated string starting at: line 180 column 5 (char 17366)
 ✓ 0
[Batch 5/331] 20 items
  OpenAI... ✓ 20
  Google... ✓ 20
[Batch 6/331] 20 items
  OpenAI... ✓ 20
  Google... ✓ 20
[Batch 7/331] 20 items
  OpenAI... ✓ 20
  Google...  ERROR parsing Google batch response: Unterminated string starting at: line 179 column 7 (char 18159)
 ✓ 0
[Batch 8/331] 20 items
  OpenAI... ✓ 20
  Google...  ERROR parsing Google batch response

In [42]:
# %%
# =============================================================================
# EXECUTION: RUN 2 - Claude + Google (for overlap validation)
# =============================================================================

print("\n" + "="*70)
print("RUN 2: Claude + Google Assessment")
print("="*70)
print("NOTE: Google appears in both runs for validation")

# Create fresh assessor to reset token tracking
assessor_run2 = SoftwareMethodAssessor(use_config=True, timeout=180)

# Run assessment
results_run2 = assessor_run2.assess_multiple_batched(
    software_list=software_list_all,
    method_list=method_list_all,
    batch_strategy="by_software",
    use_openai=False,
    use_google=True,  # Google overlap with Run 1
    use_claude=True,
    claude_model='claude-3-5-haiku-20241022',
    google_model='models/gemini-2.0-flash'
)

# Save results
run2_file = output_dir / f"run2_claude_google_{timestamp}.json"
assessor_run2.export_results(results_run2, str(run2_file))

# Save as CSV
results_df2 = pd.DataFrame([{
    'software': r.software,
    'method': r.method,
    'final_rank': r.final_rank,
    'confidence': r.confidence,
    'agreement_level': r.agreement_level,
    'num_llms': len(r.individual_ranks)
} for r in results_run2])

csv_file2 = output_dir / f"run2_claude_google_{timestamp}.csv"
results_df2.to_csv(csv_file2, index=False)

# Print summary
assessor_run2.credit_tracker.print_summary()
print(f"\n✓ Run 2 complete!")
print(f"  JSON: {run2_file}")
print(f"  CSV: {csv_file2}")
print(f"  Completed {len(results_run2)} assessments")
print("\n" + "="*70)



RUN 2: Claude + Google Assessment
NOTE: Google appears in both runs for validation

BATCH ASSESSMENT MODE
Total items: 6615
Strategy: by_software
LLMs: OpenAI=False, Claude=True, Google=True

Created 35 batches
  Batch 1: 189 items
  Batch 2: 189 items
  Batch 3: 189 items
  Batch 4: 189 items
  Batch 5: 189 items
  Batch 6: 189 items
  Batch 7: 189 items
  Batch 8: 189 items
  Batch 9: 189 items
  Batch 10: 189 items
  Batch 11: 189 items
  Batch 12: 189 items
  Batch 13: 189 items
  Batch 14: 189 items
  Batch 15: 189 items
  Batch 16: 189 items
  Batch 17: 189 items
  Batch 18: 189 items
  Batch 19: 189 items
  Batch 20: 189 items
  Batch 21: 189 items
  Batch 22: 189 items
  Batch 23: 189 items
  Batch 24: 189 items
  Batch 25: 189 items
  Batch 26: 189 items
  Batch 27: 189 items
  Batch 28: 189 items
  Batch 29: 189 items
  Batch 30: 189 items
  Batch 31: 189 items
  Batch 32: 189 items
  Batch 33: 189 items
  Batch 34: 189 items
  Batch 35: 189 items

--------------------------

In [43]:
# %%
# =============================================================================
# MERGE BOTH RUNS
# =============================================================================

print("\n" + "="*70)
print("MERGING RUN 1 AND RUN 2")
print("="*70)

# Create merger instance
merger = SoftwareMethodAssessor(use_config=True)

# Merge results
merged_results = merger.merge_assessment_results(
    str(run1_file),
    str(run2_file),
    output_file=str(output_dir / f"merged_final_{timestamp}.json"),
    merge_strategy="union"
)

print(f"\n✓ Merged {len(merged_results)} unique assessments")

# Create summary CSV
merged_df = pd.DataFrame([{
    'software': r.software,
    'method': r.method,
    'final_rank': r.final_rank,
    'confidence': r.confidence,
    'agreement_level': r.agreement_level,
    'num_llms': len(r.individual_ranks),
    'llms_used': ', '.join(r.individual_ranks.keys()),
    'total_cost': r.total_cost
} for r in merged_results])

merged_csv = output_dir / f"merged_final_{timestamp}.csv"
merged_df.to_csv(merged_csv, index=False)

print(f"✓ Merged CSV saved to: {merged_csv}")
print("\n" + "="*70)



MERGING RUN 1 AND RUN 2

MERGING ASSESSMENT RESULTS
Strategy: union
Input files: 2

Processing file 1/2: software_analysis_final\run1_openai_google_20251019_015733.json
  Loaded 6855 assessments

Processing file 2/2: software_analysis_final\run2_claude_google_20251019_015733.json
  Loaded 0 assessments

Recalculating consensus for merged results...

✓ Merged results saved to: software_analysis_final\merged_final_20251019_015733.json


✓ Merged 6855 unique assessments
✓ Merged CSV saved to: software_analysis_final\merged_final_20251019_015733.csv



In [44]:
# %%
# =============================================================================
# GOOGLE OVERLAP VALIDATION ANALYSIS
# =============================================================================

print("\n" + "="*70)
print("GOOGLE OVERLAP VALIDATION")
print("="*70)
print("Checking consistency of Google assessments across both runs...")

google_consistency = []

for result in merged_results:
    # Find Google assessments from both runs
    google_ranks = []
    google_providers = []
    
    for llm, rank in result.individual_ranks.items():
        if 'google' in llm.lower():
            google_ranks.append(rank)
            google_providers.append(llm)
    
    if len(google_ranks) == 2:  # Google assessed twice
        consistency_record = {
            'software': result.software,
            'method': result.method,
            'google_rank_run1': google_ranks[0],
            'google_rank_run2': google_ranks[1],
            'difference': abs(google_ranks[0] - google_ranks[1]),
            'consistent': google_ranks[0] == google_ranks[1],
            'final_rank': result.final_rank,
            'confidence': result.confidence
        }
        google_consistency.append(consistency_record)

# Create consistency report
consistency_df = pd.DataFrame(google_consistency)
consistency_file = output_dir / f"google_consistency_{timestamp}.csv"
consistency_df.to_csv(consistency_file, index=False)

# Calculate statistics
if len(google_consistency) > 0:
    perfect_consistency = sum(1 for c in google_consistency if c['consistent'])
    within_one = sum(1 for c in google_consistency if c['difference'] <= 1)
    
    print(f"\nGoogle Consistency Statistics:")
    print(f"  Total pairs assessed by Google in both runs: {len(google_consistency)}")
    print(f"  Perfect consistency (exact same rank): {perfect_consistency} ({perfect_consistency/len(google_consistency)*100:.1f}%)")
    print(f"  Within ±1 rank: {within_one} ({within_one/len(google_consistency)*100:.1f}%)")
    print(f"  Average rank difference: {consistency_df['difference'].mean():.2f}")
    print(f"  Max rank difference: {consistency_df['difference'].max()}")
    
    # Show distribution of differences
    print(f"\nDifference distribution:")
    diff_counts = consistency_df['difference'].value_counts().sort_index()
    for diff, count in diff_counts.items():
        print(f"    Difference {int(diff)}: {count} pairs ({count/len(google_consistency)*100:.1f}%)")
    
    print(f"\n✓ Consistency report saved to: {consistency_file}")
else:
    print("\n⚠ No Google overlap found in merged results")

print("\n" + "="*70)



GOOGLE OVERLAP VALIDATION
Checking consistency of Google assessments across both runs...

⚠ No Google overlap found in merged results



In [45]:
# %%
# =============================================================================
# FINAL RESULTS ANALYSIS
# =============================================================================

print("\n" + "="*70)
print("FINAL RESULTS SUMMARY")
print("="*70)

# LLM coverage analysis
print(f"\nTotal unique pairs assessed: {len(merged_results)}")

llm_coverage = {}
for result in merged_results:
    num_llms = len(result.individual_ranks)
    llm_coverage[num_llms] = llm_coverage.get(num_llms, 0) + 1

print(f"\nLLM Coverage Distribution:")
for num_llms in sorted(llm_coverage.keys(), reverse=True):
    print(f"  {num_llms} LLMs: {llm_coverage[num_llms]:,} pairs ({llm_coverage[num_llms]/len(merged_results)*100:.1f}%)")

# Confidence analysis
avg_confidence = sum(r.confidence for r in merged_results) / len(merged_results)
print(f"\nAverage Confidence: {avg_confidence:.2%}")

confidence_levels = Counter([r.agreement_level for r in merged_results])
print(f"\nAgreement Level Distribution:")
for level, count in confidence_levels.most_common():
    print(f"  {level}: {count:,} pairs ({count/len(merged_results)*100:.1f}%)")

# Rank distribution
rank_dist = Counter([r.final_rank for r in merged_results])
print(f"\nRank Distribution:")
for rank in sorted(rank_dist.keys()):
    print(f"  Rank {rank}: {rank_dist[rank]:,} pairs ({rank_dist[rank]/len(merged_results)*100:.1f}%)")

# Cost analysis
total_cost_run1 = sum(r.total_cost for r in results_run1)
total_cost_run2 = sum(r.total_cost for r in results_run2)
total_cost_combined = total_cost_run1 + total_cost_run2

print(f"\nCost Breakdown:")
print(f"  Run 1 (OpenAI + Google): ${total_cost_run1:.2f}")
print(f"  Run 2 (Claude + Google): ${total_cost_run2:.2f}")
print(f"  Total: ${total_cost_combined:.2f}")
print(f"  Average per pair: ${total_cost_combined/len(merged_results):.4f}")

print("\n" + "="*70)



FINAL RESULTS SUMMARY

Total unique pairs assessed: 6855

LLM Coverage Distribution:
  2 LLMs: 3,622 pairs (52.8%)
  1 LLMs: 3,233 pairs (47.2%)

Average Confidence: 84.99%

Agreement Level Distribution:
  perfect_agreement: 4,797 pairs (70.0%)
  moderate_agreement: 2,058 pairs (30.0%)

Rank Distribution:
  Rank 0: 1,662 pairs (24.2%)
  Rank 1: 2,202 pairs (32.1%)
  Rank 2: 1,928 pairs (28.1%)
  Rank 3: 1,063 pairs (15.5%)

Cost Breakdown:
  Run 1 (OpenAI + Google): $0.00
  Run 2 (Claude + Google): $0.00
  Total: $0.00
  Average per pair: $0.0000



In [46]:
# %%
# =============================================================================
# CREATE COMPREHENSIVE SUMMARY REPORT
# =============================================================================

print("\n" + "="*70)
print("CREATING SUMMARY REPORT")
print("="*70)

summary_report = {
    'timestamp': timestamp,
    'assessment_info': {
        'total_software': len(software_list_all),
        'total_methods': len(method_list_all),
        'total_pairs_assessed': len(merged_results),
        'expected_pairs': len(software_list_all) * len(method_list_all)
    },
    'runs': {
        'run1': {
            'llms': 'OpenAI + Google',
            'pairs_assessed': len(results_run1),
            'cost': round(total_cost_run1, 4)
        },
        'run2': {
            'llms': 'Claude + Google',
            'pairs_assessed': len(results_run2),
            'cost': round(total_cost_run2, 4)
        }
    },
    'validation': {
        'google_overlap_pairs': len(google_consistency),
        'google_perfect_consistency_rate': perfect_consistency/len(google_consistency) if len(google_consistency) > 0 else 0,
        'google_within_one_rate': within_one/len(google_consistency) if len(google_consistency) > 0 else 0
    },
    'quality_metrics': {
        'average_confidence': round(avg_confidence, 4),
        'llm_coverage': {str(k): v for k, v in llm_coverage.items()},
        'agreement_levels': {k: v for k, v in confidence_levels.items()},
        'rank_distribution': {str(k): v for k, v in rank_dist.items()}
    },
    'costs': {
        'total_cost': round(total_cost_combined, 4),
        'cost_per_pair': round(total_cost_combined/len(merged_results), 6),
        'run1_cost': round(total_cost_run1, 4),
        'run2_cost': round(total_cost_run2, 4)
    },
    'files': {
        'run1_json': str(run1_file.name),
        'run1_csv': str(csv_file.name),
        'run2_json': str(run2_file.name),
        'run2_csv': str(csv_file2.name),
        'merged_json': f"merged_final_{timestamp}.json",
        'merged_csv': f"merged_final_{timestamp}.csv",
        'consistency_report': str(consistency_file.name)
    }
}

# Save summary report
summary_file = output_dir / f"_SUMMARY_{timestamp}.json"
with open(summary_file, 'w') as f:
    json.dump(summary_report, f, indent=2)

print(f"✓ Summary report saved to: {summary_file}")

# Print final file list
print(f"\n{'='*70}")
print("ALL OUTPUT FILES")
print(f"{'='*70}")
print(f"\nDirectory: {output_dir.absolute()}\n")
print("Assessment Results:")
print(f"  1. {run1_file.name} - Run 1 results (JSON)")
print(f"  2. {csv_file.name} - Run 1 results (CSV)")
print(f"  3. {run2_file.name} - Run 2 results (JSON)")
print(f"  4. {csv_file2.name} - Run 2 results (CSV)")
print(f"  5. merged_final_{timestamp}.json - Combined results (JSON)")
print(f"  6. {merged_csv.name} - Combined results (CSV)")
print(f"\nQuality Reports:")
print(f"  7. {consistency_file.name} - Google validation")
print(f"  8. {summary_file.name} - Master summary")

print(f"\n{'='*70}")
print("✓ ASSESSMENT COMPLETE!")
print(f"{'='*70}\n")



CREATING SUMMARY REPORT
✓ Summary report saved to: software_analysis_final\_SUMMARY_20251019_015733.json

ALL OUTPUT FILES

Directory: c:\git_repos\Literature-search-and-analysis\software_analysis_final

Assessment Results:
  1. run1_openai_google_20251019_015733.json - Run 1 results (JSON)
  2. run1_openai_google_20251019_015733.csv - Run 1 results (CSV)
  3. run2_claude_google_20251019_015733.json - Run 2 results (JSON)
  4. run2_claude_google_20251019_015733.csv - Run 2 results (CSV)
  5. merged_final_20251019_015733.json - Combined results (JSON)
  6. merged_final_20251019_015733.csv - Combined results (CSV)

Quality Reports:
  7. google_consistency_20251019_015733.csv - Google validation
  8. _SUMMARY_20251019_015733.json - Master summary

✓ ASSESSMENT COMPLETE!



In [47]:
# %%
# =============================================================================
# CSV MERGER - INTEGRATE WITH YOUR EXISTING DATA
# =============================================================================

class ResultCSVMerger:
    """Merge LLM results into existing CSV"""
    
    def __init__(self, csv_file: str, delimiter: str = ';', 
                 software_name_column: str = 'Name',
                 method_start_column: str = 'Numerical-solvers'):
        """Initialize merger"""
        self.csv_file = Path(csv_file)
        self.delimiter = delimiter
        self.software_name_column = software_name_column
        self.method_start_column = method_start_column
        
        # Load CSV
        self.df = pd.read_csv(csv_file, delimiter=delimiter, encoding='utf-8-sig')
        
        # Identify columns
        self.info_columns = []
        self.method_columns = []
        found_methods = False
        
        for col in self.df.columns:
            if col == method_start_column:
                found_methods = True
            if found_methods:
                self.method_columns.append(col)
            else:
                self.info_columns.append(col)
        
        print(f"✓ CSV Merger initialized:")
        print(f"  File: {csv_file}")
        print(f"  Software rows: {len(self.df)}")
        print(f"  Info columns: {len(self.info_columns)}")
        print(f"  Method columns: {len(self.method_columns)}")
    
    def _normalize_name(self, name: str) -> str:
        """Normalize name for matching"""
        if pd.isna(name):
            return ""
        name = str(name).lower()
        name = ''.join(c if c.isalnum() or c in ' -' else ' ' for c in name)
        return ' '.join(name.split())
    
    def merge_llm_results(self, llm_results_file: str, output_file: str,
                         min_confidence: float = 0.5,
                         overwrite_existing: bool = True) -> pd.DataFrame:
        """Merge LLM results into CSV"""
        
        print(f"\n{'='*70}")
        print("MERGING LLM RESULTS INTO EXISTING CSV")
        print(f"{'='*70}")
        
        # Load LLM results
        with open(llm_results_file, 'r') as f:
            llm_results = json.load(f)
        
        print(f"Loaded {len(llm_results)} LLM assessments")
        
        # Create working copy
        df_merged = self.df.copy()
        
        # Track updates
        updates = 0
        skipped_low_conf = 0
        skipped_not_found = 0
        
        for result in llm_results:
            if result['confidence'] < min_confidence:
                skipped_low_conf += 1
                continue
            
            # Find matching software row
            software_norm = self._normalize_name(result['software'])
            df_norm = self.df[self.software_name_column].apply(self._normalize_name)
            
            matches = df_norm[df_norm == software_norm]
            if len(matches) == 0:
                skipped_not_found += 1
                continue
            
            row_idx = matches.index[0]
            
            # Find matching method column
            method_norm = self._normalize_name(result['method'])
            method_cols_norm = {self._normalize_name(col): col for col in self.method_columns}
            
            if method_norm not in method_cols_norm:
                skipped_not_found += 1
                continue
            
            method_col = method_cols_norm[method_norm]
            
            # Update value
            if overwrite_existing or pd.isna(df_merged.at[row_idx, method_col]):
                df_merged.at[row_idx, method_col] = result['final_rank']
                updates += 1
        
        # Save
        df_merged.to_csv(output_file, sep=self.delimiter, index=False, encoding='utf-8-sig')
        
        print(f"\nMerge Statistics:")
        print(f"  Updated: {updates}")
        print(f"  Skipped (low confidence): {skipped_low_conf}")
        print(f"  Skipped (not found): {skipped_not_found}")
        print(f"\n✓ Updated CSV saved to: {output_file}")
        print(f"{'='*70}\n")
        
        return df_merged

print("✓ ResultCSVMerger class defined")


✓ ResultCSVMerger class defined


In [49]:
# %%
# =============================================================================
# FINAL STEP: MERGE INTO YOUR EXISTING CSV
# =============================================================================

# Run this AFTER both assessment runs are complete

# Initialize CSV merger with YOUR file
csv_merger = ResultCSVMerger(
    csv_file="input_data\Software_method_implementation_score.csv",  # ← Replace with your file path
    delimiter=';',
    software_name_column='Name',
    method_start_column='Numerical-solvers'
)

# Merge results into your CSV
updated_csv = csv_merger.merge_llm_results(
    llm_results_file=str(output_dir / f"merged_final_{timestamp}.json"),
    output_file=str(output_dir / f"software_methods_updated_{timestamp}.csv"),
    min_confidence=0.4,  # Only use results with ≥60% confidence
    overwrite_existing=True  # Overwrite existing scores
)

print("\n✓ Your CSV has been updated with LLM assessment results!")


✓ CSV Merger initialized:
  File: input_data\Software_method_implementation_score.csv
  Software rows: 39
  Info columns: 21
  Method columns: 190

MERGING LLM RESULTS INTO EXISTING CSV
Loaded 6855 LLM assessments

Merge Statistics:
  Updated: 6121
  Skipped (low confidence): 0
  Skipped (not found): 734

✓ Updated CSV saved to: software_analysis_final\software_methods_updated_20251019_015733.csv


✓ Your CSV has been updated with LLM assessment results!


In [57]:
# %%
# =============================================================================
# FIXED METHOD MIS CALCULATOR - HANDLES EUROPEAN DECIMALS
# =============================================================================

import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

class MethodMISCalculator:
    """
    Calculate Method Implementation Score (MIS) for each METHOD
    and update the MIS row in CSV (handles European decimal format)
    """
    
    def __init__(self, csv_file: str, delimiter: str = ';',
                 software_name_column: str = 'Name',
                 osmm_column: str = 'OSMM Score',
                 method_start_column: str = 'Numerical-solvers',
                 mis_row_name: str = 'Gjennomsnittlig score (MIS)'):
        """Initialize MIS calculator"""
        self.csv_file = Path(csv_file)
        self.delimiter = delimiter
        self.software_name_column = software_name_column
        self.osmm_column = osmm_column
        self.mis_row_name = mis_row_name
        
        # Load CSV - don't try to parse decimals automatically
        self.df = pd.read_csv(csv_file, delimiter=delimiter, encoding='utf-8-sig', 
                              dtype=str)  # ← Read everything as string first
        
        # Identify method columns
        self.info_columns = []
        self.method_columns = []
        found_methods = False
        
        for col in self.df.columns:
            if col == method_start_column:
                found_methods = True
            if found_methods:
                self.method_columns.append(col)
            else:
                self.info_columns.append(col)
        
        print(f"✓ Method MIS Calculator initialized:")
        print(f"  File: {csv_file}")
        print(f"  Software: {len(self.df)}")
        print(f"  Methods: {len(self.method_columns)}")
    
    
    def _convert_to_float(self, value) -> float:
        """
        Convert value to float, handling European format (comma as decimal)
        
        Args:
            value: Value to convert (can be string with comma or period)
        
        Returns:
            Float value or 0.0 if conversion fails
        """
        if pd.isna(value) or value == '' or value == ' ':
            return 0.0
        
        try:
            # Convert to string and clean
            value_str = str(value).strip()
            
            # Replace comma with period for European decimals
            value_str = value_str.replace(',', '.')
            
            # Remove any spaces
            value_str = value_str.replace(' ', '')
            
            # Convert to float
            return float(value_str)
        except (ValueError, TypeError):
            return 0.0
    
    
    def calculate_method_mis_scores(self, output_file: str = None,
                                    include_details: bool = True,
                                    exclude_mis_row: bool = True) -> pd.DataFrame:
        """Calculate MIS scores for each METHOD"""
        print(f"\n{'='*70}")
        print(f"CALCULATING MIS SCORES PER METHOD")
        print(f"{'='*70}")
        
        method_results = []
        detailed_results = []
        
        for method_col in self.method_columns:
            method_name = method_col
            method_scores = []
            software_scores = []
            
            for idx, row in self.df.iterrows():
                software_name = row[self.software_name_column]
                
                # Skip MIS row if requested
                if exclude_mis_row and software_name == self.mis_row_name:
                    continue
                
                # Convert OSMM score using helper function
                osmm_score = self._convert_to_float(row[self.osmm_column])
                
                # Convert method rank using helper function
                method_rank = self._convert_to_float(row[method_col])
                
                # Calculate score: OSMM × method_rank
                score = osmm_score * method_rank
                method_scores.append(score)
                
                # Store detailed result
                if include_details:
                    detailed_results.append({
                        'method': method_name,
                        'software': software_name,
                        'osmm_score': osmm_score,
                        'method_rank': method_rank,
                        'contribution_to_mis': score
                    })
                
                software_scores.append({
                    'software': software_name,
                    'osmm': osmm_score,
                    'rank': method_rank,
                    'score': score
                })
            
            # Calculate MIS for this method
            mis_score = np.mean(method_scores) if method_scores else 0.0
            
            # Statistics
            software_with_implementation = sum(1 for s in software_scores if s['rank'] > 0)
            total_software = len(software_scores)
            coverage = (software_with_implementation / total_software * 100) if total_software > 0 else 0.0
            
            # Rank distribution
            rank_counts = Counter([s['rank'] for s in software_scores])
            
            method_results.append({
                'method': method_name,
                'mis_score': round(mis_score, 4),
                'software_with_implementation': software_with_implementation,
                'total_software': total_software,
                'coverage_percentage': round(coverage, 2),
                'avg_rank': round(np.mean([s['rank'] for s in software_scores]), 2),
                'rank_0_count': rank_counts.get(0.0, 0),
                'rank_1_count': rank_counts.get(1.0, 0),
                'rank_2_count': rank_counts.get(2.0, 0),
                'rank_3_count': rank_counts.get(3.0, 0),
                'max_score': round(max(method_scores), 4) if method_scores else 0.0,
                'min_score': round(min(method_scores), 4) if method_scores else 0.0
            })
        
        # Create results DataFrame
        results_df = pd.DataFrame(method_results)
        results_df = results_df.sort_values('mis_score', ascending=False)
        
        print(f"\n{'='*70}")
        print(f"METHOD MIS CALCULATION COMPLETE")
        print(f"{'='*70}")
        print(f"\nTop 20 Methods by MIS Score:")
        print(results_df[['method', 'mis_score', 'coverage_percentage', 'avg_rank']].head(20).to_string(index=False))
        
        # Save if requested
        if output_file:
            results_df.to_csv(output_file, index=False)
            print(f"\n✓ Method MIS scores saved to: {output_file}")
            
            if include_details:
                detailed_df = pd.DataFrame(detailed_results)
                detail_file = Path(output_file).parent / f"{Path(output_file).stem}_detailed.csv"
                detailed_df.to_csv(detail_file, index=False)
                print(f"✓ Detailed method scores saved to: {detail_file}")
        
        print(f"{'='*70}\n")
        
        return results_df
    
    
    def update_mis_row_in_csv(self, output_file: str = None, 
                              use_comma_as_decimal: bool = True) -> pd.DataFrame:
        """
        Update the MIS row in the CSV with calculated MIS scores
        
        Args:
            output_file: Output file path (if None, overwrites original)
            use_comma_as_decimal: If True, format MIS values with comma as decimal (European)
        
        Returns:
            Updated DataFrame
        """
        print(f"\n{'='*70}")
        print(f"UPDATING MIS ROW IN CSV")
        print(f"{'='*70}")
        
        # Calculate MIS scores per method
        print(f"\nCalculating MIS scores for each method...")
        
        method_mis_scores = {}
        
        for method_col in self.method_columns:
            method_scores = []
            
            for idx, row in self.df.iterrows():
                software_name = row[self.software_name_column]
                
                # Skip the MIS row itself
                if software_name == self.mis_row_name:
                    continue
                
                # Convert using helper function
                osmm_score = self._convert_to_float(row[self.osmm_column])
                method_rank = self._convert_to_float(row[method_col])
                
                # Calculate score
                score = osmm_score * method_rank
                method_scores.append(score)
            
            # Calculate average MIS for this method
            mis_score = np.mean(method_scores) if method_scores else 0.0
            method_mis_scores[method_col] = mis_score
        
        print(f"  Calculated MIS for {len(method_mis_scores)} methods")
        
        # Find the MIS row
        mis_row_idx = self.df[self.df[self.software_name_column] == self.mis_row_name].index
        
        if len(mis_row_idx) == 0:
            print(f"\n⚠ MIS row '{self.mis_row_name}' not found in CSV!")
            print(f"  Creating new row...")
            
            # Create new row
            new_row = {col: '' for col in self.df.columns}
            new_row[self.software_name_column] = self.mis_row_name
            
            # Add to dataframe
            self.df = pd.concat([self.df, pd.DataFrame([new_row])], ignore_index=True)
            mis_row_idx = self.df[self.df[self.software_name_column] == self.mis_row_name].index
        
        mis_row_idx = mis_row_idx[0]
        
        print(f"\n✓ Found MIS row at index {mis_row_idx}")
        print(f"  Updating {len(method_mis_scores)} method columns...")
        
        # Update each method column with its MIS score
        updates_made = 0
        for method_col, mis_score in method_mis_scores.items():
            if method_col in self.df.columns:
                old_value = self.df.at[mis_row_idx, method_col]
                
                # Format the score
                if use_comma_as_decimal:
                    # European format: comma as decimal, 4 decimals
                    formatted_score = f"{mis_score:.4f}".replace('.', ',')
                else:
                    # US format: period as decimal, 4 decimals
                    formatted_score = f"{mis_score:.4f}"
                
                self.df.at[mis_row_idx, method_col] = formatted_score
                updates_made += 1
                
                # Show some examples
                if updates_made <= 5:
                    old_display = str(old_value) if old_value else 'empty'
                    print(f"    {method_col}: {old_display} → {formatted_score}")
        
        if updates_made > 5:
            print(f"    ... and {updates_made - 5} more")
        
        # Clear OSMM Score column for MIS row
        if self.osmm_column in self.df.columns:
            self.df.at[mis_row_idx, self.osmm_column] = ''
        
        # Determine output file
        if output_file is None:
            output_file = self.csv_file
        
        # Save updated CSV
        self.df.to_csv(output_file, sep=self.delimiter, index=False, encoding='utf-8-sig')
        
        print(f"\n✓ Updated CSV saved to: {output_file}")
        print(f"  Total updates: {updates_made} method columns")
        print(f"  Decimal format: {'Comma (European)' if use_comma_as_decimal else 'Period (US)'}")
        print(f"{'='*70}\n")
        
        return self.df
    
    
    def show_mis_row_preview(self):
        """Show preview of the MIS row values"""
        print(f"\n{'='*70}")
        print(f"MIS ROW PREVIEW")
        print(f"{'='*70}")
        
        # Calculate MIS scores
        method_mis_scores = {}
        
        for method_col in self.method_columns:
            method_scores = []
            
            for idx, row in self.df.iterrows():
                software_name = row[self.software_name_column]
                
                # Skip the MIS row
                if software_name == self.mis_row_name:
                    continue
                
                osmm_score = self._convert_to_float(row[self.osmm_column])
                method_rank = self._convert_to_float(row[method_col])
                
                score = osmm_score * method_rank
                method_scores.append(score)
            
            mis_score = np.mean(method_scores) if method_scores else 0.0
            method_mis_scores[method_col] = mis_score
        
        # Sort by MIS score
        sorted_methods = sorted(method_mis_scores.items(), key=lambda x: x[1], reverse=True)
        
        print(f"\nTop 20 Methods by MIS Score:")
        print(f"{'Method':<50} {'MIS Score':>10}")
        print(f"{'-'*61}")
        for method, score in sorted_methods[:20]:
            method_display = method[:47] + "..." if len(method) > 50 else method
            print(f"{method_display:<50} {score:>10.4f}")
        
        print(f"\nBottom 10 Methods by MIS Score:")
        print(f"{'Method':<50} {'MIS Score':>10}")
        print(f"{'-'*61}")
        for method, score in sorted_methods[-10:]:
            method_display = method[:47] + "..." if len(method) > 50 else method
            print(f"{method_display:<50} {score:>10.4f}")
        
        print(f"\nStatistics:")
        scores = list(method_mis_scores.values())
        print(f"  Total methods: {len(scores)}")
        print(f"  Mean MIS: {np.mean(scores):.4f}")
        print(f"  Median MIS: {np.median(scores):.4f}")
        print(f"  Std Dev: {np.std(scores):.4f}")
        print(f"  Min: {min(scores):.4f}")
        print(f"  Max: {max(scores):.4f}")
        
        print(f"{'='*70}\n")
    
    
    def create_method_comparison_report(self, output_file: str = "method_mis_comparison.csv"):
        """Create comprehensive comparison report"""
        print(f"\n{'='*70}")
        print(f"CREATING METHOD COMPARISON REPORT")
        print(f"{'='*70}")
        
        comparison_data = []
        
        for method_col in self.method_columns:
            implementations = []
            
            for idx, row in self.df.iterrows():
                software_name = row[self.software_name_column]
                
                # Skip MIS row
                if software_name == self.mis_row_name:
                    continue
                
                osmm_score = self._convert_to_float(row[self.osmm_column])
                method_rank = self._convert_to_float(row[method_col])
                
                implementations.append({
                    'osmm': osmm_score,
                    'rank': method_rank,
                    'score': osmm_score * method_rank
                })
            
            # Calculate statistics
            mis_score = np.mean([imp['score'] for imp in implementations])
            implemented = sum(1 for imp in implementations if imp['rank'] > 0)
            direct_impl = sum(1 for imp in implementations if imp['rank'] == 3.0)
            indirect_impl = sum(1 for imp in implementations if imp['rank'] == 2.0)
            limited_impl = sum(1 for imp in implementations if imp['rank'] == 1.0)
            
            implementing_osmm = [imp['osmm'] for imp in implementations if imp['rank'] > 0]
            avg_implementing_osmm = np.mean(implementing_osmm) if implementing_osmm else 0.0
            
            comparison_data.append({
                'method': method_col,
                'mis_score': round(mis_score, 4),
                'total_software': len(implementations),
                'implemented_in': implemented,
                'coverage_pct': round(implemented / len(implementations) * 100, 2),
                'direct_implementations': direct_impl,
                'indirect_implementations': indirect_impl,
                'limited_implementations': limited_impl,
                'not_supported': len(implementations) - implemented,
                'avg_implementation_rank': round(np.mean([imp['rank'] for imp in implementations]), 2),
                'avg_osmm_of_implementers': round(avg_implementing_osmm, 2),
                'implementation_maturity_score': round(avg_implementing_osmm * (implemented / len(implementations)), 2)
            })
        
        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.sort_values('mis_score', ascending=False)
        comparison_df.to_csv(output_file, index=False)
        
        print(f"\n✓ Method comparison report saved to: {output_file}")
        print(f"{'='*70}\n")
        
        return comparison_df





In [58]:
# %%
# =============================================================================
# USAGE WITH FIXED DECIMAL HANDLING
# =============================================================================

# Initialize calculator
method_mis_calc = MethodMISCalculator(
    csv_file="software_analysis_output/software_methods_updated.csv",
    delimiter=';',
    software_name_column='Name',
    osmm_column='OSMM Score',
    method_start_column='Numerical-solvers',
    mis_row_name='Gjennomsnittlig score (MIS)'
)

# Preview
method_mis_calc.show_mis_row_preview()



✓ Method MIS Calculator initialized:
  File: software_analysis_output/software_methods_updated.csv
  Software: 39
  Methods: 190

MIS ROW PREVIEW

Top 20 Methods by MIS Score:
Method                                              MIS Score
-------------------------------------------------------------
power flow analysis                                    1.1734
numerical solvers                                      1.0308
contingency analysis                                   0.9961
voltage control strategy                               0.9705
automatic generation control agc                       0.9705
power system stabilizer                                0.9595
static var compensator                                 0.9453
voltage stability                                      0.9413
load balancing                                         0.9395
security constrained unit commitment                   0.9311
frequency nadir                                        0.9268
dynamic reactive p

In [59]:
# Update CSV with European format (comma as decimal)
updated_df = method_mis_calc.update_mis_row_in_csv(
    output_file="software_methods_with_mis_updated.csv",
    use_comma_as_decimal=True  # ← European format
)

# Create reports
method_mis_scores = method_mis_calc.calculate_method_mis_scores(
    output_file="method_mis_scores.csv",
    include_details=True
)

method_comparison = method_mis_calc.create_method_comparison_report(
    output_file="method_mis_comparison.csv"
)

print("\n✓ All MIS calculations complete with proper decimal handling!")


UPDATING MIS ROW IN CSV

Calculating MIS scores for each method...
  Calculated MIS for 190 methods

✓ Found MIS row at index 36
  Updating 190 method columns...
    Numerical-solvers: nan → 0,0000
    power flow analysis:  1,2726  → 1,1734
    security-constrained optimal power flow:  0,9437  → 0,8587
    security constrained unit commitment:  1,0415  → 0,9311
    Non Linear Optimal Power Flow:  0,7133  → 0,6574
    ... and 185 more

✓ Updated CSV saved to: software_methods_with_mis_updated.csv
  Total updates: 190 method columns
  Decimal format: Comma (European)


CALCULATING MIS SCORES PER METHOD

METHOD MIS CALCULATION COMPLETE

Top 20 Methods by MIS Score:
                              method  mis_score  coverage_percentage  avg_rank
                 power flow analysis     1.1734                97.37      3.84
                   numerical solvers     1.0308                92.11      3.40
                contingency analysis     0.9961                92.11      3.16
    automati