In [1]:
# Cell 1: Install Required Dependencies with Version Constraints
!pip install "numpy<2.0" --upgrade
!pip install openai transformers scikit-learn ast2json pytest coverage pytest-cov pandas matplotlib seaborn tqdm nest_asyncio
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Requirement already up-to-date: numpy<2.0 in ./venv/lib/python3.9/site-packages (1.26.4)
You should consider upgrading via the '/Users/sepehr/IdeaProjects/testgen-council/venv/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/sepehr/IdeaProjects/testgen-council/venv/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/sepehr/IdeaProjects/testgen-council/venv/bin/python3 -m pip install --upgrade pip' command.[0m
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/bin/jupyter-nbextension", line 8, in <module>
    sys.exit(main())
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/jupyter_core/application.py", line 264, in launch_instance
    return super(JupyterApp, cls).launch_instance(argv=argv, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/traitlets/config/applic

In [2]:
# Cell 2: Import Required Libraries with Error Handling
import ast
import json
import os
import re
import subprocess
import tempfile
from typing import List, Dict, Tuple, Any
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Core data science imports
try:
    import numpy as np
    print(f"NumPy version: {np.__version__}")
except ImportError as e:
    print(f"NumPy import error: {e}")

try:
    import pandas as pd
    print(f"Pandas version: {pd.__version__}")
except ImportError as e:
    print(f"Pandas import error: {e}")

# ML and NLP imports
try:
    from sklearn.metrics.pairwise import cosine_similarity
    print("Scikit-learn imported successfully")
except ImportError as e:
    print(f"Scikit-learn import error: {e}")

# OpenAI import
try:
    import openai
    print(f"OpenAI library version: {openai.__version__}")
except ImportError as e:
    print(f"OpenAI import error: {e}")

# Plotting imports
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    plt.rcParams['figure.figsize'] = (12, 8)
    sns.set_style("whitegrid")
    print("Plotting libraries imported successfully")
except ImportError as e:
    print(f"Plotting libraries import error: {e}")

# Progress bar
try:
    from tqdm import tqdm
    print("tqdm imported successfully")
except ImportError as e:
    print(f"tqdm import error: {e}")

print("All imports completed!")

NumPy version: 1.26.4
Pandas version: 2.3.2
Scikit-learn imported successfully
OpenAI library version: 1.108.1
Plotting libraries imported successfully
tqdm imported successfully
All imports completed!


In [3]:
# Cell 3: Configuration and API Setup
SYNTHESIZER_MODEL = "gemini-2.0-flash"

class Config:
    """Configuration class for the intelligent council system"""
    
    # API Keys (replace with your actual keys)
    OPENAI_API_KEY = "sk-JdU36bC7BG2996XHH3YmKOQG8Xm9x9ii5u5E9uwPC54oAkHE"
    
    # Base URLs for different providers
    OPENAI_BASE_URL = "https://api.gapgpt.app/v1"  # Default OpenAI

    
    # Model configurations
    LLM_MODELS = {
        "gemini-2.0-flash": {
            "type": "openai",
            "model_name": "gemini-2.0-flash",
            "base_url": OPENAI_BASE_URL,
            "api_key": OPENAI_API_KEY,
        },
        "grok-3-mini": {
            "type": "openai", 
            "model_name": "grok-3-mini",
            "base_url": OPENAI_BASE_URL,
            "api_key": OPENAI_API_KEY,
        },
        "qwen3-235b-a22b": {
            "type": "openai",
            "model_name": "qwen3-235b-a22b",
            "base_url": OPENAI_BASE_URL,
            "api_key": OPENAI_API_KEY,
        }
    }
    
    # Role-Based Test Generation Personas
    ROLES = {
        "qa_engineer": {
            "name": "By-the-Book QA Engineer",
            "philosophy": "Meticulous and systematic. Focuses on covering the function's explicit requirements.",
            "focus_categories": ["positive", "boundary"],
            "prompt_persona": """You are a meticulous QA Engineer with 15 years of experience in software testing. Your primary goal is to verify that the function behaves exactly as described in its documentation.

YOUR MISSION:
- Generate high-quality, standard tests that cover the core functionality
- Focus on positive test cases (normal, expected usage)
- Test boundary conditions explicitly mentioned in the specification
- Ensure every part of the docstring's promise is tested
- Write clear, maintainable tests that serve as documentation

APPROACH:
1. Read the function signature and docstring carefully
2. Identify all promised behaviors
3. Create tests for typical use cases
4. Test boundary values (min, max, empty, single element)
5. Verify return types and value ranges match specifications

Generate well-structured tests following pytest best practices."""
        },
        
        "agent_of_chaos": {
            "name": "Agent of Chaos",
            "philosophy": "If it can break, I will find a way. Make the function fail.",
            "focus_categories": ["negative", "edge_case"],
            "prompt_persona": """You are a destructive tester known as the "Agent of Chaos". Your mission is to BREAK this function by any means necessary.

YOUR MISSION:
- Find every possible way the function can fail
- Generate tests that SHOULD raise exceptions
- Think about unexpected, malformed, or adversarial inputs
- Test with wrong types, None values, empty data structures
- Push the function beyond its limits

ATTACK VECTORS TO CONSIDER:
1. Type violations (pass string when int expected, etc.)
2. Null/None inputs where objects are expected
3. Empty collections ([], {}, "")
4. Extreme values (very large numbers, very long strings)
5. Negative numbers where positive expected
6. Zero division scenarios
7. Invalid combinations of parameters
8. Corrupted or malformed data structures

Generate tests that you expect will raise specific exceptions (TypeError, ValueError, IndexError, ZeroDivisionError, etc.). Use pytest.raises() to verify these failures."""
        },
        
        "security_auditor": {
            "name": "Paranoid Security Auditor",
            "philosophy": "Trust nothing. Assume all input is hostile.",
            "focus_categories": ["security", "negative"],
            "prompt_persona": """You are a cybersecurity expert and penetration tester. Your task is to find security vulnerabilities in this code.

YOUR MISSION:
- Analyze the function for potential security flaws
- Generate tests that attempt to exploit vulnerabilities
- Think like an attacker trying to compromise the system

SECURITY CONCERNS TO TEST:
1. **Injection Attacks**: SQL injection, command injection, code injection
2. **Path Traversal**: Attempts to access files outside intended directory (../, absolute paths)
3. **Buffer Overflow**: Oversized inputs that might cause issues
4. **Format String Attacks**: Special characters in strings (%s, %d, {}, etc.)
5. **Insecure Deserialization**: Malicious pickled objects or JSON
6. **Input Validation Bypass**: Special characters, Unicode, null bytes
7. **Resource Exhaustion**: Inputs that could cause infinite loops or memory issues
8. **Data Leakage**: Can the function expose sensitive information?

Generate security-focused tests. If the function has file operations, test path traversal. If it processes strings, test injection. If it handles numbers, test integer overflow. If no obvious vulnerabilities exist, test with security-minded inputs (special characters, scripts, oversized data)."""
        },
        
        "abstract_thinker": {
            "name": "Abstract Thinker",
            "philosophy": "Test the underlying properties and invariants, not just specific cases.",
            "focus_categories": ["positive", "boundary", "edge_case"],
            "prompt_persona": """You are a computer scientist specializing in formal methods and property-based testing. Your goal is to verify the fundamental mathematical and logical properties of this function.

YOUR MISSION:
- Think beyond specific test cases to general properties
- Identify invariants that must always hold
- Create tests that verify logical consistency
- Check mathematical properties and relationships

PROPERTIES TO CONSIDER:
1. **Identity Properties**: f(x) with some operation returns x
2. **Inverse Properties**: decode(encode(x)) == x
3. **Idempotency**: f(f(x)) == f(x) for some functions
4. **Commutativity**: Does order matter? f(a,b) == f(b,a)?
5. **Associativity**: f(f(a,b),c) == f(a,f(b,c))?
6. **Preservation Properties**: Input length = output length?
7. **Boundary Properties**: For sorted output, output[i] <= output[i+1]
8. **Type Invariants**: Output type consistent with specification?
9. **Domain/Range Properties**: All outputs within valid range?

Generate property-based tests. You may use standard pytest format or suggest hypothesis library tests. Focus on testing fundamental truths about the function's behavior rather than specific input-output pairs."""
        }
    }
    
    # Model-Role Assignment Strategy
    # This assigns each model to specific roles based on hypothesized strengths
    # You can modify this based on your experimental results

    # MODEL_ROLE_ASSIGNMENTS = {
    #     "gemini-2.0-flash": ["qa_engineer", "abstract_thinker"],
    #     "grok-3-mini": ["agent_of_chaos", "security_auditor"],
    #     "qwen3-235b-a22b": ["qa_engineer", "agent_of_chaos"]
    # }
    
    MODEL_ROLE_ASSIGNMENTS = {
        "gemini-2.0-flash": ["qa_engineer", "abstract_thinker", "agent_of_chaos"],
        "grok-3-mini": ["qa_engineer", "agent_of_chaos"],
        "qwen3-235b-a22b": ["abstract_thinker", "security_auditor"]
    }
    
    # Test categories (kept for backward compatibility)
    TEST_CATEGORIES = [
        "positive",    # مثبت - حالات عادی
        "negative",    # منفی - حالات خطا
        "boundary",    # مرزی - مقادیر حدی
        "edge_case",   # موارد استثنایی
        "security"     # امنیتی
    ]

# Initialize configuration
config = Config()

# Setup API clients
if config.OPENAI_API_KEY != "sk-JdU36bC7BG2996XHH3YmKOQG8Xm9x9ii5u5E9uwPC54oAkHE":
    openai.api_key = config.OPENAI_API_KEY

print("✅ Configuration loaded with role-based personas:")
for role_id, role_info in config.ROLES.items():
    print(f"   🎭 {role_info['name']}")

✅ Configuration loaded with role-based personas:
   🎭 By-the-Book QA Engineer
   🎭 Agent of Chaos
   🎭 Paranoid Security Auditor
   🎭 Abstract Thinker


In [4]:
# Cell 4: Code Analysis and AST Processing Module
class CodeAnalyzer:
    """Analyzes Python code and extracts function information using AST"""
    
    @staticmethod
    def extract_function_info(code: str) -> Dict[str, Any]:
        """Extract function information from Python code"""
        try:
            # Clean up the code string and ensure proper formatting
            code = code.strip()
            
            # Try to parse with ast
            tree = ast.parse(code)
            functions = []
            
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef):
                    # Get function source by reconstructing from lines
                    lines = code.split('\n')
                    start_line = node.lineno - 1
                    end_line = node.end_lineno if hasattr(node, 'end_lineno') else len(lines)
                    
                    func_source = '\n'.join(lines[start_line:end_line])
                    
                    func_info = {
                        'name': node.name,
                        'args': [arg.arg for arg in node.args.args],
                        'docstring': ast.get_docstring(node),
                        'source_code': func_source,
                        'line_start': node.lineno,
                        'line_end': node.end_lineno if hasattr(node, 'end_lineno') else len(lines)
                    }
                    functions.append(func_info)
            
            return {
                'functions': functions,
                'total_functions': len(functions),
                'source_code': code
            }
            
        except SyntaxError as e:
            print(f"Syntax error parsing code: {e}")
            print(f"Error at line {e.lineno}: {e.text}")
            print(f"Code that failed to parse:\n{code}")
            return {'functions': [], 'total_functions': 0, 'source_code': code, 'syntax_error': str(e)}
        except Exception as e:
            print(f"Error parsing code: {e}")
            return {'functions': [], 'total_functions': 0, 'source_code': code, 'error': str(e)}
    
    @staticmethod
    def extract_test_methods_from_response(response: str) -> List[Dict[str, str]]:
        """Extract individual test methods from LLM response"""
        test_methods = []
        
        # Try to find test functions using regex
        test_pattern = r'def (test_\w+)\([^)]*\):(.*?)(?=def test_|\Z)'
        matches = re.findall(test_pattern, response, re.DOTALL)
        
        for match in matches:
            func_name, func_body = match
            full_test = f"def {func_name}():{func_body}"
            test_methods.append({
                'name': func_name,
                'code': full_test.strip()
            })
        
        return test_methods

# Initialize code analyzer
code_analyzer = CodeAnalyzer()

In [5]:
# Cell 5: LLM Council Module (Role-Based Version with Concurrent API Support)
import asyncio
from typing import Dict, Any, List, Tuple
import openai

class LLMCouncil:
    """Manages multiple LLM models with specialized roles for test case generation"""
    
    def __init__(self, config: Config):
        self.config = config
        self.models = config.LLM_MODELS
        self.roles = config.ROLES
        self.model_role_assignments = config.MODEL_ROLE_ASSIGNMENTS
        self.client = openai.OpenAI(
            base_url=config.OPENAI_BASE_URL,
            api_key=config.OPENAI_API_KEY
        )
        self.async_client = openai.AsyncOpenAI(
            base_url=config.OPENAI_BASE_URL,
            api_key=config.OPENAI_API_KEY
        )
        
    def create_role_based_prompt(self, function_info: Dict[str, Any], role_id: str) -> str:
        """Create a role-specific prompt for test case generation"""
        func = function_info['functions'][0] if function_info['functions'] else {}
        role = self.roles[role_id]
        
        prompt = f"""
{role['prompt_persona']}

YOUR ROLE: "{role['name']}"
PHILOSOPHY: {role['philosophy']}

FUNCTION TO TEST:
```python
{func.get('source_code', function_info['source_code'])}

FUNCTION DETAILS:
- Name: {func.get('name', 'unknown')}
- Parameters: {', '.join(func.get('args', [])) if func.get('args') else 'None'}
- Docstring: {func.get('docstring', 'No docstring provided')}

REQUIREMENTS:
1. Stay true to your role as "{role['name']}" - let your {role['philosophy'].lower()} guide your test design
2. Focus on test categories: {', '.join(role['focus_categories'])}
3. Label each test with a category comment from the definitions below:

**CATEGORY DEFINITIONS:**

**# Category: positive**
Valid, typical inputs representing normal usage. Tests the "happy path" to confirm the function fulfills its contract.
Examples: sort([3,1,2]), add(5,3), valid user credentials

**# Category: negative**
Invalid inputs that SHOULD raise exceptions. Tests graceful failure handling. MUST use pytest.raises().
Examples: divide(5,0) → ZeroDivisionError, int("abc") → ValueError, accessing non-existent files
Key: Tests error handling, not malicious exploitation (that's security)

**# Category: boundary**
Values at the LIMITS of valid ranges where behavior might change. Tests threshold values and off-by-one errors.
Examples: For range [1,100] test: 0, 1, 100, 101; empty list vs single element; MIN_INT/MAX_INT
Formula: If valid range is [a,b], test: a-1, a, a+1, b-1, b, b+1

**# Category: edge_case**
VALID but UNUSUAL scenarios - rare but legitimate use cases that might be overlooked.
Examples: already-sorted lists, all duplicates [5,5,5,5], negative indices, float('inf'), unicode "emoji😊"
Key: Unusual but still valid inputs, not boundaries of ranges

**# Category: security**
MALICIOUS/ADVERSARIAL inputs testing exploitation resistance. Focus on attack vectors and vulnerabilities.
Examples: SQL injection "'; DROP TABLE;--", path traversal "../../../etc/passwd", XSS "<script>", command injection "; rm -rf /", extremely long strings (DoS)
Key: Testing if function can be exploited, not just validation (that's negative tests)

4. Use pytest format with descriptive test names
5. Include clear assertions with meaningful error messages
6. Add docstrings explaining what each test verifies

CRITICAL: Your tests must reflect your role's philosophy: {role['philosophy']}
Your unique perspective as "{role['name']}" should be evident in test selection and design.

EXAMPLE FORMAT:

python
import pytest

def test_function_name_descriptive_scenario():
    '''Clear description of what this test verifies'''
    # Category: [appropriate category]
    # Your test implementation here
    result = function_name(test_input)
    assert result == expected, "Clear assertion message"

Generate your role-specific tests now:
"""
        return prompt

    def call_openai_model(self, prompt: str, model_config: Dict) -> str:
        """Call OpenAI API (synchronous version)"""
        try:
            response = self.client.chat.completions.create(
                model=model_config["model_name"],
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error calling OpenAI API: {e}")
            return ""
    
    async def call_openai_model_async(self, prompt: str, model_config: Dict, 
                                      model_name: str, role_id: str) -> Tuple[str, str, str]:
        """Call OpenAI API asynchronously with tracking info"""
        try:
            response = await self.async_client.chat.completions.create(
                model=model_config["model_name"],
                messages=[{"role": "user", "content": prompt}]
            )
            return (model_name, role_id, response.choices[0].message.content)
        except Exception as e:
            print(f"❌ Error calling {model_name} for role {role_id}: {e}")
            return (model_name, role_id, "")

    def generate_tests_from_council(self, function_info: Dict[str, Any]) -> Dict[str, Any]:
        """Generate test cases using role-based assignments (synchronous version)"""
        council_results = {}
        
        print("🤖 Consulting Role-Based LLM Council for test generation...")
        print(f"{'='*70}")
        
        # Calculate total tasks for progress bar
        total_tasks = sum(len(roles) for roles in self.model_role_assignments.values())
        
        with tqdm(total=total_tasks, desc="Generating role-based tests") as pbar:
            for model_name, assigned_roles in self.model_role_assignments.items():
                if model_name not in self.models:
                    print(f"⚠️  Warning: Model {model_name} not found in configuration")
                    continue
                
                model_config = self.models[model_name]
                model_results = {}
                
                for role_id in assigned_roles:
                    if role_id not in self.roles:
                        print(f"⚠️  Warning: Role {role_id} not defined")
                        continue
                    
                    role = self.roles[role_id]
                    
                    try:
                        prompt = self.create_role_based_prompt(function_info, role_id)
                        
                        if model_config["type"] == "openai":
                            response = self.call_openai_model(prompt, model_config)
                        else:
                            response = ""
                        
                        test_methods = code_analyzer.extract_test_methods_from_response(response)
                        
                        model_results[role_id] = {
                            'role_name': role['name'],
                            'raw_response': response,
                            'test_methods': test_methods,
                            'test_count': len(test_methods),
                            'focus_categories': role['focus_categories']
                        }
                        
                        print(f"✅ {model_name} as '{role['name']}': {len(test_methods)} tests")
                        pbar.update(1)
                        
                    except Exception as e:
                        print(f"❌ Error with {model_name} in role {role_id}: {e}")
                        model_results[role_id] = {
                            'role_name': role['name'],
                            'raw_response': "",
                            'test_methods': [],
                            'test_count': 0,
                            'focus_categories': role['focus_categories']
                        }
                        pbar.update(1)
                
                council_results[model_name] = model_results
        
        print(f"{'='*70}")
        return council_results
    
    async def generate_tests_from_council_async(self, function_info: Dict[str, Any], 
                                                 max_concurrent: int = 7) -> Dict[str, Any]:
        """Generate test cases using role-based assignments with concurrent API calls"""
        print("🤖 Consulting Role-Based LLM Council for test generation (Concurrent Mode)...")
        print(f"{'='*70}")
        print(f"⚡ Maximum concurrent requests: {max_concurrent}")
        
        # Create a semaphore to limit concurrent requests
        semaphore = asyncio.Semaphore(max_concurrent)
        
        # Prepare all tasks
        tasks = []
        task_metadata = []  # To track which task belongs to which model-role pair
        
        for model_name, assigned_roles in self.model_role_assignments.items():
            if model_name not in self.models:
                print(f"⚠️  Warning: Model {model_name} not found in configuration")
                continue
            
            model_config = self.models[model_name]
            
            for role_id in assigned_roles:
                if role_id not in self.roles:
                    print(f"⚠️  Warning: Role {role_id} not defined")
                    continue
                
                role = self.roles[role_id]
                
                # Create prompt
                prompt = self.create_role_based_prompt(function_info, role_id)
                
                # Create async task with semaphore
                async def bounded_call(sem, p, mc, mn, rid):
                    async with sem:
                        return await self.call_openai_model_async(p, mc, mn, rid)
                
                task = bounded_call(semaphore, prompt, model_config, model_name, role_id)
                tasks.append(task)
                task_metadata.append({
                    'model_name': model_name,
                    'role_id': role_id,
                    'role_name': role['name'],
                    'focus_categories': role['focus_categories']
                })
        
        total_tasks = len(tasks)
        print(f"📊 Total API calls to make: {total_tasks}")
        
        # Execute all tasks concurrently with progress tracking
        results = []
        with tqdm(total=total_tasks, desc="Concurrent API calls") as pbar:
            # Use asyncio.gather to run all tasks
            for coro in asyncio.as_completed(tasks):
                result = await coro
                results.append(result)
                pbar.update(1)
        
        # Organize results back into the expected structure
        council_results = {}
        
        for (model_name, role_id, response), metadata in zip(results, task_metadata):
            # Ensure model_name matches metadata (it should)
            if model_name not in council_results:
                council_results[model_name] = {}
            
            # Extract test methods from response
            test_methods = code_analyzer.extract_test_methods_from_response(response)
            
            council_results[model_name][role_id] = {
                'role_name': metadata['role_name'],
                'raw_response': response,
                'test_methods': test_methods,
                'test_count': len(test_methods),
                'focus_categories': metadata['focus_categories']
            }
            
            print(f"✅ {model_name} as '{metadata['role_name']}': {len(test_methods)} tests")
        
        print(f"{'='*70}")
        return council_results

# Initialize LLM Council
llm_council = LLMCouncil(config)

In [6]:
# Cell 6: Test Classification Module (Enhanced for Role Tracking)
class TestClassifier:
    """Classifies test cases by category and tracks role assignments"""
    
    @staticmethod
    def extract_category_from_test(test_code: str) -> str:
        """Extract category from test code comments"""
        category_pattern = r'#\s*Category:\s*(\w+)'
        match = re.search(category_pattern, test_code, re.IGNORECASE)
        
        if match:
            category = match.group(1).lower()
            if category in config.TEST_CATEGORIES:
                return category
        
        # Fallback classification based on test name and content
        test_code_lower = test_code.lower()
        
        if 'error' in test_code_lower or 'exception' in test_code_lower or 'invalid' in test_code_lower:
            return 'negative'
        elif 'boundary' in test_code_lower or 'edge' in test_code_lower or 'limit' in test_code_lower:
            return 'boundary'
        elif 'security' in test_code_lower or 'auth' in test_code_lower or 'injection' in test_code_lower:
            return 'security'
        else:
            return 'positive'
    
    @staticmethod
    def classify_council_results(council_results: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Classify all test cases from council results with role information"""
        all_classified_tests = []
        
        for model_name, role_results in council_results.items():
            for role_id, results in role_results.items():
                for test in results['test_methods']:
                    category = TestClassifier.extract_category_from_test(test['code'])
                    classified_test = test.copy()
                    classified_test['category'] = category
                    classified_test['source_model'] = model_name
                    classified_test['source_role'] = role_id
                    classified_test['role_name'] = results['role_name']
                    all_classified_tests.append(classified_test)
        
        return all_classified_tests

# Initialize classifier
test_classifier = TestClassifier()

In [7]:
# Cell 7: Enhanced Test Synthesizer Module (UPDATED)
class TestSynthesizer:
    """Synthesizes final optimized test file with intelligent duplicate removal"""
    
    def __init__(self, llm_council: LLMCouncil):
        self.llm_council = llm_council
    
    def create_synthesis_prompt(self, all_tests: List[Dict], function_info: Dict) -> str:
        """Create prompt for synthesizing final test file with duplicate removal"""
        func = function_info['functions'][0] if function_info['functions'] else {}
        
        # Group tests by model for better presentation
        tests_by_model = {}
        for test in all_tests:
            model = test['source_model']
            if model not in tests_by_model:
                tests_by_model[model] = []
            tests_by_model[model].append(test)
        
        # Create formatted test presentation
        test_presentation = ""
        for model, tests in tests_by_model.items():
            test_presentation += f"\n--- Tests from {model} ---\n"
            for i, test in enumerate(tests, 1):
                test_presentation += f"\nTest {i} (Category: {test['category']}):\n"
                test_presentation += f"```python\n{test['code']}\n```\n"
        
        function_name = func.get('name', 'unknown_function')
        
        prompt = f"""
You are an expert Python test engineer responsible for creating the final, optimized test suite.

ORIGINAL FUNCTION TO TEST:
```python
{func.get('source_code', function_info['source_code'])}
```

RAW TESTS FROM MULTIPLE AI MODELS:
{test_presentation}

YOUR TASK:
1. **DO NOT INCLUDE THE SOURCE FUNCTION**: The function will be saved separately in function.py and imported.

2. **USE IMPORTS**: Start your test file with: from function import {function_name}

3. **ANALYZE ALL TESTS**: Carefully examine each test case for:
   - Functionality being tested
   - Input/output scenarios
   - Edge cases covered
   - Error conditions tested

4. **REMOVE DUPLICATES**: Identify and eliminate tests that are functionally equivalent, even if they have different:
   - Variable names
   - Assertion styles
   - Code structure
   - Comments

5. **SELECT BEST REPRESENTATIVES**: When multiple tests cover the same scenario:
   - Choose the most comprehensive version
   - Prefer tests with better error messages
   - Keep tests with clearer documentation

6. **ENSURE COMPREHENSIVE COVERAGE**: Make sure the final suite covers:
   - All major functionality paths
   - Various input types and ranges
   - Error conditions and edge cases
   - All important test categories (positive, negative, boundary, edge_case, security)

7. **CREATE CLEAN, PRODUCTION-READY CODE**: Generate a single, well-organized test file with:
   - Proper imports (pytest, and function import)
   - Clear test organization by category
   - Descriptive test names and docstrings
   - Comprehensive comments
   - Consistent code style

8. **OPTIMIZE FOR QUALITY**: Prioritize test quality over quantity. Include only meaningful, non-redundant tests.

REQUIREMENTS:
- Start with: import pytest
- Then add: from function import {function_name}
- Use pytest format for all tests
- Group tests logically by category
- Ensure each test has a clear purpose
- Make test names descriptive and consistent
- DO NOT include the original function code
- DO NOT use markdown code fences (```python) in the output - provide clean Python code only

EXAMPLE OUTPUT FORMAT:

import pytest
from function import {function_name}

def test_{function_name}_positive_case():
    '''Test normal functionality with valid inputs'''
    # Category: positive
    result = {function_name}(valid_input)
    assert result == expected_output, "Should handle normal case correctly"

def test_{function_name}_boundary_case():
    '''Test boundary conditions'''
    # Category: boundary
    # Test implementation here

Generate the complete, final test file that represents the best, non-duplicate tests from all models:
"""
        return prompt
    
    def synthesize_final_test_file(self, all_tests: List[Dict], function_info: Dict) -> Dict[str, Any]:
        """Synthesize the final optimized test file with duplicate removal"""
        print("🔄 Synthesizing final test file with intelligent duplicate removal...")
        
        prompt = self.create_synthesis_prompt(all_tests, function_info)
        
        # Use the best available model for synthesis
        best_model = SYNTHESIZER_MODEL if SYNTHESIZER_MODEL in self.llm_council.models else list(self.llm_council.models.keys())[0]
        model_config = self.llm_council.models[best_model]
        
        try:
            if model_config["type"] == "openai":
                synthesized_content = self.llm_council.call_openai_model(prompt, model_config)
                
                # Clean the synthesized content immediately
                cleaned_content = self._clean_synthesized_content(synthesized_content)
                
                # Extract final tests from synthesized content
                final_tests = self._extract_final_tests_from_synthesis(cleaned_content, all_tests)
                
                print(f"✅ Final test file synthesized successfully!")
                print(f"📊 Reduced {len(all_tests)} original tests to {len(final_tests)} unique tests")
                print(f"📉 Reduction ratio: {((len(all_tests) - len(final_tests)) / len(all_tests)) * 100:.1f}%")
                
                return {
                    'synthesized_content': cleaned_content,
                    'final_tests': final_tests,
                    'original_count': len(all_tests),
                    'final_count': len(final_tests),
                    'reduction_ratio': (len(all_tests) - len(final_tests)) / len(all_tests) if len(all_tests) > 0 else 0,
                    'synthesizer_model': best_model
                }
            else:
                return self._fallback_synthesis(all_tests, function_info)
                
        except Exception as e:
            print(f"❌ Error in synthesis, using fallback: {e}")
            return self._fallback_synthesis(all_tests, function_info)
    
    def _clean_synthesized_content(self, content: str) -> str:
        """Clean synthesized content by removing markdown artifacts"""
        lines = content.split('\n')
        cleaned_lines = []
        
        for line in lines:
            stripped_line = line.strip()
            
            # Skip markdown code fence lines
            if stripped_line in ['```python', '```py', '```', '```\n']:
                continue
            
            # Remove leading ```python or ```py from lines that start with it
            if stripped_line.startswith('```python'):
                line = line.replace('```python', '', 1)
            elif stripped_line.startswith('```py'):
                line = line.replace('```py', '', 1)
            elif stripped_line.startswith('```') and stripped_line.endswith('```'):
                # Skip lines that are just code fences
                continue
            
            cleaned_lines.append(line)
        
        # Join back and clean up any extra newlines at start/end
        cleaned_code = '\n'.join(cleaned_lines).strip()
        
        # Remove any remaining triple backticks that might be at the end
        while cleaned_code.endswith('```'):
            cleaned_code = cleaned_code[:-3].strip()
        
        return cleaned_code
    
    def _extract_final_tests_from_synthesis(self, synthesized_content: str, original_tests: List[Dict]) -> List[Dict]:
        """Extract final test information from synthesized content"""
        final_tests = []
        
        # Extract test methods from synthesized content
        test_methods = code_analyzer.extract_test_methods_from_response(synthesized_content)
        
        for test_method in test_methods:
            # Classify the synthesized test
            category = test_classifier.extract_category_from_test(test_method['code'])
            
            final_test = {
                'name': test_method['name'],
                'code': test_method['code'],
                'category': category,
                'source': 'synthesized',
                'original_sources': self._find_original_sources(test_method, original_tests)
            }
            final_tests.append(final_test)
        
        return final_tests
    
    def _find_original_sources(self, synthesized_test: Dict, original_tests: List[Dict]) -> List[str]:
        """Find which original tests likely contributed to the synthesized test"""
        sources = []
        synthesized_name = synthesized_test['name'].lower()
        
        for original in original_tests:
            original_name = original['name'].lower()
            # Simple heuristic: if names are similar or test same functionality
            if (synthesized_name in original_name or original_name in synthesized_name or
                any(word in original_name for word in synthesized_name.split('_') if len(word) > 3)):
                sources.append(original['source_model'])
        
        return list(set(sources))  # Remove duplicates
    
    def _fallback_synthesis(self, all_tests: List[Dict], function_info: Dict) -> Dict[str, Any]:
        """Fallback synthesis method using simple deduplication"""
        func = function_info['functions'][0] if function_info['functions'] else {}
        function_name = func.get('name', 'unknown_function')
        
        # Simple deduplication based on test names and categories
        seen_tests = set()
        unique_tests = []
        
        for test in all_tests:
            test_signature = (test['name'].lower(), test['category'])
            if test_signature not in seen_tests:
                seen_tests.add(test_signature)
                unique_tests.append(test)
        
        header = f'''"""
Comprehensive Test Suite
Generated by Intelligent LLM Council
Target Function: {function_name}
Total Unique Tests: {len(unique_tests)}
Original Tests: {len(all_tests)}
Reduction Ratio: {((len(all_tests) - len(unique_tests)) / len(all_tests)) * 100:.1f}%
"""

import pytest
from function import {function_name}

'''
        
        # Group tests by category
        by_category = {}
        for test in unique_tests:
            category = test['category']
            if category not in by_category:
                by_category[category] = []
            by_category[category].append(test)
        
        # Generate organized test code
        test_code = header
        
        for category, tests in by_category.items():
            test_code += f"\n\n# {category.upper()} TESTS\n"
            test_code += f"# {'='*50}\n\n"
            
            for test in tests:
                # Remove any function definition from test code since we're importing
                test_lines = test['code'].split('\n')
                cleaned_test_lines = []
                for line in test_lines:
                    if line.strip().startswith('def ') and not line.strip().startswith('def test_'):
                        continue  # Skip function definitions that aren't tests
                    cleaned_test_lines.append(line)
                
                test_code += '\n'.join(cleaned_test_lines) + "\n\n"
        
        return {
            'synthesized_content': test_code,
            'final_tests': unique_tests,
            'original_count': len(all_tests),
            'final_count': len(unique_tests),
            'reduction_ratio': (len(all_tests) - len(unique_tests)) / len(all_tests) if len(all_tests) > 0 else 0,
            'synthesizer_model': 'fallback'
        }

# Initialize synthesizer
test_synthesizer = TestSynthesizer(llm_council)

In [8]:
# Cell 8: Coverage Analyzer
import subprocess
import re
import os
from typing import Dict, Any

class CoverageAnalyzer:
    """Analyzes code coverage for generated test files"""
    
    @staticmethod
    def _clean_terminal_output(output: str) -> str:
        """Remove terminal formatting and escape sequences"""
        # Remove ANSI escape sequences
        ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
        cleaned = ansi_escape.sub('', output)
        
        # Remove carriage returns and normalize line endings
        cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')
        
        # Remove any remaining control characters
        cleaned = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', cleaned)
        
        return cleaned
    
    @staticmethod
    def _extract_function_coverage_percentage(stdout: str) -> float:
        """Extract coverage percentage specifically for function.py from pytest output"""
        try:
            # Clean terminal formatting first
            cleaned_stdout = CoverageAnalyzer._clean_terminal_output(stdout)
            
            # More robust regex pattern to find function.py coverage line
            # Pattern matches: function.py followed by whitespace, numbers, whitespace, percentage
            pattern = r'function\.py\s+\d+\s+\d+\s+(\d+)%'
            match = re.search(pattern, cleaned_stdout)
            
            if match:
                percentage = float(match.group(1))
                print(f"Debug: Found function.py coverage: {percentage}%")
                return percentage
            
            # Fallback: Look for function.py line manually
            lines = cleaned_stdout.split('\n')
            for line in lines:
                if 'function.py' in line and '%' in line:
                    print(f"Debug: Processing line: {repr(line)}")
                    
                    # Use regex to extract percentage from this specific line
                    percent_match = re.search(r'(\d+)%', line)
                    if percent_match:
                        percentage = float(percent_match.group(1))
                        print(f"Debug: Extracted percentage: {percentage}%")
                        return percentage
                    
                    # Alternative parsing if regex fails
                    parts = line.split()
                    print(f"Debug: Line parts: {parts}")
                    for part in parts:
                        if part.endswith('%'):
                            try:
                                percentage_str = part.rstrip('%')
                                percentage = float(percentage_str)
                                print(f"Debug: Parsed percentage from part '{part}': {percentage}%")
                                return percentage
                            except ValueError:
                                continue
            
            # If function.py not found specifically, return 0
            print("Warning: function.py coverage not found in output")
            print(f"Debug: Cleaned stdout:\n{cleaned_stdout}")
            return 0.0
            
        except Exception as e:
            print(f"Warning: Could not extract function.py coverage percentage: {e}")
            print(f"Debug: Raw stdout:\n{stdout}")
            return 0.0
    
    @staticmethod
    def _extract_test_execution_stats(stdout: str) -> Dict[str, int]:
        """Extract test execution statistics from pytest output"""
        try:
            cleaned_stdout = CoverageAnalyzer._clean_terminal_output(stdout)
            
            stats = {
                'total_tests': 0,
                'passed_tests': 0,
                'failed_tests': 0,
                'skipped_tests': 0,
                'error_tests': 0
            }
            
            # Look for test collection line: "collected X items"
            collection_match = re.search(r'collected (\d+) items?', cleaned_stdout)
            if collection_match:
                stats['total_tests'] = int(collection_match.group(1))
                print(f"Debug: Found total tests: {stats['total_tests']}")
            
            # Look for final result line: "X passed, Y failed, Z skipped in N.NNs"
            # Various patterns to catch different pytest output formats
            result_patterns = [
                r'(\d+) passed.*?in \d+\.\d+s',  # X passed in N.NNs
                r'(\d+) passed, (\d+) failed.*?in \d+\.\d+s',  # X passed, Y failed in N.NNs
                r'(\d+) passed, (\d+) skipped.*?in \d+\.\d+s',  # X passed, Y skipped in N.NNs
                r'(\d+) failed.*?in \d+\.\d+s',  # X failed in N.NNs
                r'(\d+) skipped.*?in \d+\.\d+s',  # X skipped in N.NNs
            ]
            
            # Try the comprehensive pattern first
            comprehensive_pattern = r'(?:(\d+) failed.*?)?(?:(\d+) passed.*?)?(?:(\d+) skipped.*?)?(?:(\d+) error.*?)?in \d+\.\d+s'
            match = re.search(comprehensive_pattern, cleaned_stdout)
            
            if match:
                failed, passed, skipped, errors = match.groups()
                if passed:
                    stats['passed_tests'] = int(passed)
                if failed:
                    stats['failed_tests'] = int(failed)
                if skipped:
                    stats['skipped_tests'] = int(skipped)
                if errors:
                    stats['error_tests'] = int(errors)
                    
                print(f"Debug: Extracted test stats - Passed: {stats['passed_tests']}, Failed: {stats['failed_tests']}, Skipped: {stats['skipped_tests']}")
            else:
                # Fallback: look for simpler patterns
                passed_match = re.search(r'(\d+) passed', cleaned_stdout)
                if passed_match:
                    stats['passed_tests'] = int(passed_match.group(1))
                    
                failed_match = re.search(r'(\d+) failed', cleaned_stdout)
                if failed_match:
                    stats['failed_tests'] = int(failed_match.group(1))
                    
                skipped_match = re.search(r'(\d+) skipped', cleaned_stdout)
                if skipped_match:
                    stats['skipped_tests'] = int(skipped_match.group(1))
                    
                print(f"Debug: Fallback extraction - Passed: {stats['passed_tests']}, Failed: {stats['failed_tests']}")
            
            # If we couldn't get total from collection, calculate from individual counts
            if stats['total_tests'] == 0:
                stats['total_tests'] = stats['passed_tests'] + stats['failed_tests'] + stats['skipped_tests'] + stats['error_tests']
            
            return stats
            
        except Exception as e:
            print(f"Warning: Could not extract test execution stats: {e}")
            return {
                'total_tests': 0,
                'passed_tests': 0,
                'failed_tests': 0,
                'skipped_tests': 0,
                'error_tests': 0
            }
    
    @staticmethod
    def _clean_test_code(code: str) -> str:
        """Clean test code by removing markdown formatting and extra whitespace"""
        # Remove markdown code block markers
        code = re.sub(r'```python\s*\n?', '', code)
        code = re.sub(r'```\s*$', '', code)
        
        # Remove leading/trailing whitespace from each line and rejoin
        lines = [line.rstrip() for line in code.split('\n')]
        cleaned_code = '\n'.join(lines).strip()
        
        return cleaned_code
    
    @staticmethod
    def analyze_coverage(function_code: str, test_code: str) -> Dict[str, Any]:
        """
        Analyze code coverage by running the test against the function
        
        Returns:
            Dict containing coverage information and analysis
        """
        import tempfile
        import shutil
        
        coverage_info = {
            'coverage_percentage': 0.0,
            'test_passed': False,
            'error_message': None,
            'stdout': '',
            'stderr': '',
            'total_tests': 0,
            'passed_tests': 0,
            'failed_tests': 0,
            'skipped_tests': 0,
            'error_tests': 0,
            'success_rate': 0.0
        }
        
        # Create temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            try:
                # Write function code to file
                function_file = os.path.join(temp_dir, "function.py")
                with open(function_file, 'w') as f:
                    f.write(function_code)
                
                # Clean and write test code to file
                cleaned_test_code = CoverageAnalyzer._clean_test_code(test_code)
                test_file = os.path.join(temp_dir, "test_generated.py")
                with open(test_file, 'w') as f:
                    f.write(cleaned_test_code)
                
                # Change to temp directory
                original_cwd = os.getcwd()
                os.chdir(temp_dir)
                
                try:
                    # Run pytest with coverage
                    cmd = ["python", "-m", "pytest", "test_generated.py", "--cov=.", "--cov-report=term-missing", "-v"]
                    
                    result = subprocess.run(
                        cmd,
                        capture_output=True,
                        text=True,
                        timeout=30
                    )
                    
                    coverage_info['stdout'] = result.stdout
                    coverage_info['stderr'] = result.stderr
                    coverage_info['test_passed'] = result.returncode == 0
                    
                    # Extract coverage percentage for function.py
                    coverage_percentage = CoverageAnalyzer._extract_function_coverage_percentage(result.stdout)
                    coverage_info['coverage_percentage'] = coverage_percentage
                    
                    # Extract test execution statistics
                    test_stats = CoverageAnalyzer._extract_test_execution_stats(result.stdout)
                    coverage_info.update(test_stats)
                    
                    # Calculate success rate
                    if coverage_info['total_tests'] > 0:
                        coverage_info['success_rate'] = (coverage_info['passed_tests'] / coverage_info['total_tests']) * 100.0
                    else:
                        coverage_info['success_rate'] = 0.0
                    
                    print(f"Coverage analysis complete. Function.py coverage: {coverage_percentage}%")
                    print(f"Test results: {coverage_info['passed_tests']}/{coverage_info['total_tests']} passed ({coverage_info['success_rate']:.1f}%)")
                    
                finally:
                    # Restore original working directory
                    os.chdir(original_cwd)
                    
            except subprocess.TimeoutExpired:
                coverage_info['error_message'] = "Test execution timed out"
                print("Error: Test execution timed out")
            except Exception as e:
                coverage_info['error_message'] = str(e)
                print(f"Error during coverage analysis: {e}")
        
        return coverage_info


# Initialize coverage analyzer
coverage_analyzer = CoverageAnalyzer()

In [9]:
# Cell 9: Main Pipeline Orchestrator (Role-Based Version)
class IntelligentTestCouncil:
    """Main orchestrator for the intelligent role-based test generation pipeline"""
    
    def __init__(self, config: Config):
        self.config = config
        self.code_analyzer = CodeAnalyzer()
        self.llm_council = LLMCouncil(config)
        self.test_classifier = TestClassifier()
        self.test_synthesizer = TestSynthesizer(self.llm_council)
        self.coverage_analyzer = CoverageAnalyzer()
        
    def generate_comprehensive_tests(self, function_code: str) -> Dict[str, Any]:
        """Main pipeline for generating comprehensive test suite with role-based generation"""
        print("🚀 Starting Role-Based Intelligent Test Council Pipeline")
        print("=" * 70)
        
        # Step 1: Analyze the input function
        print("\n📝 Step 1: Analyzing input function...")
        function_info = self.code_analyzer.extract_function_info(function_code)
        
        if not function_info['functions']:
            error_msg = 'No functions found in the provided code'
            if 'syntax_error' in function_info:
                error_msg += f". Syntax error: {function_info['syntax_error']}"
            return {'error': error_msg}
        
        print(f"✅ Found {function_info['total_functions']} function(s)")
        
        # Step 2: Generate tests using role-based LLM council
        print("\n🎭 Step 2: Consulting Role-Based LLM Council...")
        council_results = self.llm_council.generate_tests_from_council(function_info)
        
        # Step 3: Classify all test cases
        print("\n🏷️  Step 3: Classifying test cases by category and role...")
        all_classified_tests = self.test_classifier.classify_council_results(council_results)
        
        print(f"✅ Total tests generated: {len(all_classified_tests)}")
        
        # Display role distribution
        role_counts = Counter(test['role_name'] for test in all_classified_tests)
        print("\n🎭 Role distribution:")
        for role_name, count in role_counts.items():
            print(f"   • {role_name}: {count} tests")
        
        # Display category distribution
        category_counts = Counter(test['category'] for test in all_classified_tests)
        print("\n📊 Category distribution:")
        for category, count in category_counts.items():
            print(f"   • {category}: {count} tests")
        
        # Display model-role performance matrix
        print("\n🔬 Model-Role Performance Matrix:")
        for model_name, role_results in council_results.items():
            print(f"\n   {model_name}:")
            for role_id, results in role_results.items():
                print(f"      └─ {results['role_name']}: {results['test_count']} tests")
        
        # Step 4: Synthesize final test file with intelligent duplicate removal
        print(f"\n🔄 Step 4: Synthesizing final test file with duplicate removal...")
        synthesis_results = self.test_synthesizer.synthesize_final_test_file(all_classified_tests, function_info)
        
        # Step 5: Analyze coverage
        print("\n📊 Step 5: Analyzing code coverage...")
        coverage_results = self.coverage_analyzer.analyze_coverage(
            function_code, 
            synthesis_results['synthesized_content']
        )
        
        # Prepare comprehensive results with role-based metrics
        results = {
            'function_info': function_info,
            'council_results': council_results,
            'all_classified_tests': all_classified_tests,
            'synthesis_results': synthesis_results,
            'final_test_file': synthesis_results['synthesized_content'],
            'coverage_results': coverage_results,
            'statistics': {
                'original_test_count': len(all_classified_tests),
                'final_test_count': synthesis_results['final_count'],
                'reduction_ratio': synthesis_results['reduction_ratio'],
                'coverage_percentage': coverage_results.get('coverage_percentage', 0.0),
                'test_success_rate': coverage_results.get('success_rate', 0.0),
                'total_tests_run': coverage_results.get('total_tests', 0),
                'passed_tests': coverage_results.get('passed_tests', 0),
                'failed_tests': coverage_results.get('failed_tests', 0),
                'skipped_tests': coverage_results.get('skipped_tests', 0),
                'error_tests': coverage_results.get('error_tests', 0),
                'models_used': list(council_results.keys()),
                'roles_used': list(set(test['role_name'] for test in all_classified_tests)),
                'categories_found': list(category_counts.keys()),
                'synthesizer_model': synthesis_results['synthesizer_model'],
                # Role-specific metrics
                'tests_per_role': dict(role_counts),
                'tests_per_category': dict(category_counts),
                'model_role_matrix': {
                    model: {role: results['test_count'] for role, results in roles.items()}
                    for model, roles in council_results.items()
                }
            }
        }
        
        print("\n🎉 Pipeline completed successfully!")
        print(f"📊 Test Success Rate: {coverage_results.get('success_rate', 0.0):.1f}%")
        print(f"📈 Code Coverage: {coverage_results.get('coverage_percentage', 0.0):.1f}%")
        print(f"✅ Passed Tests: {coverage_results.get('passed_tests', 0)}/{coverage_results.get('total_tests', 0)}")
        print("=" * 70)
        
        return results
    
    def clean_python_code(self, code_content: str) -> str:
        """Clean Python code by removing markdown code fences and extra formatting"""
        lines = code_content.split('\n')
        cleaned_lines = []
        
        for line in lines:
            stripped_line = line.strip()
            
            # Skip markdown code fence lines
            if stripped_line in ['```python', '```py', '```']:
                continue
            
            # Remove leading ```python or ```py from lines that start with it
            if stripped_line.startswith('```python'):
                line = line.replace('```python', '', 1)
            elif stripped_line.startswith('```py'):
                line = line.replace('```py', '', 1)
            elif stripped_line.startswith('```') and stripped_line.endswith('```'):
                # Skip lines that are just code fences
                continue
            
            cleaned_lines.append(line)
        
        # Join back and clean up any extra newlines at start/end
        cleaned_code = '\n'.join(cleaned_lines).strip()
        
        # Remove any remaining triple backticks that might be at the end
        while cleaned_code.endswith('```'):
            cleaned_code = cleaned_code[:-3].strip()
        
        return cleaned_code
    
    def save_results(self, results: Dict[str, Any], output_dir: str = "./test_output"):
        """Save all results to files with separate function file"""
        os.makedirs(output_dir, exist_ok=True)

        # Get the original function code
        function_info = results['function_info']
        if function_info['functions']:
            function_code = function_info['functions'][0]['source_code']
        else:
            function_code = function_info['source_code']

        # Save the function under test to function.py
        with open(f"{output_dir}/function.py", 'w', encoding='utf-8') as f:
            f.write(function_code)

        # The test file content should already be cleaned from synthesis
        test_file_content = results['final_test_file']

        # Additional cleaning just in case
        cleaned_test_content = self.clean_python_code(test_file_content)

        # Save final test file (cleaned Python code) 
        with open(f"{output_dir}/test_generated.py", 'w', encoding='utf-8') as f:
            f.write(cleaned_test_content)

        # Save detailed results
        with open(f"{output_dir}/analysis_results.json", 'w', encoding='utf-8') as f:
            # Convert results to JSON-serializable format
            json_results = results.copy()
            # Remove non-serializable content
            if 'synthesis_results' in json_results:
                json_results['synthesis_results'] = {
                    k: v for k, v in json_results['synthesis_results'].items() 
                    if k != 'synthesized_content'  # This is already in final_test_file
                }
            json.dump(json_results, f, indent=2, ensure_ascii=False)

        print(f"\n📁 Results saved to {output_dir}/")
        print(f"🔧 Function under test: {output_dir}/function.py")
        print(f"✅ Clean Python test file: {output_dir}/test_generated.py")
        print(f"📊 Analysis results: {output_dir}/analysis_results.json")
        
        # Print role-based summary
        if 'statistics' in results and 'tests_per_role' in results['statistics']:
            print(f"\n🎭 Role-Based Generation Summary:")
            for role, count in results['statistics']['tests_per_role'].items():
                print(f"   • {role}: {count} tests")

# Initialize the main pipeline
intelligent_council = IntelligentTestCouncil(config)

In [10]:
# # Cell 10: Role Assignment Optimization Experiment
# import random
# import json
# from collections import defaultdict
# from datetime import datetime
# import pandas as pd
# import asyncio
# import nest_asyncio

# # Enable nested event loops for Jupyter compatibility
# nest_asyncio.apply()

# class RoleAssignmentExperiment:
#     """Conducts experiments to determine optimal model-role assignments with concurrent processing"""
    
#     def __init__(self, llm_council, code_analyzer, test_classifier):
#         self.llm_council = llm_council
#         self.code_analyzer = code_analyzer
#         self.test_classifier = test_classifier
#         self.results = []
        
#     def load_dataset(self, dataset_path: str) -> List[Dict]:
#         """Load functions from dataset"""
#         try:
#             with open(dataset_path, 'r') as f:
#                 data = json.load(f)
#             functions = data.get('functions', [])
#             print(f"✅ Loaded {len(functions)} functions from dataset")
#             return functions
#         except Exception as e:
#             print(f"❌ Error loading dataset: {e}")
#             return []
    
#     def sample_functions(self, functions: List[Dict], n: int = 20, seed: int = 42) -> List[Dict]:
#         """Randomly sample n functions from dataset"""
#         random.seed(seed)
#         sampled = random.sample(functions, min(n, len(functions)))
#         print(f"📊 Sampled {len(sampled)} functions for experiment")
#         return sampled
    
#     async def process_single_function_async(self, func_data: Dict, func_idx: int, total: int) -> Dict[str, Any]:
#         """Process a single function through the council asynchronously"""
#         print(f"\n{'='*80}")
#         print(f"Processing function {func_idx}/{total}: {func_data.get('name', 'unknown')}")
#         print(f"Category: {func_data.get('category', 'unknown')}")
#         print(f"{'='*80}")
        
#         try:
#             # Extract function info
#             function_info = self.code_analyzer.extract_function_info(func_data['source'])
            
#             if not function_info['functions']:
#                 print(f"⚠️ Could not parse function {func_data.get('name', 'unknown')}")
#                 return None
            
#             # Generate tests from council using concurrent API calls
#             council_results = await self.llm_council.generate_tests_from_council_async(
#                 function_info, 
#                 max_concurrent=7
#             )
            
#             # Classify tests
#             classified_tests = self.test_classifier.classify_council_results(council_results)
            
#             # Aggregate statistics
#             stats = self._aggregate_function_stats(func_data, classified_tests)
            
#             return stats
            
#         except Exception as e:
#             print(f"❌ Error processing function {func_data.get('name', 'unknown')}: {e}")
#             import traceback
#             traceback.print_exc()
#             return None
    
#     def _aggregate_function_stats(self, func_data: Dict, classified_tests: List[Dict]) -> Dict[str, Any]:
#         """Aggregate statistics for a single function"""
#         stats = {
#             'function_name': func_data.get('name', 'unknown'),
#             'function_category': func_data.get('category', 'unknown'),
#             'function_file': func_data.get('file', 'unknown'),
#             'total_tests_generated': len(classified_tests),
#             'model_role_category_matrix': defaultdict(lambda: defaultdict(lambda: defaultdict(int))),
#             'model_totals': defaultdict(int),
#             'role_totals': defaultdict(int),
#             'category_totals': defaultdict(int),
#             'tests': classified_tests
#         }
        
#         # Count tests by model, role, and category
#         for test in classified_tests:
#             model = test['source_model']
#             role = test['source_role']
#             category = test['category']
            
#             stats['model_role_category_matrix'][model][role][category] += 1
#             stats['model_totals'][model] += 1
#             stats['role_totals'][role] += 1
#             stats['category_totals'][category] += 1
        
#         return stats
    
#     async def run_experiment_async(self, dataset_path: str, n_functions: int = 20, seed: int = 42) -> Dict[str, Any]:
#         """Run the complete role assignment optimization experiment with concurrent processing"""
#         print("🚀 Starting Role Assignment Optimization Experiment (Concurrent Mode)")
#         print(f"Target: {n_functions} functions")
#         print(f"Random seed: {seed}")
#         print(f"Maximum concurrent API requests: 10")
#         print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        
#         # Load and sample functions
#         all_functions = self.load_dataset(dataset_path)
#         if not all_functions:
#             return {'error': 'Failed to load dataset'}
        
#         sampled_functions = self.sample_functions(all_functions, n_functions, seed)
        
#         # Process each function
#         function_results = []
#         for idx, func_data in enumerate(sampled_functions, 1):
#             result = await self.process_single_function_async(func_data, idx, len(sampled_functions))
#             if result:
#                 function_results.append(result)
#                 self._save_checkpoint(function_results, idx)
        
#         # Aggregate cross-function statistics
#         aggregated_stats = self._aggregate_cross_function_stats(function_results)
        
#         # Generate recommendations
#         recommendations = self._generate_role_recommendations(aggregated_stats)
        
#         experiment_results = {
#             'timestamp': datetime.now().isoformat(),
#             'n_functions_processed': len(function_results),
#             'n_functions_target': n_functions,
#             'seed': seed,
#             'max_concurrent_requests': 10,
#             'function_results': function_results,
#             'aggregated_stats': aggregated_stats,
#             'recommendations': recommendations
#         }
        
#         # Save final results
#         self._save_experiment_results(experiment_results)
        
#         # Print summary
#         self._print_experiment_summary(experiment_results)
        
#         return experiment_results
    
#     def run_experiment(self, dataset_path: str, n_functions: int = 20, seed: int = 42) -> Dict[str, Any]:
#         """Wrapper to run async experiment from sync context (Jupyter-compatible)"""
#         # Check if there's already a running event loop (Jupyter)
#         try:
#             loop = asyncio.get_running_loop()
#             # We're in Jupyter, use the existing loop
#             return loop.run_until_complete(self.run_experiment_async(dataset_path, n_functions, seed))
#         except RuntimeError:
#             # No running loop, create a new one
#             return asyncio.run(self.run_experiment_async(dataset_path, n_functions, seed))
    
#     def _aggregate_cross_function_stats(self, function_results: List[Dict]) -> Dict[str, Any]:
#         """Aggregate statistics across all functions"""
#         aggregated = {
#             'total_tests': 0,
#             'model_role_category_totals': defaultdict(lambda: defaultdict(lambda: defaultdict(int))),
#             'model_category_totals': defaultdict(lambda: defaultdict(int)),
#             'role_category_totals': defaultdict(lambda: defaultdict(int)),
#             'model_totals': defaultdict(int),
#             'role_totals': defaultdict(int),
#             'category_totals': defaultdict(int)
#         }
        
#         for func_result in function_results:
#             aggregated['total_tests'] += func_result['total_tests_generated']
            
#             # Aggregate model × role × category
#             for model, roles in func_result['model_role_category_matrix'].items():
#                 for role, categories in roles.items():
#                     for category, count in categories.items():
#                         aggregated['model_role_category_totals'][model][role][category] += count
#                         aggregated['model_category_totals'][model][category] += count
#                         aggregated['role_category_totals'][role][category] += count
            
#             # Aggregate totals
#             for model, count in func_result['model_totals'].items():
#                 aggregated['model_totals'][model] += count
#             for role, count in func_result['role_totals'].items():
#                 aggregated['role_totals'][role] += count
#             for category, count in func_result['category_totals'].items():
#                 aggregated['category_totals'][category] += count
        
#         return aggregated
    
#     def _generate_role_recommendations(self, aggregated_stats: Dict) -> Dict[str, Any]:
#         """Generate role assignment recommendations based on performance"""
#         recommendations = {
#             'model_strengths': {},
#             'optimal_assignments': {},
#             'specialization_scores': {}
#         }
        
#         # Calculate specialization scores for each model-role-category combination
#         for model, roles in aggregated_stats['model_role_category_totals'].items():
#             model_strengths = defaultdict(dict)
            
#             for role, categories in roles.items():
#                 role_info = self.llm_council.roles[role]
#                 focus_categories = role_info['focus_categories']
                
#                 # Calculate alignment score: tests in focus categories / total tests
#                 focus_count = sum(categories.get(cat, 0) for cat in focus_categories)
#                 total_count = sum(categories.values())
                
#                 if total_count > 0:
#                     alignment_score = focus_count / total_count
#                     productivity_score = total_count  # Raw number of tests
                    
#                     # Combined score: weighted average of alignment and productivity
#                     combined_score = (alignment_score * 0.6) + (min(productivity_score / 10, 1.0) * 0.4)
                    
#                     model_strengths[role] = {
#                         'alignment_score': alignment_score,
#                         'productivity_score': productivity_score,
#                         'combined_score': combined_score,
#                         'focus_categories': focus_categories,
#                         'category_distribution': dict(categories)
#                     }
            
#             recommendations['model_strengths'][model] = dict(model_strengths)
        
#         # Determine optimal assignments (highest combined score for each model)
#         for model, strengths in recommendations['model_strengths'].items():
#             if strengths:
#                 best_roles = sorted(strengths.items(), key=lambda x: x[1]['combined_score'], reverse=True)
#                 recommendations['optimal_assignments'][model] = [role for role, _ in best_roles[:2]]  # Top 2 roles
        
#         return recommendations
    
#     def _save_checkpoint(self, function_results: List[Dict], checkpoint_num: int):
#         """Save checkpoint of results"""
#         checkpoint_path = f'experiment_checkpoint_{checkpoint_num}.json'
#         try:
#             # Convert defaultdict to regular dict for JSON serialization
#             serializable_results = []
#             for result in function_results:
#                 serializable_result = result.copy()
#                 serializable_result['model_role_category_matrix'] = {
#                     model: {
#                         role: dict(categories)
#                         for role, categories in roles.items()
#                     }
#                     for model, roles in result['model_role_category_matrix'].items()
#                 }
#                 serializable_result['model_totals'] = dict(result['model_totals'])
#                 serializable_result['role_totals'] = dict(result['role_totals'])
#                 serializable_result['category_totals'] = dict(result['category_totals'])
#                 serializable_results.append(serializable_result)
            
#             with open(checkpoint_path, 'w') as f:
#                 json.dump(serializable_results, f, indent=2)
#         except Exception as e:
#             print(f"⚠️ Could not save checkpoint: {e}")
    
#     def _save_experiment_results(self, experiment_results: Dict):
#         """Save final experiment results"""
#         output_path = f'role_assignment_experiment_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
        
#         try:
#             # Make results JSON serializable
#             serializable_results = self._make_serializable(experiment_results)
            
#             with open(output_path, 'w') as f:
#                 json.dump(serializable_results, f, indent=2)
            
#             print(f"\n💾 Results saved to: {output_path}")
#         except Exception as e:
#             print(f"⚠️ Could not save experiment results: {e}")
    
#     def _make_serializable(self, obj):
#         """Convert defaultdicts and other non-serializable objects to regular dicts"""
#         if isinstance(obj, defaultdict):
#             return {k: self._make_serializable(v) for k, v in obj.items()}
#         elif isinstance(obj, dict):
#             return {k: self._make_serializable(v) for k, v in obj.items()}
#         elif isinstance(obj, list):
#             return [self._make_serializable(item) for item in obj]
#         else:
#             return obj
    
#     def _print_experiment_summary(self, experiment_results: Dict):
#         """Print a comprehensive summary of experiment results"""
#         print(f"\n{'='*80}")
#         print("📊 ROLE ASSIGNMENT OPTIMIZATION EXPERIMENT SUMMARY")
#         print(f"{'='*80}\n")
        
#         stats = experiment_results['aggregated_stats']
#         recs = experiment_results['recommendations']
        
#         print(f"Functions Processed: {experiment_results['n_functions_processed']}/{experiment_results['n_functions_target']}")
#         print(f"Total Tests Generated: {stats['total_tests']}")
#         print(f"Max Concurrent Requests: {experiment_results.get('max_concurrent_requests', 'N/A')}")
#         print(f"\n{'─'*80}\n")
        
#         # Model productivity
#         print("📈 MODEL PRODUCTIVITY (Total Tests Generated):")
#         for model, count in sorted(stats['model_totals'].items(), key=lambda x: x[1], reverse=True):
#             print(f"   {model}: {count} tests")
        
#         print(f"\n{'─'*80}\n")
        
#         # Role distribution
#         print("🎭 ROLE DISTRIBUTION (Total Tests per Role):")
#         for role, count in sorted(stats['role_totals'].items(), key=lambda x: x[1], reverse=True):
#             role_name = self.llm_council.roles[role]['name']
#             print(f"   {role_name}: {count} tests")
        
#         print(f"\n{'─'*80}\n")
        
#         # Category distribution
#         print("📁 CATEGORY DISTRIBUTION (Total Tests per Category):")
#         for category, count in sorted(stats['category_totals'].items(), key=lambda x: x[1], reverse=True):
#             print(f"   {category}: {count} tests")
        
#         print(f"\n{'='*80}\n")
#         print("🎯 RECOMMENDED OPTIMAL ROLE ASSIGNMENTS:")
#         print(f"{'='*80}\n")
        
#         for model, roles in recs['optimal_assignments'].items():
#             print(f"💡 {model}:")
#             for role in roles:
#                 role_name = self.llm_council.roles[role]['name']
#                 strength = recs['model_strengths'][model][role]
#                 print(f"   → {role_name}")
#                 print(f"      Alignment: {strength['alignment_score']:.2%}")
#                 print(f"      Productivity: {strength['productivity_score']} tests")
#                 print(f"      Combined Score: {strength['combined_score']:.3f}")
#                 print()
        
#         print(f"{'='*80}\n")
        
#         # Create detailed performance matrix
#         self._print_performance_matrix(stats, recs)
    
#     def _print_performance_matrix(self, stats: Dict, recs: Dict):
#         """Print detailed performance matrix"""
#         print("📊 DETAILED PERFORMANCE MATRIX (Model × Role × Category):")
#         print(f"{'='*80}\n")
        
#         for model in stats['model_role_category_totals'].keys():
#             print(f"🤖 {model}:")
#             print(f"{'─'*76}")
            
#             if model in recs['model_strengths']:
#                 for role, strength_data in recs['model_strengths'][model].items():
#                     role_name = self.llm_council.roles[role]['name']
#                     categories = strength_data['category_distribution']
                    
#                     print(f"\n   🎭 {role_name}:")
#                     print(f"      Focus Categories: {', '.join(strength_data['focus_categories'])}")
#                     print(f"      Performance:")
                    
#                     for category in sorted(categories.keys()):
#                         count = categories[category]
#                         is_focus = category in strength_data['focus_categories']
#                         marker = "★" if is_focus else " "
#                         print(f"         {marker} {category}: {count} tests")
                    
#                     print(f"      → Alignment: {strength_data['alignment_score']:.2%} | "
#                           f"Combined Score: {strength_data['combined_score']:.3f}")
            
#             print()

# # Initialize and run experiment
# experiment = RoleAssignmentExperiment(llm_council, code_analyzer, test_classifier)

# # Run the experiment with 20 random functions using concurrent API calls
# print("⚡ Starting experiment with concurrent API processing...")
# results = experiment.run_experiment(
#     dataset_path='data/python_algorithms_dataset.json',
#     n_functions=20,
#     seed=42
# )

In [11]:
# # Cell 11: Analyze Role Assignment Results from Checkpoints
# import json
# import glob
# from collections import defaultdict
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import numpy as np

# class RoleAssignmentAnalyzer:
#     """Analyze checkpoint results to determine optimal model-role assignments"""
    
#     def __init__(self, roles_config):
#         """
#         Args:
#             roles_config: Dictionary of role configurations from llm_council.roles
#         """
#         self.roles_config = roles_config
#         self.aggregated_data = None
#         self.recommendations = None
    
#     def load_checkpoints(self, checkpoint_pattern='experiment_checkpoint_*.json'):
#         """Load all checkpoint files and aggregate results"""
#         checkpoint_files = sorted(glob.glob(checkpoint_pattern), 
#                                  key=lambda x: int(x.split('_')[-1].split('.')[0]))
        
#         if not checkpoint_files:
#             print(f"❌ No checkpoint files found matching pattern: {checkpoint_pattern}")
#             return None
        
#         print(f"📂 Found {len(checkpoint_files)} checkpoint files")
        
#         # Aggregate all checkpoint data
#         all_function_results = []
#         for checkpoint_file in checkpoint_files:
#             with open(checkpoint_file, 'r') as f:
#                 checkpoint_data = json.load(f)
#                 # Each checkpoint file contains a list with one function result
#                 all_function_results.extend(checkpoint_data)
        
#         print(f"✅ Loaded {len(all_function_results)} function results")
        
#         # Aggregate statistics
#         self.aggregated_data = self._aggregate_statistics(all_function_results)
#         return self.aggregated_data
    
#     def _aggregate_statistics(self, function_results):
#         """Aggregate statistics across all function results"""
#         aggregated = {
#             'total_tests': 0,
#             'model_role_category_matrix': defaultdict(lambda: defaultdict(lambda: defaultdict(int))),
#             'model_totals': defaultdict(int),
#             'role_totals': defaultdict(int),
#             'category_totals': defaultdict(int)
#         }
        
#         for func_result in function_results:
#             aggregated['total_tests'] += func_result['total_tests_generated']
            
#             # Aggregate model × role × category
#             for model, roles in func_result['model_role_category_matrix'].items():
#                 for role, categories in roles.items():
#                     for category, count in categories.items():
#                         aggregated['model_role_category_matrix'][model][role][category] += count
            
#             # Aggregate totals
#             for model, count in func_result['model_totals'].items():
#                 aggregated['model_totals'][model] += count
#             for role, count in func_result['role_totals'].items():
#                 aggregated['role_totals'][role] += count
#             for category, count in func_result['category_totals'].items():
#                 aggregated['category_totals'][category] += count
        
#         return aggregated
    
#     def calculate_scores(self):
#         """Calculate alignment, productivity, and combined scores for each model-role combination"""
#         if self.aggregated_data is None:
#             print("❌ No aggregated data. Run load_checkpoints() first.")
#             return None
        
#         scores = defaultdict(lambda: defaultdict(dict))
        
#         for model, roles in self.aggregated_data['model_role_category_matrix'].items():
#             for role, categories in roles.items():
#                 role_info = self.roles_config[role]
#                 focus_categories = role_info['focus_categories']
                
#                 # Calculate metrics
#                 focus_count = sum(categories.get(cat, 0) for cat in focus_categories)
#                 total_count = sum(categories.values())
                
#                 if total_count > 0:
#                     alignment_score = focus_count / total_count
#                     productivity_score = total_count
#                     # Combined score: weighted average (60% alignment, 40% normalized productivity)
#                     combined_score = (alignment_score * 0.6) + (min(productivity_score / 10, 1.0) * 0.4)
                    
#                     scores[model][role] = {
#                         'alignment': alignment_score,
#                         'productivity': productivity_score,
#                         'combined': combined_score,
#                         'focus_categories': focus_categories,
#                         'category_distribution': dict(categories)
#                     }
        
#         self.recommendations = dict(scores)
#         return self.recommendations
    
#     def create_summary_dataframes(self):
#         """Create pandas DataFrames for easy viewing and analysis"""
#         if self.recommendations is None:
#             print("❌ No recommendations. Run calculate_scores() first.")
#             return None
        
#         # Create separate DataFrames for each metric
#         models = sorted(self.recommendations.keys())
#         roles = sorted(set(role for model_roles in self.recommendations.values() 
#                           for role in model_roles.keys()))
        
#         # Productivity DataFrame
#         productivity_data = []
#         for model in models:
#             row = {'Model': model}
#             for role in roles:
#                 if role in self.recommendations[model]:
#                     row[self.roles_config[role]['name']] = self.recommendations[model][role]['productivity']
#                 else:
#                     row[self.roles_config[role]['name']] = 0
#             productivity_data.append(row)
#         df_productivity = pd.DataFrame(productivity_data).set_index('Model')
        
#         # Alignment DataFrame
#         alignment_data = []
#         for model in models:
#             row = {'Model': model}
#             for role in roles:
#                 if role in self.recommendations[model]:
#                     row[self.roles_config[role]['name']] = self.recommendations[model][role]['alignment']
#                 else:
#                     row[self.roles_config[role]['name']] = 0
#             alignment_data.append(row)
#         df_alignment = pd.DataFrame(alignment_data).set_index('Model')
        
#         # Combined Score DataFrame
#         combined_data = []
#         for model in models:
#             row = {'Model': model}
#             for role in roles:
#                 if role in self.recommendations[model]:
#                     row[self.roles_config[role]['name']] = self.recommendations[model][role]['combined']
#                 else:
#                     row[self.roles_config[role]['name']] = 0
#             combined_data.append(row)
#         df_combined = pd.DataFrame(combined_data).set_index('Model')
        
#         return {
#             'productivity': df_productivity,
#             'alignment': df_alignment,
#             'combined': df_combined
#         }
    
#     def visualize_results(self, figsize=(20, 12)):
#         """Create comprehensive visualizations of model-role performance"""
#         dataframes = self.create_summary_dataframes()
#         if dataframes is None:
#             return
        
#         fig, axes = plt.subplots(2, 2, figsize=figsize)
#         fig.suptitle('Model-Role Performance Analysis', fontsize=16, fontweight='bold', y=1.00)
        
#         # 1. Productivity Heatmap
#         ax1 = axes[0, 0]
#         sns.heatmap(dataframes['productivity'], annot=True, fmt='.0f', cmap='YlOrRd', 
#                    ax=ax1, cbar_kws={'label': 'Tests Generated'})
#         ax1.set_title('Productivity Score (Total Tests Generated)', fontweight='bold')
#         ax1.set_xlabel('')
        
#         # 2. Alignment Heatmap
#         ax2 = axes[0, 1]
#         sns.heatmap(dataframes['alignment'], annot=True, fmt='.2%', cmap='YlGnBu', 
#                    ax=ax2, cbar_kws={'label': 'Alignment Score'}, vmin=0, vmax=1)
#         ax2.set_title('Alignment Score (Focus Category Accuracy)', fontweight='bold')
#         ax2.set_xlabel('')
        
#         # 3. Combined Score Heatmap
#         ax3 = axes[1, 0]
#         sns.heatmap(dataframes['combined'], annot=True, fmt='.3f', cmap='RdYlGn', 
#                    ax=ax3, cbar_kws={'label': 'Combined Score'}, vmin=0, vmax=1)
#         ax3.set_title('Combined Score (60% Alignment + 40% Productivity)', fontweight='bold')
#         ax3.set_xlabel('')
        
#         # 4. Best Model per Role (Bar Chart)
#         ax4 = axes[1, 1]
#         best_models_per_role = {}
#         for role in dataframes['combined'].columns:
#             best_model = dataframes['combined'][role].idxmax()
#             best_score = dataframes['combined'][role].max()
#             best_models_per_role[role] = (best_model, best_score)
        
#         roles_list = list(best_models_per_role.keys())
#         scores_list = [score for _, score in best_models_per_role.values()]
#         models_list = [model for model, _ in best_models_per_role.values()]
        
#         colors = plt.cm.Set3(np.linspace(0, 1, len(set(models_list))))
#         model_to_color = {model: colors[i] for i, model in enumerate(sorted(set(models_list)))}
#         bar_colors = [model_to_color[model] for model in models_list]
        
#         bars = ax4.barh(roles_list, scores_list, color=bar_colors)
#         ax4.set_xlabel('Combined Score', fontweight='bold')
#         ax4.set_title('Best Model per Role', fontweight='bold')
#         ax4.set_xlim(0, 1)
        
#         # Add model names on bars
#         for i, (bar, model) in enumerate(zip(bars, models_list)):
#             width = bar.get_width()
#             ax4.text(width + 0.02, bar.get_y() + bar.get_height()/2, 
#                     f'{model}', ha='left', va='center', fontsize=9)
        
#         plt.tight_layout()
#         plt.show()
    
#     def print_recommendations(self):
#         """Print detailed recommendations for role assignments"""
#         if self.recommendations is None:
#             print("❌ No recommendations. Run calculate_scores() first.")
#             return
        
#         print(f"\n{'='*80}")
#         print("🎯 ROLE ASSIGNMENT RECOMMENDATIONS")
#         print(f"{'='*80}\n")
        
#         # For each role, show which model performs best
#         roles = sorted(set(role for model_roles in self.recommendations.values() 
#                           for role in model_roles.keys()))
        
#         for role in roles:
#             role_name = self.roles_config[role]['name']
#             focus_cats = self.roles_config[role]['focus_categories']
            
#             print(f"\n{'─'*80}")
#             print(f"📋 Role: {role_name}")
#             print(f"   Focus Categories: {', '.join(focus_cats)}")
#             print(f"{'─'*80}")
            
#             # Collect scores for this role across all models
#             model_scores = []
#             for model in sorted(self.recommendations.keys()):
#                 if role in self.recommendations[model]:
#                     scores = self.recommendations[model][role]
#                     model_scores.append((model, scores))
            
#             # Sort by combined score
#             model_scores.sort(key=lambda x: x[1]['combined'], reverse=True)
            
#             print(f"\n   Model Performance Ranking:\n")
#             for rank, (model, scores) in enumerate(model_scores, 1):
#                 marker = "🥇" if rank == 1 else "🥈" if rank == 2 else "🥉" if rank == 3 else f"  {rank}."
#                 print(f"   {marker} {model}")
#                 print(f"      ├─ Productivity: {scores['productivity']} tests")
#                 print(f"      ├─ Alignment:    {scores['alignment']:.1%}")
#                 print(f"      └─ Combined:     {scores['combined']:.3f}")
                
#                 if rank == 1:
#                     print(f"      ✅ RECOMMENDED for this role")
#                 print()
        
#         print(f"\n{'='*80}")
#         print("💡 SUMMARY: Optimal Model-Role Assignments")
#         print(f"{'='*80}\n")
        
#         # Show best model for each role
#         for role in roles:
#             role_name = self.roles_config[role]['name']
#             best_model = None
#             best_score = -1
            
#             for model in self.recommendations.keys():
#                 if role in self.recommendations[model]:
#                     if self.recommendations[model][role]['combined'] > best_score:
#                         best_score = self.recommendations[model][role]['combined']
#                         best_model = model
            
#             if best_model:
#                 print(f"   {role_name:30s} → {best_model}")
        
#         print(f"\n{'='*80}\n")

# # Create analyzer instance
# analyzer = RoleAssignmentAnalyzer(llm_council.roles)

# # Load and analyze checkpoint data
# print("📊 Loading checkpoint files...")
# aggregated_data = analyzer.load_checkpoints('experiment_checkpoint_*.json')

# if aggregated_data:
#     print("\n🔍 Calculating performance scores...")
#     recommendations = analyzer.calculate_scores()
    
#     print("\n📈 Creating visualizations...")
#     analyzer.visualize_results(figsize=(20, 12))
    
#     print("\n📋 Generating recommendations...")
#     analyzer.print_recommendations()
    
#     # Show summary tables
#     print("\n" + "="*80)
#     print("📊 DETAILED SCORE TABLES")
#     print("="*80 + "\n")
    
#     dfs = analyzer.create_summary_dataframes()
    
#     print("\n1️⃣ PRODUCTIVITY SCORES (Total Tests Generated):")
#     print(dfs['productivity'].to_string())
    
#     print("\n\n2️⃣ ALIGNMENT SCORES (Focus Category Accuracy):")
#     print(dfs['alignment'].applymap(lambda x: f"{x:.1%}").to_string())
    
#     print("\n\n3️⃣ COMBINED SCORES (Weighted Performance):")
#     print(dfs['combined'].applymap(lambda x: f"{x:.3f}").to_string())


In [12]:
# Cell 10: Async Demo with Concurrent API Calls
import nest_asyncio
import asyncio
from datetime import datetime

# Enable nested asyncio for Jupyter
nest_asyncio.apply()

class AsyncIntelligentTestCouncil(IntelligentTestCouncil):
    """Async version of the test council with concurrent API calls"""
    
    async def generate_comprehensive_tests_async(self, function_code: str, max_concurrent: int = 7) -> Dict[str, Any]:
        """Async version of main pipeline with concurrent API calls"""
        print("🚀 Starting Role-Based Intelligent Test Council Pipeline (Async Mode)")
        print("=" * 70)
        
        # Step 1: Analyze the input function
        print("\n📝 Step 1: Analyzing input function...")
        function_info = self.code_analyzer.extract_function_info(function_code)
        
        if not function_info['functions']:
            error_msg = 'No functions found in the provided code'
            if 'syntax_error' in function_info:
                error_msg += f". Syntax error: {function_info['syntax_error']}"
            return {'error': error_msg}
        
        print(f"✅ Found {function_info['total_functions']} function(s)")
        
        # Step 2: Generate tests using role-based LLM council with CONCURRENT API calls
        print(f"\n🎭 Step 2: Consulting Role-Based LLM Council (Concurrent Mode)...")
        council_results = await self.llm_council.generate_tests_from_council_async(
            function_info, 
            max_concurrent=max_concurrent
        )
        
        # Step 3: Classify all test cases
        print("\n🏷️  Step 3: Classifying test cases by category and role...")
        all_classified_tests = self.test_classifier.classify_council_results(council_results)
        
        print(f"✅ Total tests generated: {len(all_classified_tests)}")
        
        # Display role distribution
        role_counts = Counter(test['role_name'] for test in all_classified_tests)
        print("\n🎭 Role distribution:")
        for role_name, count in role_counts.items():
            print(f"   • {role_name}: {count} tests")
        
        # Display category distribution
        category_counts = Counter(test['category'] for test in all_classified_tests)
        print("\n📊 Category distribution:")
        for category, count in category_counts.items():
            print(f"   • {category}: {count} tests")
        
        # Display model-role performance matrix
        print("\n🔬 Model-Role Performance Matrix:")
        for model_name, role_results in council_results.items():
            print(f"\n   {model_name}:")
            for role_id, results in role_results.items():
                print(f"      └─ {results['role_name']}: {results['test_count']} tests")
        
        # Step 4: Synthesize final test file
        print(f"\n🔄 Step 4: Synthesizing final test file with duplicate removal...")
        synthesis_results = self.test_synthesizer.synthesize_final_test_file(all_classified_tests, function_info)
        
        # Step 5: Analyze coverage
        print("\n📊 Step 5: Analyzing code coverage...")
        coverage_results = self.coverage_analyzer.analyze_coverage(
            function_code, 
            synthesis_results['synthesized_content']
        )
        
        # Prepare comprehensive results
        results = {
            'function_info': function_info,
            'council_results': council_results,
            'all_classified_tests': all_classified_tests,
            'synthesis_results': synthesis_results,
            'final_test_file': synthesis_results['synthesized_content'],
            'coverage_results': coverage_results,
            'statistics': {
                'original_test_count': len(all_classified_tests),
                'final_test_count': synthesis_results['final_count'],
                'reduction_ratio': synthesis_results['reduction_ratio'],
                'coverage_percentage': coverage_results.get('coverage_percentage', 0.0),
                'test_success_rate': coverage_results.get('success_rate', 0.0),
                'total_tests_run': coverage_results.get('total_tests', 0),
                'passed_tests': coverage_results.get('passed_tests', 0),
                'failed_tests': coverage_results.get('failed_tests', 0),
                'skipped_tests': coverage_results.get('skipped_tests', 0),
                'error_tests': coverage_results.get('error_tests', 0),
                'models_used': list(council_results.keys()),
                'roles_used': list(set(test['role_name'] for test in all_classified_tests)),
                'categories_found': list(category_counts.keys()),
                'synthesizer_model': synthesis_results['synthesizer_model'],
                'tests_per_role': dict(role_counts),
                'tests_per_category': dict(category_counts),
                'model_role_matrix': {
                    model: {role: results['test_count'] for role, results in roles.items()}
                    for model, roles in council_results.items()
                }
            }
        }
        
        print("\n🎉 Pipeline completed successfully!")
        print(f"📊 Test Success Rate: {coverage_results.get('success_rate', 0.0):.1f}%")
        print(f"📈 Code Coverage: {coverage_results.get('coverage_percentage', 0.0):.1f}%")
        print(f"✅ Passed Tests: {coverage_results.get('passed_tests', 0)}/{coverage_results.get('total_tests', 0)}")
        print("=" * 70)
        
        return results

# Example function to test
example_function_1 = '''def divide_numbers(a, b):
    """
    Divide two numbers with error handling

    Args:
        a (float): Numerator
        b (float): Denominator

    Returns:
        float: Result of division

    Raises:
        ValueError: If denominator is zero
        TypeError: If inputs are not numeric
    """
    if not isinstance(a, (int, float)) or not isinstance(b, (int, float)):
        raise TypeError("Both arguments must be numeric")

    if b == 0:
        raise ValueError("Cannot divide by zero")

    return a / b'''

# Example 2: More complex function (PROPERLY INDENTED)
example_function_2 = '''def validate_password(password):
    """
    Validate password strength

    Args:
        password (str): Password to validate

    Returns:
        dict: Validation results with 'valid' boolean and 'errors' list
    """
    if not isinstance(password, str):
        return {'valid': False, 'errors': ['Password must be a string']}

    errors = []

    if len(password) < 8:
        errors.append('Password must be at least 8 characters long')

    if not any(c.isupper() for c in password):
        errors.append('Password must contain at least one uppercase letter')

    if not any(c.islower() for c in password):
        errors.append('Password must contain at least one lowercase letter')

    if not any(c.isdigit() for c in password):
        errors.append('Password must contain at least one digit')

    special_chars = '!@#$%^&*(),.?":{}|<>'
    if not any(c in special_chars for c in password):
        errors.append('Password must contain at least one special character')

    return {'valid': len(errors) == 0, 'errors': errors}'''

async def demonstrate_council_async(max_concurrent: int = 7):
    """Async demonstration of the intelligent council with concurrent API calls"""
    
    print("🎯 Demonstrating Intelligent Test Council (Async + Concurrent Mode)")
    print("=" * 70)
    print(f"⚡ Maximum concurrent API requests: {max_concurrent}")
    print("=" * 70)
    
    start_time = datetime.now()
    
    # Initialize async version of the council
    async_council = AsyncIntelligentTestCouncil(config)
    
    # Choose example to run
    selected_function = example_function_1
    
    try:
        # Run the intelligent council pipeline with concurrent API calls
        results = await async_council.generate_comprehensive_tests_async(
            selected_function,
            max_concurrent=max_concurrent
        )
        
        if 'error' in results:
            print(f"❌ Error: {results['error']}")
            return results
        
        # Calculate execution time
        end_time = datetime.now()
        execution_time = (end_time - start_time).total_seconds()
        
        # Display key statistics
        stats = results['statistics']
        print(f"\n📊 Key Statistics:")
        print(f"   • Execution time: {execution_time:.2f} seconds")
        print(f"   • Original tests generated: {stats['original_test_count']}")
        print(f"   • Final tests after synthesis: {stats['final_test_count']}")
        print(f"   • Reduction ratio: {stats['reduction_ratio']:.2%}")
        print(f"   • Code coverage: {stats['coverage_percentage']:.1f}%")
        print(f"   • Test success rate: {stats['test_success_rate']:.1f}%")
        print(f"   • Models used: {', '.join(stats['models_used'])}")
        print(f"   • Roles used: {', '.join(stats['roles_used'])}")
        print(f"   • Test categories: {', '.join(stats['categories_found'])}")
        print(f"   • Synthesizer model: {stats['synthesizer_model']}")
        
        # Display role-based metrics
        print(f"\n🎭 Role-Based Generation Summary:")
        for role, count in stats['tests_per_role'].items():
            print(f"   • {role}: {count} tests")
        
        # Save results
        async_council.save_results(results)
        
        # Display final test file preview
        print(f"\n📋 Final Test File Preview:")
        print("-" * 70)
        preview_length = 1500
        if len(results['final_test_file']) > preview_length:
            print(results['final_test_file'][:preview_length] + "\n... (truncated)")
        else:
            print(results['final_test_file'])
        
        print(f"\n⏱️  Total execution time: {execution_time:.2f} seconds")
        
        return results
        
    except Exception as e:
        print(f"❌ Pipeline failed: {e}")
        import traceback
        traceback.print_exc()
        return {'error': str(e)}

# Run the async demo
print("🚀 Starting async demo with concurrent API calls...")
demo_results = await demonstrate_council_async(max_concurrent=7)

🚀 Starting async demo with concurrent API calls...
🎯 Demonstrating Intelligent Test Council (Async + Concurrent Mode)
⚡ Maximum concurrent API requests: 7
🚀 Starting Role-Based Intelligent Test Council Pipeline (Async Mode)

📝 Step 1: Analyzing input function...
✅ Found 1 function(s)

🎭 Step 2: Consulting Role-Based LLM Council (Concurrent Mode)...
🤖 Consulting Role-Based LLM Council for test generation (Concurrent Mode)...
⚡ Maximum concurrent requests: 7
📊 Total API calls to make: 7


Concurrent API calls: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:45<00:00,  6.56s/it]


✅ gemini-2.0-flash as 'By-the-Book QA Engineer': 13 tests
✅ gemini-2.0-flash as 'Abstract Thinker': 10 tests
✅ gemini-2.0-flash as 'Agent of Chaos': 14 tests
✅ qwen3-235b-a22b as 'By-the-Book QA Engineer': 11 tests
✅ qwen3-235b-a22b as 'Agent of Chaos': 14 tests
✅ grok-3-mini as 'Abstract Thinker': 17 tests
✅ grok-3-mini as 'Paranoid Security Auditor': 9 tests

🏷️  Step 3: Classifying test cases by category and role...
✅ Total tests generated: 88

🎭 Role distribution:
   • By-the-Book QA Engineer: 24 tests
   • Abstract Thinker: 27 tests
   • Agent of Chaos: 28 tests
   • Paranoid Security Auditor: 9 tests

📊 Category distribution:
   • positive: 27 tests
   • negative: 27 tests
   • edge_case: 17 tests
   • boundary: 10 tests
   • security: 7 tests

🔬 Model-Role Performance Matrix:

   gemini-2.0-flash:
      └─ By-the-Book QA Engineer: 13 tests
      └─ Abstract Thinker: 10 tests
      └─ Agent of Chaos: 14 tests

   qwen3-235b-a22b:
      └─ By-the-Book QA Engineer: 11 tests
      └

In [16]:
# # Cell 14: Batch Testing and Evaluation (FIXED)
# def batch_evaluate_functions(function_list: List[str], output_file: str = "batch_evaluation.csv"):
#     """Evaluate multiple functions in batch and generate comparison report"""
    
#     print(f"🔄 Starting batch evaluation of {len(function_list)} functions...")
    
#     results_data = []
    
#     for i, func_code in enumerate(tqdm(function_list, desc="Processing functions")):
#         try:
#             print(f"\n📝 Processing function {i+1}/{len(function_list)}")
            
#             # Run council pipeline
#             results = intelligent_council.generate_comprehensive_tests(func_code)
            
#             if 'error' in results:
#                 print(f"❌ Error processing function {i+1}: {results['error']}")
#                 row_data = {
#                     'function_name': f'function_{i+1}',
#                     'original_tests': 0,
#                     'final_tests': 0,
#                     'reduction_ratio': 0,
#                     'coverage_percentage': 0,
#                     'models_used': 0,
#                     'categories_count': 0,
#                     'categories': '',
#                     'error': results['error'],
#                     'success': False
#                 }
#                 results_data.append(row_data)
#                 continue
            
#             # Extract key metrics
#             stats = results['statistics']
#             func_info = results['function_info']
#             func_name = func_info['functions'][0]['name'] if func_info['functions'] else f"function_{i+1}"
            
#             row_data = {
#                 'function_name': func_name,
#                 'original_tests': stats['original_test_count'],
#                 'final_tests': stats['final_test_count'],
#                 'reduction_ratio': stats['reduction_ratio'],
#                 'coverage_percentage': stats['coverage_percentage'],
#                 'models_used': len(stats['models_used']),
#                 'categories_count': len(stats['categories_found']),
#                 'categories': ','.join(stats['categories_found']),
#                 'synthesizer_model': stats['synthesizer_model'],
#                 'success': True
#             }
            
#             results_data.append(row_data)
#             print(f"✅ Function {i+1} processed successfully")
            
#         except Exception as e:
#             print(f"❌ Error processing function {i+1}: {e}")
#             results_data.append({
#                 'function_name': f'function_{i+1}',
#                 'original_tests': 0,
#                 'final_tests': 0,
#                 'reduction_ratio': 0,
#                 'coverage_percentage': 0,
#                 'models_used': 0,
#                 'categories_count': 0,
#                 'categories': '',
#                 'error': str(e),
#                 'success': False
#             })
    
#     # Create DataFrame and save results
#     df = pd.DataFrame(results_data)
#     df.to_csv(output_file, index=False)
    
#     # Generate summary statistics
#     successful_runs = df[df['success'] == True]
    
#     if len(successful_runs) > 0:
#         print(f"\n📊 Batch Evaluation Summary:")
#         print(f"   • Successful runs: {len(successful_runs)}/{len(function_list)}")
#         print(f"   • Average original tests: {successful_runs['original_tests'].mean():.1f}")
#         print(f"   • Average final tests: {successful_runs['final_tests'].mean():.1f}")
#         print(f"   • Average reduction ratio: {successful_runs['reduction_ratio'].mean():.2%}")
#         print(f"   • Average coverage: {successful_runs['coverage_percentage'].mean():.1f}%")
        
#         # Visualization
#         fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        
#         # Test count distribution
#         axes[0, 0].hist(successful_runs['final_tests'], bins=10, alpha=0.7, edgecolor='black')
#         axes[0, 0].set_title('Distribution of Final Test Counts')
#         axes[0, 0].set_xlabel('Number of Final Tests')
#         axes[0, 0].set_ylabel('Frequency')
        
#         # Coverage distribution
#         axes[0, 1].hist(successful_runs['coverage_percentage'], bins=10, alpha=0.7, edgecolor='black')
#         axes[0, 1].set_title('Distribution of Code Coverage')
#         axes[0, 1].set_xlabel('Coverage Percentage')
#         axes[0, 1].set_ylabel('Frequency')
        
#         # Reduction ratio distribution
#         axes[1, 0].hist(successful_runs['reduction_ratio'], bins=10, alpha=0.7, edgecolor='black')
#         axes[1, 0].set_title('Distribution of Test Reduction Ratios')
#         axes[1, 0].set_xlabel('Reduction Ratio')
#         axes[1, 0].set_ylabel('Frequency')
        
#         # Correlation plot
#         axes[1, 1].scatter(successful_runs['original_tests'], successful_runs['coverage_percentage'])
#         axes[1, 1].set_title('Original Tests vs Coverage')
#         axes[1, 1].set_xlabel('Original Test Count')
#         axes[1, 1].set_ylabel('Coverage Percentage')
        
#         plt.tight_layout()
#         plt.show()
#     else:
#         print("❌ No successful runs to analyze")
    
#     print(f"📁 Detailed results saved to {output_file}")
#     return df

# # Example batch evaluation with the corrected functions
# example_functions = [example_function_1, example_function_2]
# batch_results = batch_evaluate_functions(example_functions)