# In [7]:
import os
import sys
import json
import time
import statistics
import traceback
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
import pandas as pd
import numpy as np
from collections import defaultdict
import gc
import signal

# Import your RAG system
# DEFAULT_CONFIG and create_rag_instance are used by RAGGridTester below.
# If the project-local module ``rag_script_24`` cannot be imported, the
# failure is reported and the process exits with status 1.
try:
    from rag_script_24 import ConfigurableRAG, DEFAULT_CONFIG, create_rag_instance
    print("✅ Successfully imported RAG system")
except ImportError as e:
    print(f"❌ Failed to import RAG system: {e}")
    print("Make sure your RAG file is named correctly or update the import")
    sys.exit(1)

def safe_mean(values: List[float], default: float = 0.0) -> float:
    """Return the arithmetic mean of ``values``, or ``default`` when it is empty."""
    if not values:
        return default
    return statistics.mean(values)

def safe_max(values: List[float], default: float = 0.0) -> float:
    """Return the largest element of ``values``, or ``default`` when it is empty."""
    if not values:
        return default
    return max(values)

def safe_min(values: List[float], default: float = 0.0) -> float:
    """Return the smallest element of ``values``, or ``default`` when it is empty."""
    if not values:
        return default
    return min(values)

def safe_variance(values: List[float], default: float = 0.0) -> float:
    """Return the sample variance of ``values``.

    ``statistics.variance`` needs at least two data points, so ``default`` is
    returned for an empty or single-element list.
    """
    if len(values) <= 1:
        return default
    return statistics.variance(values)

@dataclass
class TestResult:
    """Outcome of one RAG query plus quality metrics derived from it.

    The caller supplies the raw measurements (answer text, timing, counts and
    the per-source score lists). ``__post_init__`` then overwrites the
    similarity/quality/cross-encoder aggregates from those lists and, when the
    answer is non-empty, fills in the content metrics by scanning the answer
    text.
    """
    config_id: str
    config_name: str
    query: str
    query_type: str
    answer: str
    response_time: float
    answer_length: int
    word_count: int
    sentence_count: int
    sources_count: int
    chunks_used: int
    similarity_scores: List[float]
    quality_scores: List[float]
    cross_encoder_scores: List[float]
    cached: bool
    error: str = None  # stays None when no error was reported

    # Computed quality metrics
    avg_similarity: float = 0.0
    max_similarity: float = 0.0
    min_similarity: float = 0.0
    avg_quality: float = 0.0
    avg_cross_encoder: float = 0.0
    similarity_variance: float = 0.0
    information_density: float = 0.0
    technical_terms_count: int = 0
    numerical_data_count: int = 0
    concrete_examples: int = 0

    # Vocabulary and patterns used by _analyze_content. They carry no type
    # annotation, so @dataclass keeps them as plain class attributes and does
    # not turn them into fields.
    _TECHNICAL_TERMS = (
        'cap rate', 'noi', 'cash flow', 'roi', 'irr', 'debt service',
        'capitalization', 'dcf', 'appreciation', 'vacancy', 'operating expenses',
        'gross rent multiplier', 'debt-to-equity', 'leverage', 'amortization',
        'depreciation', 'basis points', 'yield', 'market value', 'appraisal',
    )
    _EXAMPLE_INDICATORS = ('for example', 'such as', 'for instance', 'consider', 'suppose')
    # Inflections tolerated after a technical term ("cap rates", "leveraged").
    _TERM_INFLECTIONS = r'(?:s|d|ed|ing)?'
    # Alternatives are ordered most-specific first: dollar amounts and ratios
    # must be tried before the generic number, otherwise "3:1" is consumed as
    # two separate numbers.
    _NUMBER_PATTERN = (
        r'\$\d[\d,]*(?:\.\d+)?'
        r'|\d+:\d+'
        r'|\b\d+(?:,\d{3})*(?:\.\d+)?[%$]?'
        r'|\d+\.?\d*%'
    )

    def __post_init__(self):
        """Calculate derived metrics with safe operations."""
        # Similarity metrics - the safe_* helpers return 0.0 for empty lists
        self.avg_similarity = safe_mean(self.similarity_scores)
        self.max_similarity = safe_max(self.similarity_scores)
        self.min_similarity = safe_min(self.similarity_scores)
        self.similarity_variance = safe_variance(self.similarity_scores)

        # Quality metrics
        self.avg_quality = safe_mean(self.quality_scores)

        # Cross-encoder metrics
        self.avg_cross_encoder = safe_mean(self.cross_encoder_scores)

        # Content analysis
        self._analyze_content()

    @staticmethod
    def _count_phrases(text, phrases, suffix=''):
        """Count how many of ``phrases`` occur in ``text`` as whole words.

        Each phrase contributes at most 1, however often it occurs. The
        lookarounds stop short terms matching inside longer words (``noi`` in
        "noise", ``such as`` in "such assets"). ``suffix`` is an optional regex
        fragment allowed between the phrase and the closing word boundary.
        """
        import re  # local import: ``re`` is not imported at module level here

        return sum(
            1
            for phrase in phrases
            if re.search(r'(?<!\w)' + re.escape(phrase) + suffix + r'(?!\w)', text)
        )

    def _analyze_content(self):
        """Analyze answer content for quality indicators.

        Sets ``technical_terms_count``, ``numerical_data_count``,
        ``concrete_examples`` and ``information_density``. An empty answer
        leaves all four untouched.
        """
        if not self.answer:
            return

        import re  # local import: ``re`` is not imported at module level here

        answer_lower = self.answer.lower()

        # Distinct technical terms present (whole-word match, common inflections allowed)
        self.technical_terms_count = self._count_phrases(
            answer_lower, self._TECHNICAL_TERMS, self._TERM_INFLECTIONS
        )

        # Numerical data: dollar amounts, ratios, plain numbers and percentages
        self.numerical_data_count = len(re.findall(self._NUMBER_PATTERN, self.answer))

        # Distinct example-introducing phrases present
        self.concrete_examples = self._count_phrases(answer_lower, self._EXAMPLE_INDICATORS)

        # Information density (technical terms + numbers per 100 words)
        if self.word_count > 0:
            self.information_density = (
                (self.technical_terms_count + self.numerical_data_count) / self.word_count
            ) * 100

class RAGGridTester:
    """Comprehensive RAG testing with multiple configurations and quality metrics."""
    
    def __init__(self):
        """Initialise state, the query set and the quality thresholds.

        Also registers ``_signal_handler`` for SIGINT so that Ctrl+C sets
        ``self.interrupted``.
        """
        # Mutable run state
        self.base_config = DEFAULT_CONFIG.copy()
        self.rag_instance = None
        self.test_results = []
        self.config_performance = defaultdict(list)
        self.interrupted = False

        # Setup signal handler for graceful interruption
        signal.signal(signal.SIGINT, self._signal_handler)

        # Test queries covering different types and complexities
        self.test_queries = [
            "What is cap rate and how do you calculate it?",
            "How do I analyze cash flow for a rental property investment?",
            "What are the key differences between commercial and residential real estate investing?",
            "Explain the risks involved in real estate investment and how to mitigate them",
            "What factors should I consider when choosing an investment property location?",
        ]

        # Quality thresholds for evaluation (read by evaluate_result_quality)
        self.quality_thresholds = dict(
            min_answer_length=200,
            min_word_count=30,
            min_similarity=0.15,
            min_sources=1,
            max_response_time=45.0,  # Increased timeout
            min_technical_terms=2,
            min_information_density=1.0,
        )
    
    def _signal_handler(self, signum, frame):
        """Handle interrupt signals gracefully."""
        print("\n⚠️  Interrupt received. Finishing current test and saving results...")
        self.interrupted = True
    
    def cleanup_rag(self):
        """Properly cleanup RAG instance to prevent resource leaks."""
        if self.rag_instance:
            try:
                # Try to access model attribute safely
                if hasattr(self.rag_instance, 'model') and self.rag_instance.model:
                    del self.rag_instance.model
                
                # Clear any other resources
                if hasattr(self.rag_instance, 'vectorstore'):
                    del self.rag_instance.vectorstore
                
                if hasattr(self.rag_instance, 'embeddings'):
                    del self.rag_instance.embeddings
                
                del self.rag_instance
                self.rag_instance = None
                
                # Force garbage collection
                gc.collect()
                
                # Brief pause to allow cleanup
                time.sleep(1)
                
            except Exception as e:
                print(f"⚠️  Warning during cleanup: {e}")
                # Force cleanup anyway
                self.rag_instance = None
                gc.collect()
    
    def generate_test_configurations(self) -> List[Dict[str, Any]]:
        """Generate 12 smart LLM configurations for comprehensive testing."""
        configs = []
        
        # Base configuration variations
        base_temps = [0.1, 0.2, 0.4]  # Conservative, balanced, creative
        base_top_ps = [0.8, 0.9, 0.95]  # Focused, balanced, diverse
        
        # Configuration 1-9: Temperature and top_p matrix
        for i, temp in enumerate(base_temps):
            for j, top_p in enumerate(base_top_ps):
                config = self.base_config.copy()
                config['llm_options'] = config['llm_options'].copy()
                config['llm_options']['temperature'] = temp
                config['llm_options']['top_p'] = top_p
                config['llm_options']['num_predict'] = 400
                config['llm_options']['top_k'] = 15
                # Add connection stability options
                config['llm_options']['num_ctx'] = 2048
                config['llm_options']['repeat_penalty'] = 1.1
                
                configs.append({
                    'id': f'temp_{temp}_top_p_{top_p}',
                    'name': f'Temp {temp}, Top-P {top_p}',
                    'config': config,
                    'focus': 'temperature_top_p_balance'
                })
        
        # Configuration 10: High creativity
        config = self.base_config.copy()
        config['llm_options'] = config['llm_options'].copy()
        config['llm_options']['temperature'] = 0.6
        config['llm_options']['top_p'] = 0.95
        config['llm_options']['top_k'] = 25
        config['llm_options']['num_predict'] = 500
        config['llm_options']['num_ctx'] = 2048
        configs.append({
            'id': 'high_creativity',
            'name': 'High Creativity',
            'config': config,
            'focus': 'creative_responses'
        })
        
        # Configuration 11: Maximum precision
        config = self.base_config.copy()
        config['llm_options'] = config['llm_options'].copy()
        config['llm_options']['temperature'] = 0.05
        config['llm_options']['top_p'] = 0.7
        config['llm_options']['top_k'] = 5
        config['llm_options']['num_predict'] = 300
        config['llm_options']['repeat_penalty'] = 1.1
        config['llm_options']['num_ctx'] = 2048
        configs.append({
            'id': 'max_precision',
            'name': 'Maximum Precision',
            'config': config,
            'focus': 'precise_factual'
        })
        
        # Configuration 12: Extended context
        config = self.base_config.copy()
        config['llm_options'] = config['llm_options'].copy()
        config['llm_options']['temperature'] = 0.25
        config['llm_options']['top_p'] = 0.85
        config['llm_options']['num_predict'] = 600
        config['llm_options']['num_ctx'] = 4096
        config['max_context_length'] = 3000
        config['max_context_chunks'] = 6
        configs.append({
            'id': 'extended_context',
            'name': 'Extended Context',
            'config': config,
            'focus': 'comprehensive_answers'
        })
        
        return configs
    
    def setup_rag(self, config: Dict[str, Any]) -> bool:
        """Create, initialise and smoke-test a RAG instance, retrying on failure.

        Up to five attempts are made. Each attempt first cleans up any previous
        instance; retries wait progressively longer, with an additional delay
        when the error text mentions a broken pipe or a connection problem.

        Args:
            config: Configuration passed to ``create_rag_instance``.

        Returns:
            True once an instance has been set up and has answered the probe
            query; False if every attempt failed or an interrupt was requested
            (``self.interrupted``) before an attempt could start.
        """
        max_retries = 5  # Increased retries
        retry_delays = [2, 4, 6, 8, 10]  # Progressive delays

        for attempt in range(max_retries):
            # Honour Ctrl+C between attempts. The SIGINT handler only sets a
            # flag, so without this check an interrupted run would still wait
            # through every remaining retry delay.
            if self.interrupted:
                print("   ⚠️  Interrupt requested, aborting RAG setup")
                return False

            is_last_attempt = attempt == max_retries - 1

            try:
                print(f"🔧 Setting up RAG (attempt {attempt + 1}/{max_retries})...")

                # Cleanup any existing instance first
                self.cleanup_rag()

                # Give the backend time to settle after a failed attempt
                if attempt > 0:
                    print(f"   ⏳ Waiting {retry_delays[attempt-1]}s before retry...")
                    time.sleep(retry_delays[attempt-1])

                self._create_and_verify_rag(config)
                return True

            except Exception as e:
                error_msg = str(e)
                print(f"   ❌ Setup attempt {attempt + 1} failed: {error_msg}")

                # Cleanup on failure
                self.cleanup_rag()

                # Connection-type failures get an extra pause before the retry
                lowered = error_msg.lower()
                if "broken pipe" in lowered or "connection" in lowered:
                    print("   🔌 Detected connection issue, extending retry delay...")
                    if not is_last_attempt and not self.interrupted:
                        time.sleep(5 + (attempt * 2))

                if is_last_attempt:
                    print("   💀 All setup attempts failed")
                    return False

        return False

    def _create_and_verify_rag(self, config: Dict[str, Any]) -> None:
        """Run one setup attempt: create, initialise and probe the RAG instance.

        Stores the new instance in ``self.rag_instance``.

        Raises:
            RuntimeError: if creation returns a falsy value, ``setup()`` returns
                a falsy value, or the probe query yields no ``'answer'``.
            Exception: whatever the RAG instance itself raises.
        """
        print("   📡 Creating RAG instance...")
        self.rag_instance = create_rag_instance(config)

        if not self.rag_instance:
            raise RuntimeError("Failed to create RAG instance")

        # Set verbose to false to reduce noise
        if hasattr(self.rag_instance, 'set_verbose'):
            self.rag_instance.set_verbose(False)

        print("   🚀 Initializing RAG components...")
        if not self.rag_instance.setup():
            raise RuntimeError("RAG setup returned False")
        print("   ✅ RAG setup successful")

        # Test the connection with a simple query
        print("   🧪 Testing connection...")
        try:
            probe = self.rag_instance.ask("test", use_cache=False)
            if not probe or 'answer' not in probe:
                raise RuntimeError("Connection test failed - no valid response")
        except Exception as test_error:
            print(f"   ❌ Connection test failed: {test_error}")
            raise
        print("   ✅ Connection test passed")
    
    def run_query_test(self, query: str, config_id: str, config_name: str) -> TestResult:
        """Run one query against the current RAG instance, retrying on failure.

        Up to three attempts are made. After a connection-type error (message
        mentions "broken pipe" or "connection") the RAG instance is recreated
        before the next attempt; after any other error the method just waits.

        Args:
            query: Question sent to ``self.rag_instance.ask``.
            config_id: Identifier of the configuration under test (stored on the result).
            config_name: Display name of the configuration (stored on the result).

        Returns:
            A ``TestResult`` built from the response. If no attempt succeeds,
            an empty ``TestResult`` whose ``error`` holds the message of the
            last failed attempt.
        """
        max_retries = 3
        retry_delays = [2, 5, 10]
        # Only reported if the loop ends without recording a concrete failure
        last_error = "Maximum retries exceeded"

        for attempt in range(max_retries):
            try:
                # Append the ellipsis only when the query really was truncated
                preview = query[:50] + ("..." if len(query) > 50 else "")
                print(f"  🔍 Testing query: {preview}")

                # Check if RAG instance is still valid
                if not self.rag_instance:
                    raise RuntimeError("RAG instance is None")

                # perf_counter is monotonic, so the interval cannot be skewed
                # by wall-clock adjustments
                start_time = time.perf_counter()
                result = self.rag_instance.ask(query, use_cache=False)
                elapsed = time.perf_counter() - start_time

                # Validate result
                if not result or not isinstance(result, dict):
                    raise ValueError("Invalid result format")

                # Extract metrics; ``or`` also covers keys present but set to None
                answer = result.get('answer', '') or ''
                sources = result.get('sources', []) or []
                query_info = result.get('query_info', {}) or {}

                word_count = len(answer.split()) if answer else 0
                sentence_count = len([s for s in answer.split('.') if s.strip()]) if answer else 0

                test_result = TestResult(
                    config_id=config_id,
                    config_name=config_name,
                    query=query,
                    query_type=query_info.get('query_type', 'unknown'),
                    answer=answer,
                    response_time=elapsed,
                    answer_length=len(answer),
                    word_count=word_count,
                    sentence_count=sentence_count,
                    sources_count=len(sources),
                    chunks_used=result.get('chunks_used', 0) or 0,
                    similarity_scores=self._collect_scores(sources, 'similarity'),
                    quality_scores=self._collect_scores(sources, 'quality_score'),
                    cross_encoder_scores=self._collect_scores(sources, 'cross_encoder_score'),
                    cached=result.get('cached', False) or False,
                    error=result.get('error')
                )

                print(f"    ✅ Success: {word_count} words, {len(sources)} sources, {test_result.response_time:.2f}s")
                return test_result

            except Exception as e:
                last_error = f"Query test error (attempt {attempt + 1}): {str(e)[:200]}"
                print(f"    ❌ {last_error}")

                is_last_attempt = attempt == max_retries - 1
                lowered = str(e).lower()

                if "broken pipe" in lowered or "connection" in lowered:
                    print("    🔌 Connection lost, attempting recovery...")
                    # Stop when no attempt is left or the instance cannot be
                    # rebuilt; the error recorded above is returned below.
                    if is_last_attempt or not self._recover_rag(retry_delays[attempt]):
                        break
                elif not is_last_attempt:
                    # For other errors, just wait and retry
                    time.sleep(retry_delays[attempt])

        return self._error_result(query, config_id, config_name, last_error)

    @staticmethod
    def _collect_scores(sources, key: str) -> List[float]:
        """Return ``source[key]`` as floats for every dict in ``sources`` that has it.

        Non-dict entries, missing keys, None values and values that cannot be
        converted to float are skipped.
        """
        scores = []
        for source in sources:
            if not isinstance(source, dict) or source.get(key) is None:
                continue
            try:
                scores.append(float(source[key]))
            except (ValueError, TypeError):
                # A malformed score on one source should not fail the whole query
                pass
        return scores

    def _recover_rag(self, delay: float) -> bool:
        """Tear down and recreate the RAG instance; return True on success.

        NOTE(review): the configuration is read from the instance's ``config``
        attribute and falls back to ``self.base_config`` when that attribute is
        missing - confirm the RAG class exposes ``config``, otherwise recovery
        silently switches to the base configuration.
        """
        print("    🔄 Recreating RAG instance...")
        try:
            # Get current config from the existing instance
            current_config = getattr(self.rag_instance, 'config', self.base_config)

            # Cleanup and recreate
            self.cleanup_rag()
            time.sleep(delay)

            if self.setup_rag(current_config):
                print("    ✅ RAG instance recovered")
                return True
            print("    ❌ Failed to recover RAG instance")
        except Exception as recovery_error:
            print(f"    ❌ Recovery failed: {recovery_error}")
        return False

    @staticmethod
    def _error_result(query: str, config_id: str, config_name: str, error: str) -> TestResult:
        """Build the empty ``TestResult`` that represents a failed query."""
        return TestResult(
            config_id=config_id,
            config_name=config_name,
            query=query,
            query_type='unknown',
            answer='',
            response_time=0.0,
            answer_length=0,
            word_count=0,
            sentence_count=0,
            sources_count=0,
            chunks_used=0,
            similarity_scores=[],
            quality_scores=[],
            cross_encoder_scores=[],
            cached=False,
            error=error
        )
    
    def evaluate_result_quality(self, result: TestResult) -> Dict[str, Any]:
        """Evaluate result quality against multiple metrics."""
        quality_metrics = {}
        
        # Basic quality checks
        quality_metrics['meets_min_length'] = result.answer_length >= self.quality_thresholds['min_answer_length']
        quality_metrics['meets_min_words'] = result.word_count >= self.quality_thresholds['min_word_count']
        quality_metrics['has_sources'] = result.sources_count >= self.quality_thresholds['min_sources']
        quality_metrics['reasonable_response_time'] = result.response_time <= self.quality_thresholds['max_response_time']
        quality_metrics['no_error'] = result.error is None
        
        # Advanced quality metrics
        quality_metrics['good_similarity'] = result.avg_similarity >= self.quality_thresholds['min_similarity']
        quality_metrics['technical_content'] = result.technical_terms_count >= self.quality_thresholds['min_technical_terms']
        quality_metrics['information_dense'] = result.information_density >= self.quality_thresholds['min_information_density']
        quality_metrics['has_examples'] = result.concrete_examples > 0
        quality_metrics['has_numbers'] = result.numerical_data_count > 0
        
        # Composite scores
        basic_quality_score = sum([
            quality_metrics['meets_min_length'],
            quality_metrics['meets_min_words'],
            quality_metrics['has_sources'],
            quality_metrics['reasonable_response_time'],
            quality_metrics['no_error']
        ]) / 5.0
        
        advanced_quality_score = sum([
            quality_metrics['good_similarity'],
            quality_metrics['technical_content'],
            quality_metrics['information_dense'],
            quality_metrics['has_examples'],
            quality_metrics['has_numbers']
        ]) / 5.0
        
        quality_metrics['basic_quality_score'] = basic_quality_score
        quality_metrics['advanced_quality_score'] = advanced_quality_score
        quality_metrics['overall_quality_score'] = (basic_quality_score * 0.6 + advanced_quality_score * 0.4)
        
        return quality_metrics
    
    def run_full_test_suite(self):
        """Run every test query against every generated configuration.

        For each configuration a RAG instance is set up, all queries in
        ``self.test_queries`` are executed, the results are appended to
        ``self.test_results`` and stored under the configuration id in
        ``self.config_performance``, and the instance is cleaned up again.
        The loop stops early once ``self.interrupted`` is set. Afterwards
        ``generate_comprehensive_analysis`` is called if any results exist.
        """
        print("🚀 Starting Enhanced RAG Grid Test Suite")
        print("="*80)

        start_time = time.time()
        configs = self.generate_test_configurations()
        total_tests = len(configs) * len(self.test_queries)

        print(f"📋 Testing {len(configs)} configurations with {len(self.test_queries)} queries each")
        print(f"📊 Total tests: {total_tests}")
        print("⚠️  Press Ctrl+C to interrupt gracefully")
        print("="*80)

        completed_tests = 0
        consecutive_failures = 0  # Track consecutive setup failures

        try:
            for config_counter, config_info in enumerate(configs, start=1):
                if self.interrupted:
                    break

                config = config_info['config']
                config_id = config_info['id']
                config_name = config_info['name']

                # Not every configuration sets every option (extended_context
                # leaves top_k to the base config), so read them defensively
                # instead of risking a KeyError in a log line.
                llm_options = config.get('llm_options', {})
                print(f"\n🔧 Configuration {config_counter}/{len(configs)}: {config_name}")
                print(f"   Focus: {config_info['focus']}")
                print(f"   Settings: T={llm_options.get('temperature', 'default')}, "
                      f"P={llm_options.get('top_p', 'default')}, "
                      f"K={llm_options.get('top_k', 'default')}, "
                      f"Tokens={llm_options.get('num_predict', 'default')}")

                # Setup RAG with this configuration
                if not self.setup_rag(config):
                    print(f"❌ Skipping configuration {config_name} due to setup failure")
                    # Count the skipped queries so the progress percentage stays meaningful
                    completed_tests += len(self.test_queries)
                    consecutive_failures += 1

                    # If too many consecutive failures, pause longer
                    if consecutive_failures >= 3:
                        print("⚠️  Multiple consecutive failures detected. Taking extended break...")
                        time.sleep(30)
                        consecutive_failures = 0

                    continue

                consecutive_failures = 0  # Reset on successful setup

                # Run all queries for this configuration
                config_results = []
                config_error_count = 0

                for query_idx, query in enumerate(self.test_queries):
                    if self.interrupted:
                        break

                    completed_tests += 1
                    progress = (completed_tests / total_tests) * 100
                    print(f"  Query {query_idx + 1}/{len(self.test_queries)} [{progress:.1f}% total]")

                    result = self.run_query_test(query, config_id, config_name)
                    self.test_results.append(result)
                    config_results.append(result)

                    if result.error:
                        config_error_count += 1

                        # Repeated errors hint at connection trouble; pause to let things settle
                        if config_error_count >= 3:
                            print("  ⚠️  Too many errors in this configuration, may have connection issues")
                            time.sleep(5)

                    # Brief pause between queries to prevent overwhelming the system
                    time.sleep(0.5)

                if self.interrupted:
                    break

                # Quick config summary; failed queries carry response_time 0.0
                # and are left out of the timing average
                response_times = [r.response_time for r in config_results if r.response_time > 0]
                answer_lengths = [r.answer_length for r in config_results]
                error_count = sum(1 for r in config_results if r.error)

                avg_time = safe_mean(response_times)
                avg_length = safe_mean(answer_lengths)

                print(f"   📈 Config Summary: Avg time {avg_time:.2f}s, "
                      f"Avg length {avg_length:.0f} chars, {error_count} errors")

                # Store config performance
                self.config_performance[config_id] = config_results

                # Cleanup after each configuration to prevent resource buildup
                print("   🧹 Cleaning up configuration...")
                self.cleanup_rag()
                time.sleep(2)  # Brief pause between configurations

            total_time = time.time() - start_time
        finally:
            # Release the RAG instance even when an unexpected exception or an
            # interrupt ends the loop early
            self.cleanup_rag()

        if self.interrupted:
            print(f"\n⚠️  Test suite interrupted by user after {total_time:.2f} seconds")
        else:
            print(f"\n✅ Test suite completed in {total_time:.2f} seconds")
        print(f"📊 Total results: {len(self.test_results)}")

        # Generate comprehensive analysis
        if self.test_results:
            self.generate_comprehensive_analysis()
        else:
            print("❌ No test results to analyze")
    
    def generate_comprehensive_analysis(self):
        """Generate detailed analysis of all test results."""
        if not self.test_results:
            print("❌ No test results to analyze")
            return
        
        print("\n" + "="*80)
        print("📊 COMPREHENSIVE TEST ANALYSIS")
        print("="*80)
        
        # Overall statistics
        total_tests = len(self.test_results)
        successful_tests = len([r for r in self.test_results if not r.error])
        error_rate = ((total_tests - successful_tests) / total_tests) * 100 if total_tests > 0 else 0
        
        print(f"\n🎯 Overall Performance:")
        print(f"   Total tests: {total_tests}")
        print(f"   Successful: {successful_tests}")
        print(f"   Error rate: {error_rate:.1f}%")
        
        # Performance metrics for successful tests
        successful_results = [r for r in self.test_results if not r.error]
        
        if successful_results:
            # Safe calculations for all metrics
            response_times = [r.response_time for r in successful_results]
            answer_lengths = [r.answer_length for r in successful_results]
            word_counts = [r.word_count for r in successful_results]
            sources_counts = [r.sources_count for r in successful_results]
            similarity_scores = [r.avg_similarity for r in successful_results if r.avg_similarity > 0]
            
            avg_response_time = safe_mean(response_times)
            avg_answer_length = safe_mean(answer_lengths)
            avg_word_count = safe_mean(word_counts)
            avg_sources = safe_mean(sources_counts)
            avg_similarity = safe_mean(similarity_scores)
            
            print(f"\n⚡ Performance Metrics (Successful Tests):")
            print(f"   Avg Response Time: {avg_response_time:.2f}s")
            print(f"   Avg Answer Length: {avg_answer_length:.0f} characters")
            print(f"   Avg Word Count: {avg_word_count:.0f} words")
            print(f"   Avg Sources Used: {avg_sources:.1f}")
            print(f"   Avg Similarity Score: {avg_similarity:.3f}")
        else:
            print(f"\n⚡ No successful tests to analyze")
        
        # Quality analysis
        print(f"\n🏆 Quality Analysis:")
        quality_results = []
        
        for result in successful_results:
            quality_metrics = self.evaluate_result_quality(result)
            quality_results.append(quality_metrics)
        
        if quality_results:
            basic_scores = [q['basic_quality_score'] for q in quality_results]
            advanced_scores = [q['advanced_quality_score'] for q in quality_results]
            overall_scores = [q['overall_quality_score'] for q in quality_results]
            
            avg_basic_quality = safe_mean(basic_scores)
            avg_advanced_quality = safe_mean(advanced_scores)
            avg_overall_quality = safe_mean(overall_scores)
            
            print(f"   Basic Quality Score: {avg_basic_quality:.3f}")
            print(f"   Advanced Quality Score: {avg_advanced_quality:.3f}")
            print(f"   Overall Quality Score: {avg_overall_quality:.3f}")
        else:
            print(f"   No quality results to analyze")
        
        # Configuration ranking
        print(f"\n🥇 Configuration Rankings:")
        config_scores = {}
        
        for config_id, results in self.config_performance.items():
            successful = [r for r in results if not r.error]
            if successful:
                # Calculate composite score with safe operations
                response_times = [r.response_time for r in successful]
                avg_time = safe_mean(response_times)
                
                quality_scores = [
                    self.evaluate_result_quality(r)['overall_quality_score'] 
                    for r in successful
                ]
                avg_quality = safe_mean(quality_scores)
                
                error_rate = (len(results) - len(successful)) / len(results) if len(results) > 0 else 1.0
                
                # Composite score: quality (60%) + speed bonus (20%) + reliability (20%)
                speed_bonus = max(0, (10 - avg_time) / 10) if avg_time > 0 else 0
                reliability_bonus = 1 - error_rate
                
                composite_score = (avg_quality * 0.6 + 
                                 speed_bonus * 0.2 + 
                                 reliability_bonus * 0.2)
                
                config_scores[config_id] = {
                    'composite_score': composite_score,
                    'avg_quality': avg_quality,
                    'avg_time': avg_time,
                    'error_rate': error_rate,
                    'results': results
                }
            else:
                # All failed
                config_scores[config_id] = {
                    'composite_score': 0.0,
                    'avg_quality': 0.0,
                    'avg_time': 0.0,
                    'error_rate': 1.0,
                    'results': results
                }
        
        # Sort configurations by composite score
        ranked_configs = sorted(
            config_scores.items(), 
            key=lambda x: x[1]['composite_score'], 
            reverse=True
        )
        
        if ranked_configs:
            print("\n   Top 5 Configurations:")
            for i, (config_id, metrics) in enumerate(ranked_configs[:5]):
                config_name = next((r.config_name for r in self.test_results if r.config_id == config_id), config_id)
                print(f"   {i+1}. {config_name}")
                print(f"      Composite Score: {metrics['composite_score']:.3f}")
                print(f"      Quality: {metrics['avg_quality']:.3f}")
                print(f"      Avg Time: {metrics['avg_time']:.2f}s")
                print(f"      Error Rate: {metrics['error_rate']*100:.1f}%")
        else:
            print("   No configurations to rank")
        
        # Query type analysis
        print(f"\n📝 Query Type Performance:")
        query_type_performance = defaultdict(list)
        
        for result in successful_results:
            if result.query_type:
                quality_score = self.evaluate_result_quality(result)['overall_quality_score']
                query_type_performance[result.query_type].append({
                    'quality': quality_score,
                    'time': result.response_time,
                    'length': result.answer_length
                })
        
        if query_type_performance:
            for query_type, performances in query_type_performance.items():
                quality_scores = [p['quality'] for p in performances]
                times = [p['time'] for p in performances]
                lengths = [p['length'] for p in performances]
                
                avg_quality = safe_mean(quality_scores)
                avg_time = safe_mean(times)
                avg_length = safe_mean(lengths)
                
                print(f"   {query_type.title()}: Quality {avg_quality:.3f}, "
                      f"Time {avg_time:.2f}s, Length {avg_length:.0f} chars")
        else:
            print("   No query type data to analyze")
        
        # Save detailed results
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.save_detailed_results(ranked_configs, timestamp)
        self.save_csv_results(timestamp)
        
        print(f"\n💾 Results saved:")
        print(f"   📊 CSV: 'rag_test_results_{timestamp}.csv' (for Excel/analysis)")
        print(f"   📋 JSON: 'rag_test_results_{timestamp}.json' (detailed data)")
        print("="*80)
    
    def save_detailed_results(self, ranked_configs, timestamp):
        """Save comprehensive test results to JSON file."""
        filename = f'rag_test_results_{timestamp}.json'
        
        # Prepare data for JSON serialization
        results_data = {
            'test_metadata': {
                'timestamp': timestamp,
                'total_tests': len(self.test_results),
                'total_configs': len(self.config_performance),
                'queries_per_config': len(self.test_queries),
                'test_queries': self.test_queries,
                'quality_thresholds': self.quality_thresholds,
                'interrupted': self.interrupted
            },
            'configuration_rankings': [
                {
                    'rank': i + 1,
                    'config_id': config_id,
                    'config_name': next((r.config_name for r in self.test_results if r.config_id == config_id), config_id),
                    'composite_score': metrics['composite_score'],
                    'avg_quality': metrics['avg_quality'],
                    'avg_time': metrics['avg_time'],
                    'error_rate': metrics['error_rate']
                }
                for i, (config_id, metrics) in enumerate(ranked_configs)
            ],
            'detailed_results': [
                {
                    **asdict(result),
                    'quality_evaluation': self.evaluate_result_quality(result) if not result.error else None
                }
                for result in self.test_results
            ]
        }
        
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(results_data, f, indent=2, ensure_ascii=False)
            print(f"✅ Results saved to {filename}")
        except Exception as e:
            print(f"❌ Failed to save results: {e}")
    
    def save_csv_results(self, timestamp):
        """Save test results to CSV format for easy analysis."""
        csv_filename = f'rag_test_results_{timestamp}.csv'
        
        try:
            # Prepare data for CSV
            csv_data = []
            
            for result in self.test_results:
                # Get quality evaluation
                quality_eval = self.evaluate_result_quality(result) if not result.error else {}
                
                # Create comprehensive row
                row = {
                    # Configuration info
                    'config_id': result.config_id,
                    'config_name': result.config_name,
                    'query': result.query,
                    'query_type': result.query_type,
                    
                    # Performance metrics
                    'response_time_sec': result.response_time,
                    'answer_length_chars': result.answer_length,
                    'word_count': result.word_count,
                    'sentence_count': result.sentence_count,
                    'sources_count': result.sources_count,
                    'chunks_used': result.chunks_used,
                    'cached': result.cached,
                    'error': result.error or '',
                    
                    # Similarity metrics
                    'avg_similarity': result.avg_similarity,
                    'max_similarity': result.max_similarity,
                    'min_similarity': result.min_similarity,
                    'similarity_variance': result.similarity_variance,
                    
                    # Quality metrics
                    'avg_quality_score': result.avg_quality,
                    'avg_cross_encoder_score': result.avg_cross_encoder,
                    
                    # Content analysis
                    'technical_terms_count': result.technical_terms_count,
                    'numerical_data_count': result.numerical_data_count,
                    'concrete_examples': result.concrete_examples,
                    'information_density': result.information_density,
                    
                    # Quality evaluation flags
                    'meets_min_length': quality_eval.get('meets_min_length', False),
                    'meets_min_words': quality_eval.get('meets_min_words', False),
                    'has_sources': quality_eval.get('has_sources', False),
                    'reasonable_response_time': quality_eval.get('reasonable_response_time', False),
                    'no_error': quality_eval.get('no_error', False),
                    'good_similarity': quality_eval.get('good_similarity', False),
                    'technical_content': quality_eval.get('technical_content', False),
                    'information_dense': quality_eval.get('information_dense', False),
                    'has_examples': quality_eval.get('has_examples', False),
                    'has_numbers': quality_eval.get('has_numbers', False),
                    
                    # Composite quality scores
                    'basic_quality_score': quality_eval.get('basic_quality_score', 0.0),
                    'advanced_quality_score': quality_eval.get('advanced_quality_score', 0.0),
                    'overall_quality_score': quality_eval.get('overall_quality_score', 0.0),
                    
                    # Raw data (truncated for CSV)
                    'answer_preview': result.answer[:200] + ('...' if len(result.answer) > 200 else '') if result.answer else '',
                    'similarity_scores_list': str(result.similarity_scores),
                    'quality_scores_list': str(result.quality_scores),
                    'cross_encoder_scores_list': str(result.cross_encoder_scores)
                }
                
                csv_data.append(row)
            
            # Create DataFrame and save
            df = pd.DataFrame(csv_data)
            
            # Reorder columns for better readability
            column_order = [
                'config_name', 'config_id', 'query', 'query_type',
                'overall_quality_score', 'basic_quality_score', 'advanced_quality_score',
                'response_time_sec', 'answer_length_chars', 'word_count',
                'sources_count', 'chunks_used', 'avg_similarity', 'max_similarity',
                'technical_terms_count', 'numerical_data_count', 'information_density',
                'no_error', 'meets_min_length', 'has_sources', 'good_similarity',
                'technical_content', 'has_examples', 'has_numbers',
                'cached', 'error', 'answer_preview'
            ]
            
            # Keep only columns that exist
            available_columns = [col for col in column_order if col in df.columns]
            remaining_columns = [col for col in df.columns if col not in available_columns]
            final_column_order = available_columns + remaining_columns
            
            df = df[final_column_order]
            
            # Save to CSV
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            
            # Create summary CSV
            summary_filename = f'rag_config_summary_{timestamp}.csv'
            self.save_config_summary_csv(summary_filename)
            
            print(f"✅ CSV results saved to {csv_filename}")
            print(f"✅ Config summary saved to {summary_filename}")
            
        except Exception as e:
            print(f"❌ Failed to save CSV results: {e}")
            import traceback
            traceback.print_exc()
    
    def save_config_summary_csv(self, filename):
        """Save configuration performance summary to CSV."""
        try:
            summary_data = []
            
            for config_id, results in self.config_performance.items():
                successful = [r for r in results if not r.error]
                config_name = results[0].config_name if results else config_id
                
                if successful:
                    # Calculate metrics with safe operations
                    response_times = [r.response_time for r in successful]
                    quality_scores = [
                        self.evaluate_result_quality(r)['overall_quality_score'] 
                        for r in successful
                    ]
                    answer_lengths = [r.answer_length for r in successful]
                    word_counts = [r.word_count for r in successful]
                    sources_counts = [r.sources_count for r in successful]
                    similarity_scores = [r.avg_similarity for r in successful if r.avg_similarity > 0]
                    technical_terms = [r.technical_terms_count for r in successful]
                    numerical_data = [r.numerical_data_count for r in successful]
                    info_densities = [r.information_density for r in successful]
                    
                    avg_time = safe_mean(response_times)
                    avg_quality = safe_mean(quality_scores)
                    avg_length = safe_mean(answer_lengths)
                    avg_word_count = safe_mean(word_counts)
                    avg_sources = safe_mean(sources_counts)
                    avg_similarity = safe_mean(similarity_scores)
                    avg_technical_terms = safe_mean(technical_terms)
                    avg_numerical_data = safe_mean(numerical_data)
                    avg_info_density = safe_mean(info_densities)
                    
                    error_rate = (len(results) - len(successful)) / len(results) if len(results) > 0 else 1.0
                    
                    # Composite score calculation
                    speed_bonus = max(0, (10 - avg_time) / 10) if avg_time > 0 else 0
                    reliability_bonus = 1 - error_rate
                    composite_score = (avg_quality * 0.6 + speed_bonus * 0.2 + reliability_bonus * 0.2)
                    
                    summary_data.append({
                        'config_name': config_name,
                        'config_id': config_id,
                        'composite_score': composite_score,
                        'overall_quality_score': avg_quality,
                        'avg_response_time_sec': avg_time,
                        'error_rate_percent': error_rate * 100,
                        'avg_answer_length': avg_length,
                        'avg_word_count': avg_word_count,
                        'avg_sources_used': avg_sources,
                        'avg_similarity_score': avg_similarity,
                        'avg_technical_terms': avg_technical_terms,
                        'avg_numerical_data': avg_numerical_data,
                        'avg_information_density': avg_info_density,
                        'successful_queries': len(successful),
                        'total_queries': len(results)
                    })
                else:
                    # All failed
                    summary_data.append({
                        'config_name': config_name,
                        'config_id': config_id,
                        'composite_score': 0.0,
                        'overall_quality_score': 0.0,
                        'avg_response_time_sec': 0.0,
                        'error_rate_percent': 100.0,
                        'avg_answer_length': 0,
                        'avg_word_count': 0,
                        'avg_sources_used': 0,
                        'avg_similarity_score': 0,
                        'avg_technical_terms': 0,
                        'avg_numerical_data': 0,
                        'avg_information_density': 0,
                        'successful_queries': 0,
                        'total_queries': len(results)
                    })
            
            # Sort by composite score
            summary_data.sort(key=lambda x: x['composite_score'], reverse=True)
            
            # Add ranking
            for i, config in enumerate(summary_data):
                config['rank'] = i + 1
            
            # Create DataFrame and save
            df_summary = pd.DataFrame(summary_data)
            
            # Reorder columns
            column_order = [
                'rank', 'config_name', 'composite_score', 'overall_quality_score',
                'avg_response_time_sec', 'error_rate_percent', 'avg_word_count',
                'avg_sources_used', 'avg_similarity_score', 'avg_technical_terms',
                'avg_numerical_data', 'avg_information_density', 'successful_queries', 'total_queries'
            ]
            
            df_summary = df_summary[column_order]
            df_summary.to_csv(filename, index=False, encoding='utf-8')
            
        except Exception as e:
            print(f"❌ Failed to save config summary CSV: {e}")

def main():
    """Print the run banner, execute the full grid test suite, then clean up.

    A Ctrl+C or any other exception raised by the suite is reported on stdout
    instead of propagating; a garbage-collection pass always runs at the end.
    """
    banner_lines = (
        "🚀 RAG Grid Test - Enhanced Stability Version",
        "🔧 This will test 12 LLM configurations with 5 queries each (60 total tests)",
        "⏱️  Estimated time: 15-20 minutes",
        "🛡️  Enhanced error handling for broken pipe and connection issues",
        "⚠️  Press Ctrl+C to interrupt gracefully and save partial results",
        "",  # trailing blank line before the suite's own output
    )
    for banner_line in banner_lines:
        print(banner_line)

    try:
        grid_tester = RAGGridTester()
        grid_tester.run_full_test_suite()

        print("\n🎉 Grid test completed successfully!")
        print("📊 Check the generated JSON and CSV files for detailed results")

    except KeyboardInterrupt:
        print("\n⚠️  Test interrupted by user")
        print("📊 Partial results have been saved if any tests completed")
    except Exception as exc:
        print(f"\n❌ Test failed with error: {exc}")
        traceback.print_exc()
    finally:
        # Cleanup runs on success, interruption and failure alike.
        print("🧹 Final cleanup...")
        gc.collect()

# Run the grid test only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

✅ Successfully imported RAG system
🚀 RAG Grid Test - Enhanced Stability Version
🔧 This will test 12 LLM configurations with 5 queries each (60 total tests)
⏱️  Estimated time: 15-20 minutes
🛡️  Enhanced error handling for broken pipe and connection issues
⚠️  Press Ctrl+C to interrupt gracefully and save partial results

🚀 Starting Enhanced RAG Grid Test Suite
📋 Testing 12 configurations with 5 queries each
📊 Total tests: 60
⚠️  Press Ctrl+C to interrupt gracefully

🔧 Configuration 1/12: Temp 0.1, Top-P 0.8
   Focus: temperature_top_p_balance
   Settings: T=0.1, P=0.8, K=15, Tokens=400
🔧 Setting up RAG (attempt 1/5)...
   📡 Creating RAG instance...
   🚀 Initializing RAG components...
🔧 Loading attempt 1 for all-MiniLM-L6-v2 on cpu
  Loading model on CPU first...
  Fixing meta tensors...
  Testing model with sample input...
  Verifying no meta tensors remain...
  ✅ Model loaded successfully on cpu
🔧 Loading CrossEncoder attempt 1 on cpu
  Loading CrossEncoder on CPU first...
  Fixing Cr