# QdrantRAG-Pro Analysis Notebook

This notebook provides comprehensive analysis and experimentation capabilities for the QdrantRAG-Pro system.

## Features
- System performance analysis
- Search quality evaluation
- Embedding visualization
- Response quality assessment
- Comparative analysis of different configurations

In [None]:
# Import required libraries
import sys
import os
import asyncio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import time
from typing import List, Dict, Any

# Add project root to path
# NOTE(review): this resolves to the parent of the current working directory, so it
# presumably assumes the notebook is started from a direct sub-folder of the project — confirm.
project_root = Path().absolute().parent
# Inserted at position 0 so this directory is searched first by the `core.*` imports that follow.
sys.path.insert(0, str(project_root))

# Import QdrantRAG-Pro components
from core.config.settings import Settings
from core.database.qdrant_client import QdrantManager
from core.database.document_store import DocumentStore
from core.services.embedding_service import EmbeddingService
from core.services.search_engine import HybridSearchEngine
from core.services.response_generator import ResponseGenerator
from core.models.document import Document, DocumentMetadata

# Set up plotting
plt.style.use('default')
sns.set_palette("husl")
# IPython magic, not plain Python: this line only works inside an IPython/Jupyter kernel.
%matplotlib inline

## System Initialization

In [None]:
# Build the service objects; each one is configured from the same Settings instance
settings = Settings()
qdrant_manager = QdrantManager(settings)
embedding_service = EmbeddingService(settings)
document_store = DocumentStore(qdrant_manager, settings)
search_engine = HybridSearchEngine(qdrant_manager, embedding_service, settings)
response_generator = ResponseGenerator(settings)

# Confirm start-up and echo the collection and embedding model in use
print(
    "✅ System components initialized",
    f"📊 Collection: {settings.qdrant_collection_name}",
    f"🤖 Model: {settings.embedding_model}",
    sep="\n",
)

## System Status Check

In [None]:
# Collect connectivity, collection, document and cache information up front
health_status = qdrant_manager.health_check()
collection_info = qdrant_manager.get_collection_info()
document_count = document_store.get_document_count()
cache_stats = embedding_service.get_cache_stats()

# Connectivity
connection_label = '✅ Connected' if health_status else '❌ Disconnected'
print(f"🔌 Qdrant Health: {connection_label}")

# Collection details (only available when the collection exists)
if not collection_info:
    print("❌ Collection not found")
else:
    print(f"📚 Total Vectors: {collection_info.vectors_count:,}")
    print(f"📄 Main Documents: {document_count:,}")
    vector_params = collection_info.config.params.vectors
    print(f"📐 Vector Dimension: {vector_params.size}")
    print(f"📏 Distance Metric: {vector_params.distance}")

# Embedding cache usage
cache_enabled = cache_stats.get('cache_enabled', False)
if not cache_enabled:
    print("💾 Cache: Disabled")
else:
    print(f"💾 Cache: {cache_stats['size']}/{cache_stats['max_size']} entries")

## Search Performance Analysis

In [None]:
# Benchmark queries: a mix of short keyword phrases and natural-language questions
test_queries = [
    "vector database performance",
    "How does RAG improve language models?",
    "API v2.0 configuration",
    "machine learning embeddings",
    "What is hybrid search?",
    "production deployment best practices",
    "OpenAI embedding models",
    "similarity search algorithms",
    "information retrieval techniques",
    "LLM integration strategies",
]

query_total = len(test_queries)
print(f"📝 Prepared {query_total} test queries for analysis")

In [None]:
# Perform search performance analysis
async def analyze_search_performance(queries: List[str], weight_configs: List[Dict[str, float]]):
    """Benchmark the search engine over a grid of weight configurations.

    Every query is run once per configuration with ``auto_adjust_weights``
    disabled and a result limit of 10.

    Args:
        queries: Query strings to send to ``search_engine.search``.
        weight_configs: Dicts holding a ``'vector'`` and a ``'keyword'`` weight.

    Returns:
        A DataFrame with one row per (configuration, query) pair and the
        columns ``config``, ``vector_weight``, ``keyword_weight``, ``query``,
        ``search_time`` (seconds), ``result_count``, ``avg_score`` and
        ``max_score``. The columns are present even when no rows were
        produced, so downstream ``groupby('config')`` calls do not fail.
    """
    columns = [
        'config', 'vector_weight', 'keyword_weight', 'query',
        'search_time', 'result_count', 'avg_score', 'max_score',
    ]
    rows = []

    for config in weight_configs:
        vector_weight = config['vector']
        keyword_weight = config['keyword']
        config_name = f"V{vector_weight:.1f}_K{keyword_weight:.1f}"
        print(f"Testing configuration: {config_name}")

        for query in queries:
            # perf_counter is monotonic and high resolution; time.time() can
            # jump when the wall clock is adjusted mid-measurement.
            started = time.perf_counter()

            search_results = await search_engine.search(
                query=query,
                limit=10,
                vector_weight=vector_weight,
                keyword_weight=keyword_weight,
                auto_adjust_weights=False
            )

            search_time = time.perf_counter() - started

            # Extract the scores once; a query without results scores 0.
            scores = [r.combined_score for r in search_results] if search_results else []

            rows.append({
                'config': config_name,
                'vector_weight': vector_weight,
                'keyword_weight': keyword_weight,
                'query': query,
                'search_time': search_time,
                'result_count': len(search_results),
                'avg_score': np.mean(scores) if scores else 0,
                'max_score': max(scores) if scores else 0
            })

    return pd.DataFrame(rows, columns=columns)

# Define weight configurations to test
weight_configs = [
    {'vector': 1.0, 'keyword': 0.0},  # Pure vector search
    {'vector': 0.8, 'keyword': 0.2},  # Vector-heavy
    {'vector': 0.7, 'keyword': 0.3},  # Default
    {'vector': 0.5, 'keyword': 0.5},  # Balanced
    {'vector': 0.3, 'keyword': 0.7},  # Keyword-heavy
    {'vector': 0.0, 'keyword': 1.0},  # Pure keyword search
]

# Run analysis
performance_df = await analyze_search_performance(test_queries[:5], weight_configs)
print(f"✅ Completed performance analysis with {len(performance_df)} data points")

In [None]:
# Visualize search performance: three per-configuration bar charts plus a
# box plot of the per-query average scores.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Average search time by configuration
time_by_config = performance_df.groupby('config')['search_time'].mean()
axes[0, 0].bar(time_by_config.index, time_by_config.values)
axes[0, 0].set_title('Average Search Time by Configuration')
axes[0, 0].set_ylabel('Time (seconds)')
axes[0, 0].tick_params(axis='x', rotation=45)

# Average score by configuration
score_by_config = performance_df.groupby('config')['avg_score'].mean()
axes[0, 1].bar(score_by_config.index, score_by_config.values)
axes[0, 1].set_title('Average Search Score by Configuration')
axes[0, 1].set_ylabel('Average Score')
axes[0, 1].tick_params(axis='x', rotation=45)

# Result count by configuration
count_by_config = performance_df.groupby('config')['result_count'].mean()
axes[1, 0].bar(count_by_config.index, count_by_config.values)
axes[1, 0].set_title('Average Result Count by Configuration')
axes[1, 0].set_ylabel('Result Count')
axes[1, 0].tick_params(axis='x', rotation=45)

# Score distribution: one box per configuration, in order of first appearance
config_names = performance_df['config'].unique()
score_groups = [
    performance_df.loc[performance_df['config'] == name, 'avg_score'].values
    for name in config_names
]
axes[1, 1].boxplot(score_groups)
# Label the boxes through the axis instead of boxplot(labels=...): that keyword
# is deprecated since Matplotlib 3.9, and its replacement (tick_labels) does
# not exist in older releases.
axes[1, 1].set_xticklabels(config_names)
axes[1, 1].set_title('Score Distribution by Configuration')
axes[1, 1].set_ylabel('Average Score')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Display summary statistics (mean/std per configuration, rounded to 4 decimals)
print("\n📊 Performance Summary:")
summary = performance_df.groupby('config').agg({
    'search_time': ['mean', 'std'],
    'avg_score': ['mean', 'std'],
    'result_count': 'mean'
}).round(4)
print(summary)

## Response Quality Analysis

In [None]:
# Analyze response generation quality
async def analyze_response_quality(questions: List[str]):
    """Generate an answer for each question and record its quality metrics.

    For every question the top 5 search results are retrieved, a response is
    generated from them and then passed to
    ``response_generator.validate_response_quality``. Questions whose search
    returns nothing are reported and left out of the result.

    Args:
        questions: Questions to answer.

    Returns:
        A DataFrame with one row per answered question and the columns
        ``question``, ``confidence_score``, ``confidence_level``,
        ``source_coverage``, ``sources_used``, ``processing_time``,
        ``answer_length`` (whitespace-separated word count),
        ``reasoning_steps``, ``overall_quality``, ``needs_review`` and
        ``has_limitations``. The columns are present even when no question
        could be answered, so downstream column access does not fail.
    """
    columns = [
        'question', 'confidence_score', 'confidence_level', 'source_coverage',
        'sources_used', 'processing_time', 'answer_length', 'reasoning_steps',
        'overall_quality', 'needs_review', 'has_limitations',
    ]
    rows = []

    for question in questions:
        print(f"Processing: {question[:50]}...")

        # Search for relevant documents
        search_results = await search_engine.search(question, limit=5)
        if not search_results:
            # Make the skip visible instead of silently dropping the question.
            print("  ⚠️ No search results found - question skipped")
            continue

        # Generate the answer and score it
        response = await response_generator.generate_response(question, search_results)
        quality_metrics = response_generator.validate_response_quality(response)

        rows.append({
            'question': question,
            'confidence_score': response.confidence_score,
            'confidence_level': response.confidence_level,
            'source_coverage': response.source_coverage,
            'sources_used': len(response.sources_used),
            'processing_time': response.processing_time,
            'answer_length': len(response.answer.split()),
            'reasoning_steps': len(response.reasoning_steps),
            'overall_quality': quality_metrics['overall_quality'],
            'needs_review': response.needs_review,
            'has_limitations': bool(response.limitations)
        })

    return pd.DataFrame(rows, columns=columns)

# Test questions for response quality analysis
test_questions = [
    "What is Qdrant and how does it work?",
    "How can I improve search performance in vector databases?",
    "What are the benefits of hybrid search over pure vector search?",
    "How do I deploy a RAG system in production?",
    "What embedding model should I use for my application?"
]

# Run response quality analysis
response_df = await analyze_response_quality(test_questions)
print(f"\n✅ Completed response quality analysis for {len(response_df)} questions")

In [None]:
# Plot the response quality metrics on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Top row: one histogram per metric, given as (column, title, x-axis label)
histogram_panels = (
    ('confidence_score', 'Confidence Score Distribution', 'Confidence Score'),
    ('source_coverage', 'Source Coverage Distribution', 'Source Coverage'),
    ('processing_time', 'Processing Time Distribution', 'Processing Time (seconds)'),
)
for panel, (metric, title, xlabel) in zip(axes[0], histogram_panels):
    panel.hist(response_df[metric], bins=10, alpha=0.7)
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Frequency')

# Bottom row, first two panels: each metric plotted against the confidence score
scatter_panels = (
    ('answer_length', 'Answer Length vs Confidence', 'Answer Length (words)'),
    ('sources_used', 'Sources Used vs Confidence', 'Number of Sources Used'),
)
for panel, (metric, title, xlabel) in zip(axes[1], scatter_panels):
    panel.scatter(response_df[metric], response_df['confidence_score'])
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Confidence Score')

# Bottom row, last panel: share of each overall quality label
quality_counts = response_df['overall_quality'].value_counts()
quality_panel = axes[1, 2]
quality_panel.pie(quality_counts.values, labels=quality_counts.index, autopct='%1.1f%%')
quality_panel.set_title('Overall Quality Distribution')

plt.tight_layout()
plt.show()

# Print the headline numbers
answer_total = len(response_df)
mean_confidence = response_df['confidence_score'].mean()
mean_coverage = response_df['source_coverage'].mean()
mean_processing_time = response_df['processing_time'].mean()
review_total = response_df['needs_review'].sum()
limitation_total = response_df['has_limitations'].sum()

print("\n📊 Response Quality Summary:")
print(f"Average Confidence: {mean_confidence:.3f}")
print(f"Average Source Coverage: {mean_coverage:.3f}")
print(f"Average Processing Time: {mean_processing_time:.3f}s")
print(f"Responses Needing Review: {review_total}/{answer_total}")
print(f"Responses with Limitations: {limitation_total}/{answer_total}")

## Embedding Analysis

In [None]:
# Analyze embedding characteristics
async def analyze_embeddings(sample_texts: List[str]):
    """Embed the given texts and summarise the resulting vectors.

    Args:
        sample_texts: Texts to embed; must not be empty.

    Returns:
        A tuple ``(embeddings, stats, embedding_results)`` where
        ``embeddings`` is the array built from each result's ``embedding``,
        ``stats`` is a dict with ``dimension``, ``mean_norm``, ``std_norm``,
        ``mean_value``, ``std_value``, ``min_value`` and ``max_value``, and
        ``embedding_results`` are the raw results returned by the embedding
        service. All ``stats`` values are plain Python numbers so they can be
        serialised with ``json`` and formatted as ``float``.

    Raises:
        ValueError: If ``sample_texts`` is empty.
    """
    if not sample_texts:
        # Fail early with a clear message; an empty batch has no dimension to report.
        raise ValueError("sample_texts must contain at least one text")

    print(f"Generating embeddings for {len(sample_texts)} texts...")

    embedding_results = await embedding_service.create_embeddings_batch(sample_texts)
    embeddings = np.array([result.embedding for result in embedding_results])

    # L2 norm of every embedding, computed once and reused for mean and std.
    norms = np.linalg.norm(embeddings, axis=1)

    # Cast to built-in types: NumPy scalars other than float64 are neither
    # `float` instances nor JSON serialisable.
    stats = {
        'dimension': int(embeddings.shape[1]),
        'mean_norm': float(np.mean(norms)),
        'std_norm': float(np.std(norms)),
        'mean_value': float(np.mean(embeddings)),
        'std_value': float(np.std(embeddings)),
        'min_value': float(np.min(embeddings)),
        'max_value': float(np.max(embeddings))
    }

    return embeddings, stats, embedding_results

# Sample texts for embedding analysis
sample_texts = [
    "Vector databases enable efficient similarity search",
    "Machine learning models require large datasets",
    "Natural language processing transforms text data",
    "Artificial intelligence revolutionizes technology",
    "Data science combines statistics and programming"
]

embeddings, embedding_stats, embedding_results = await analyze_embeddings(sample_texts)
print(f"\n✅ Generated embeddings with dimension {embedding_stats['dimension']}")

In [None]:
# Visualize embedding characteristics on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Embedding norms
norms = np.linalg.norm(embeddings, axis=1)
axes[0, 0].bar(range(len(norms)), norms)
axes[0, 0].set_title('Embedding Vector Norms')
axes[0, 0].set_xlabel('Text Index')
axes[0, 0].set_ylabel('L2 Norm')

# Embedding value distribution
axes[0, 1].hist(embeddings.flatten(), bins=50, alpha=0.7)
axes[0, 1].set_title('Embedding Value Distribution')
axes[0, 1].set_xlabel('Embedding Value')
axes[0, 1].set_ylabel('Frequency')

# Similarity matrix
# Normalise each row to unit length first: a raw dot product equals the cosine
# similarity promised by the title only when the vectors are unit length.
safe_norms = np.where(norms > 0, norms, 1.0)  # all-zero vectors stay zero instead of dividing by 0
unit_embeddings = embeddings / safe_norms[:, np.newaxis]
similarity_matrix = np.dot(unit_embeddings, unit_embeddings.T)
im = axes[1, 0].imshow(similarity_matrix, cmap='viridis')
axes[1, 0].set_title('Cosine Similarity Matrix')
axes[1, 0].set_xlabel('Text Index')
axes[1, 0].set_ylabel('Text Index')
plt.colorbar(im, ax=axes[1, 0])

# Processing time vs token count
token_counts = [result.token_count for result in embedding_results]
processing_times = [result.processing_time for result in embedding_results]
axes[1, 1].scatter(token_counts, processing_times)
axes[1, 1].set_title('Processing Time vs Token Count')
axes[1, 1].set_xlabel('Token Count')
axes[1, 1].set_ylabel('Processing Time (seconds)')

plt.tight_layout()
plt.show()

# Display embedding statistics
print("\n📊 Embedding Statistics:")
for key, value in embedding_stats.items():
    # np.floating covers NumPy floats such as float32, which are not `float` instances.
    if isinstance(value, (float, np.floating)):
        print(f"{key}: {value:.6f}")
    else:
        print(f"{key}: {value}")

## System Recommendations

In [None]:
# Generate system recommendations based on analysis
def generate_recommendations(performance_df, response_df, embedding_stats):
    """Generate system optimization recommendations.

    Args:
        performance_df: Search benchmark rows with the columns
            ``vector_weight``, ``keyword_weight``, ``avg_score`` and
            ``search_time``. May be empty.
        response_df: Response quality rows with the columns
            ``confidence_score`` and ``source_coverage``. May be empty.
        embedding_stats: Dict containing at least ``'std_norm'``.

    Returns:
        A list of human-readable recommendation strings. Sections whose
        input DataFrame is empty contribute nothing.
    """
    recommendations = []

    # Performance recommendations
    if not performance_df.empty:
        # Rank configurations by their mean score over all queries; picking the
        # single best row would let one outlier query decide the "optimal" setup.
        # sort=False keeps the tested order so ties go to the first configuration.
        mean_score_by_weights = performance_df.groupby(
            ['vector_weight', 'keyword_weight'], sort=False
        )['avg_score'].mean()
        best_vector, best_keyword = mean_score_by_weights.idxmax()
        recommendations.append(
            f"🎯 Optimal search configuration: Vector weight {best_vector:.1f}, "
            f"Keyword weight {best_keyword:.1f}"
        )

        avg_search_time = performance_df['search_time'].mean()
        if avg_search_time > 1.0:
            recommendations.append(
                f"⚡ Consider optimizing search performance (current avg: {avg_search_time:.3f}s)"
            )

    # Response quality recommendations
    if not response_df.empty:
        low_confidence_count = (response_df['confidence_score'] < 0.7).sum()
        if low_confidence_count > 0:
            recommendations.append(
                f"🔍 {low_confidence_count} responses have low confidence - consider improving source quality"
            )

        avg_source_coverage = response_df['source_coverage'].mean()
        if avg_source_coverage < 0.6:
            recommendations.append(
                f"📚 Low source coverage ({avg_source_coverage:.2f}) - consider expanding knowledge base"
            )

    # Embedding recommendations
    if embedding_stats['std_norm'] > 0.1:
        recommendations.append(
            "📐 High variance in embedding norms - consider normalization"
        )

    return recommendations

# Derive the recommendations from the search, response and embedding analyses
recommendations = generate_recommendations(performance_df, response_df, embedding_stats)

# Print a numbered list, or a success note when nothing was recommended
print("\n🎯 System Optimization Recommendations:")
print("=" * 50)
if recommendations:
    for position, recommendation in enumerate(recommendations, start=1):
        print(f"{position}. {recommendation}")
else:
    print("✅ System is performing optimally based on current analysis!")

## Export Analysis Results

In [None]:
# Export analysis results
output_dir = Path("../data/processed")
# parents=True also creates ../data when it is missing; without it mkdir
# raises FileNotFoundError on a fresh checkout.
output_dir.mkdir(parents=True, exist_ok=True)

# Save performance analysis
performance_df.to_csv(output_dir / "search_performance_analysis.csv", index=False)
print(f"💾 Saved search performance analysis to {output_dir / 'search_performance_analysis.csv'}")

# Save response quality analysis
response_df.to_csv(output_dir / "response_quality_analysis.csv", index=False)
print(f"💾 Saved response quality analysis to {output_dir / 'response_quality_analysis.csv'}")

# Save embedding statistics
# Unwrap NumPy scalars into built-in numbers: json cannot serialise every
# NumPy scalar type (for example float32).
serializable_stats = {
    key: value.item() if isinstance(value, np.generic) else value
    for key, value in embedding_stats.items()
}
with open(output_dir / "embedding_statistics.json", "w", encoding="utf-8") as f:
    json.dump(serializable_stats, f, indent=2)
print(f"💾 Saved embedding statistics to {output_dir / 'embedding_statistics.json'}")

# Save recommendations
# Explicit UTF-8: the recommendation texts contain emoji, which a platform
# default encoding such as cp1252 cannot encode.
with open(output_dir / "system_recommendations.txt", "w", encoding="utf-8") as f:
    f.write("QdrantRAG-Pro System Recommendations\n")
    f.write("=" * 40 + "\n\n")
    for i, rec in enumerate(recommendations, 1):
        f.write(f"{i}. {rec}\n")
print(f"💾 Saved recommendations to {output_dir / 'system_recommendations.txt'}")

print("\n✅ Analysis complete! All results exported to data/processed/")