# 06. End-to-End Workflow

This notebook demonstrates the complete workflow:
1. User asks a question in natural language
2. RAG retrieves relevant context
3. Text2SQL generates SQL query
4. Query is executed
5. Results are visualized
6. Everything is logged for monitoring

In [None]:
import sys
sys.path.append('/workspace')

from src.utils.db_utils import DatabaseConnection, get_database_context
from src.utils.text2sql_utils import execute_text2sql
from src.utils.embedding_utils import search_similar_documents
from src.utils.viz_utils import auto_visualize
import pandas as pd
import time

print("✓ Libraries imported successfully")

## Complete Pipeline Function

In [None]:
def complete_text2sql_workflow(user_question: str, use_rag: bool = True,
                               visualize: bool = True):
    """
    Run the end-to-end text2sql demo pipeline for a single question.

    The pipeline prints its progress and performs five steps:
      1. (optional) look up documents similar to the question,
      2. load the database schema context,
      3. generate and execute SQL via ``execute_text2sql`` (with
         ``log_execution=True``),
      4. display the returned rows,
      5. (optional) build a visualization when more than one row came back.

    Note: in this function the retrieved documents and the schema text are
    only reported to the user; they are not passed on to
    ``execute_text2sql``.

    Args:
        user_question: Natural-language question to answer.
        use_rag: When True, run the document-similarity lookup (step 1).
        visualize: When True, attempt a visualization of multi-row results.

    Returns:
        The result object returned by ``execute_text2sql`` on success, or
        ``None`` when the result reports ``success`` as falsy.
    """
    banner = "=" * 80
    print(banner)
    print(f"User Question: {user_question}")
    print(banner)

    db = DatabaseConnection()
    start_time = time.time()

    # Step 1: RAG - Retrieve relevant context
    if use_rag:
        print("\n[1/5] Retrieving relevant documentation...")
        docs = search_similar_documents(db, user_question, limit=2)
        if docs:
            print(f"  Found {len(docs)} relevant documents:")
            # Each hit is unpacked as a 4-tuple; only title and similarity are shown.
            for _, title, _, similarity in docs:
                print(f"    - {title} (similarity: {similarity:.4f})")
        else:
            # Previously this case printed nothing, which looked like a hang/skip.
            print("  No relevant documents found")

    # Step 2: Get database schema
    print("\n[2/5] Loading database schema...")
    schema = get_database_context()
    print(f"  Schema loaded ({len(schema)} characters)")

    # Step 3: Generate and execute SQL
    print("\n[3/5] Generating SQL query...")
    result = execute_text2sql(db, user_question, log_execution=True)

    if not result['success']:
        print(f"  ✗ Error: {result['error']}")
        return None

    print("  ✓ SQL generated successfully")
    print(f"\n  Generated SQL:\n  {result['sql_query']}")

    # Step 4: Display results
    print("\n[4/5] Query executed successfully")
    print(f"  Execution time: {result['execution_time_ms']} ms")
    print(f"  Rows returned: {result['row_count']}")

    print("\n  Results:")
    display(result['results'])

    # Step 5: Visualize (only meaningful with more than one row)
    if visualize and result['row_count'] > 1:
        print("\n[5/5] Creating visualization...")
        viz = auto_visualize(result['results'], user_question)
        if viz is None:
            # Do not claim success when nothing was returned to show.
            print("  ⚠ No visualization was returned for this result")
        else:
            if hasattr(viz, 'show'):
                viz.show()
            print("  ✓ Visualization created")

    total_time = (time.time() - start_time) * 1000
    print(f"\n{banner}")
    print(f"Total pipeline time: {total_time:.2f} ms")
    print(f"{banner}\n")

    return result

## Test Cases

In [None]:
# Test 1: Simple aggregation
# Bind the return value to a name: the workflow already displays the results,
# so leaving the call as the cell's last expression would echo the whole
# result object a second time. The value is None if the workflow failed.
sales_by_region_result = complete_text2sql_workflow(
    "What is the total sales amount by region?",
    use_rag=True,
    visualize=True
)

In [None]:
# Test 2: Join with filtering
# Bind the return value to a name: the workflow already displays the results,
# so leaving the call as the cell's last expression would echo the whole
# result object a second time. The value is None if the workflow failed.
engineering_salaries_result = complete_text2sql_workflow(
    "Show me all employees in the Engineering department with their salaries",
    use_rag=True,
    visualize=True
)

In [None]:
# Test 3: Grouped aggregation (average per department)
# Bind the return value to a name: the workflow already displays the results,
# so leaving the call as the cell's last expression would echo the whole
# result object a second time. The value is None if the workflow failed.
avg_salary_by_department_result = complete_text2sql_workflow(
    "What is the average salary by department?",
    use_rag=True,
    visualize=True
)

In [None]:
# Test 4: Multiple joins
# Bind the return value to a name: the workflow already displays the results,
# so leaving the call as the cell's last expression would echo the whole
# result object a second time. The value is None if the workflow failed.
active_projects_result = complete_text2sql_workflow(
    "List all active projects with their departments and team members",
    use_rag=True,
    visualize=False  # May not be suitable for visualization
)

## Performance Monitoring

In [None]:
# Check recent query performance
# Opens a connection that the statistics cell below also reuses as `db`.
db = DatabaseConnection()

# Ten most recent rows of query_history, newest first (ORDER BY created_at DESC).
perf_query = """
SELECT 
    natural_language_query,
    execution_success,
    execution_time_ms,
    result_count,
    created_at
FROM query_history
ORDER BY created_at DESC
LIMIT 10
"""

# Run the query through the connection's DataFrame helper and render it.
perf_df = db.execute_query_df(perf_query)
print("Recent Query Performance:")
display(perf_df)

In [None]:
# Performance statistics
# Aggregates over every row of query_history: row count, mean/min/max of
# execution_time_ms, and success_rate as a percentage (successful rows divided
# by all rows, times 100). The `::float` cast is PostgreSQL-style syntax.
stats_query = """
SELECT 
    COUNT(*) as total_queries,
    AVG(execution_time_ms) as avg_time_ms,
    MIN(execution_time_ms) as min_time_ms,
    MAX(execution_time_ms) as max_time_ms,
    SUM(CASE WHEN execution_success THEN 1 ELSE 0 END)::float / COUNT(*) * 100 as success_rate
FROM query_history
"""

# Relies on `db` created in the preceding "recent query performance" cell.
stats_df = db.execute_query_df(stats_query)
print("\nOverall Statistics:")
display(stats_df)

## Integration with Langfuse (Optional)

In [None]:
# Example: Langfuse integration for monitoring
# The block below is intentionally left commented out; uncomment it once
# Langfuse is configured. Credentials are read from environment variables.
# The question/SQL/model values in the example are illustrative placeholders.
# NOTE(review): `langfuse.trace(...)` / `trace.generation(...)` appear to be
# the older SDK call style — confirm against the installed Langfuse version.

# from langfuse import Langfuse
# import os

# langfuse = Langfuse(
#     public_key=os.getenv('LANGFUSE_PUBLIC_KEY'),
#     secret_key=os.getenv('LANGFUSE_SECRET_KEY'),
#     host=os.getenv('LANGFUSE_HOST', 'http://localhost:3001')
# )

# trace = langfuse.trace(name="text2sql-query")
# generation = trace.generation(
#     name="sql-generation",
#     model="llama2",
#     input={"question": "What is the total sales?"},
#     output={"sql": "SELECT SUM(total_amount) FROM sales"}
# )

# The only code that actually runs in this cell: a reminder of how to enable it.
print("Langfuse integration example (commented out)")
print("Configure LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY in .env to enable")

## Summary

This notebook demonstrated:
- ✓ Complete end-to-end text2sql workflow
- ✓ RAG integration for context retrieval
- ✓ Automatic visualization of results
- ✓ Performance monitoring and logging
- ✓ Integration points for observability tools

## Next Steps

1. Experiment with different queries
2. Add more documents to the RAG system
3. Configure Langfuse for monitoring
4. Try different LLM models with Ollama
5. Build custom applications using these utilities