# 02. Embedding and RAG

This notebook covers:
- Generating embeddings for text
- Storing embeddings in PostgreSQL with pgvector
- Performing semantic search
- Building a simple RAG (Retrieval-Augmented Generation) system

## 1. Import Libraries

In [None]:
import sys

# Make the project root importable so the `src.*` imports below resolve.
# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
PROJECT_ROOT = '/workspace'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.utils.db_utils import DatabaseConnection
from src.utils.embedding_utils import (
    EmbeddingGenerator,
    chunk_text,
    store_document_with_embedding,
    search_similar_documents
)
import pandas as pd

print("✓ Libraries imported successfully")

## 2. Initialize Components

In [None]:
# Create the database helper used by the cells below
db = DatabaseConnection()

# Create the embedding generator and report which model and vector size it exposes
embedder = EmbeddingGenerator()
print(f"✓ Using embedding model: {embedder.model_name}")
print(f"✓ Embedding dimension: {embedder.embedding_dim}")

## 3. Generate Embeddings

In [None]:
# Three sample questions, embedded together in a single batch call
test_texts = [
    "What is the total revenue from sales?",
    "Show me all employees in the engineering department",
    "List all active projects",
]
embeddings = embedder.generate_embeddings(test_texts)

# Report how many vectors came back, then inspect the first one
print(f"Generated {len(embeddings)} embeddings")
first_embedding = embeddings[0]
print(f"Each embedding has {len(first_embedding)} dimensions")
print(f"\nFirst embedding (first 10 values): {first_embedding[:10]}")

## 4. Test Semantic Similarity

In [None]:
# Two revenue/sales questions and one employee question: query1 is compared
# against each of the other two.
query1 = "Show me the revenue data"
query2 = "What is the total sales amount?"
query3 = "List all employees"

# Embed each query separately
emb1 = embedder.generate_embedding(query1)
emb2 = embedder.generate_embedding(query2)
emb3 = embedder.generate_embedding(query3)

# Cosine similarity of query1 against query2 and against query3
sim_1_2 = embedder.cosine_similarity(emb1, emb2)
sim_1_3 = embedder.cosine_similarity(emb1, emb3)

print(f"Similarity between '{query1}' and '{query2}': {sim_1_2:.4f}")
print(f"Similarity between '{query1}' and '{query3}': {sim_1_3:.4f}")
print("\n💡 Higher scores indicate more semantic similarity")

## 5. Update Lexicon with Embeddings

In [None]:
# Fetch only the lexicon rows that still lack an embedding, so re-running this
# cell processes just what is left.
query = "SELECT * FROM lexicon WHERE embedding IS NULL"
lexicon_entries = db.execute_query(query)

print(f"Found {len(lexicon_entries)} lexicon entries without embeddings")

# The statement is identical for every row, so build it once outside the loop
update_query = "UPDATE lexicon SET embedding = %s WHERE lexicon_id = %s"

if lexicon_entries:
    # Reuse a single connection and cursor for the whole batch instead of
    # opening and closing one per row.
    conn = db.get_connection()
    try:
        with conn.cursor() as cursor:
            for entry in lexicon_entries:
                # Combine term and definition for better semantic representation
                text = f"{entry['term']}: {entry['definition']}"
                embedding = embedder.generate_embedding(text)

                cursor.execute(update_query, (embedding, entry['lexicon_id']))
                # Commit per row so rows already processed survive a later failure
                conn.commit()

                print(f"✓ Updated embedding for: {entry['term']}")
    finally:
        # Always release the connection, even if embedding or the UPDATE fails
        conn.close()

print("\n✓ All lexicon entries updated with embeddings")

## 6. Update Documents with Embeddings

In [None]:
# Fetch only the documents that still lack an embedding, so re-running this
# cell processes just what is left.
query = "SELECT * FROM documents WHERE embedding IS NULL"
documents = db.execute_query(query)

print(f"Found {len(documents)} documents without embeddings")

# The statement is identical for every row, so build it once outside the loop
update_query = "UPDATE documents SET embedding = %s WHERE document_id = %s"

if documents:
    # Reuse a single connection and cursor for the whole batch instead of
    # opening and closing one per row.
    conn = db.get_connection()
    try:
        with conn.cursor() as cursor:
            for doc in documents:
                # Use title and content for embedding
                text = f"{doc['title']}: {doc['content']}"
                embedding = embedder.generate_embedding(text)

                cursor.execute(update_query, (embedding, doc['document_id']))
                # Commit per row so rows already processed survive a later failure
                conn.commit()

                print(f"✓ Updated embedding for: {doc['title']}")
    finally:
        # Always release the connection, even if embedding or the UPDATE fails
        conn.close()

print("\n✓ All documents updated with embeddings")

## 7. Semantic Search - Lexicon

In [None]:
# Embed the question once; the same vector is bound to both placeholders below
search_query = "How do I convert natural language to SQL?"
query_embedding = embedder.generate_embedding(search_query)

# `<=>` is used both for ordering and, as 1 - distance, for the reported similarity
sql = """
SELECT 
    term,
    definition,
    category,
    1 - (embedding <=> %s::vector) as similarity
FROM lexicon
WHERE embedding IS NOT NULL
ORDER BY embedding <=> %s::vector
LIMIT 3
"""

results = db.execute_query(sql, (query_embedding, query_embedding))

print(f"Search query: '{search_query}'")
print("\nTop 3 relevant lexicon entries:\n")
for rank, row in enumerate(results, start=1):
    # One numbered entry per match, followed by a blank separator line
    entry_lines = [
        f"{rank}. {row['term']} (similarity: {row['similarity']:.4f})",
        f"   Category: {row['category']}",
        f"   Definition: {row['definition'][:100]}...",
        "",
    ]
    print("\n".join(entry_lines))

## 8. Semantic Search - Documents

In [None]:
# Let the shared helper embed the question and run the similarity query
search_query = "How to implement RAG system?"
results = search_similar_documents(db, search_query, limit=3)

print(f"Search query: '{search_query}'")
print("\nTop 3 relevant documents:\n")
for rank, hit in enumerate(results, start=1):
    # Each hit unpacks as (doc_id, title, content, similarity); the id is not shown
    _doc_id, title, content, similarity = hit
    print(f"{rank}. {title} (similarity: {similarity:.4f})")
    # Trailing newline leaves a blank line between entries
    print(f"   Content: {content[:150]}...\n")

## 9. Add New Document with Embedding

In [None]:
# Add a new document with automatic embedding
new_doc = {
    'title': 'Text2SQL Implementation Tips',
    'content': '''Key tips for implementing text2sql:
    1. Always include database schema in your prompt
    2. Use few-shot examples to improve accuracy
    3. Validate generated SQL before execution
    4. Handle errors gracefully
    5. Log all queries for monitoring and improvement
    6. Consider using semantic similarity for schema retrieval
    7. Test with various natural language phrasings
    ''',
    'doc_type': 'Guide',
    'metadata': {'category': 'text2sql', 'author': 'system'}
}

# Re-running this cell would otherwise insert the same document again, so look
# for an existing row with this title first.
# NOTE(review): assumes store_document_with_embedding writes to the
# `documents` table queried here - confirm against embedding_utils.
existing_docs = db.execute_query(
    "SELECT document_id FROM documents WHERE title = %s",
    (new_doc['title'],)
)

if existing_docs:
    # Reuse the stored row instead of creating a duplicate
    doc_id = existing_docs[0]['document_id']
    print(f"✓ Document already exists with ID: {doc_id}")
else:
    doc_id = store_document_with_embedding(
        db,
        title=new_doc['title'],
        content=new_doc['content'],
        doc_type=new_doc['doc_type'],
        metadata=new_doc['metadata']
    )
    print(f"✓ Added new document with ID: {doc_id}")

print(f"  Title: {new_doc['title']}")
print(f"  Title: {new_doc['title']}")

## 10. Build Simple RAG Context

In [None]:
def build_rag_context(query: str, max_docs: int = 3) -> str:
    """
    Assemble a text context block from documents similar to `query`.

    Calls `search_similar_documents` with the notebook-level `db`, passing
    `max_docs` as the result limit, and returns a "Relevant Information:"
    header followed by one numbered "title / content" entry per result.
    """
    hits = search_similar_documents(db, query, limit=max_docs)

    # Each hit unpacks as (doc_id, title, content, similarity); only the title
    # and content are written into the context text.
    entries = [
        f"{position}. {title}\n{content}\n\n"
        for position, (_doc_id, title, content, _similarity) in enumerate(hits, 1)
    ]

    return "Relevant Information:\n\n" + "".join(entries)

# Build the context for a sample question and show it between two rules
user_query = "What are best practices for text2sql?"
rag_context = build_rag_context(user_query)

banner = "=" * 60
print(f"User Query: {user_query}")
print("\n" + banner)
print("RAG Context (to be used in LLM prompt):")
print(banner)
print(rag_context)

## 11. Text Chunking Example

In [None]:
# Sample passage, repeated three times so it is long enough to split
long_text = """
Text2SQL is a technology that converts natural language queries into SQL statements.
It uses large language models to understand the user's intent and generate appropriate
database queries. The process typically involves: 1) Understanding the database schema,
2) Parsing the natural language query, 3) Mapping entities to database tables and columns,
4) Generating the SQL query, and 5) Validating and executing the query.

Best practices include providing clear schema information, using few-shot examples,
implementing proper error handling, and logging all queries for monitoring. It's also
important to validate generated SQL to prevent injection attacks and ensure query safety.

RAG can enhance text2sql by retrieving relevant schema information and examples.
This helps the model generate more accurate queries by providing additional context.
""" * 3  # Repeat to make it longer

# Split with the project helper, using chunk_size=200 and overlap=50
chunks = chunk_text(long_text, chunk_size=200, overlap=50)

print(f"Original text length: {len(long_text)} characters")
print(f"Number of chunks: {len(chunks)}")
print("\nChunks:")
for chunk_number, chunk in enumerate(chunks, start=1):
    # Show each chunk's length and only its first 100 characters
    preview = chunk[:100]
    print(f"\nChunk {chunk_number} ({len(chunk)} chars):")
    print(preview + "...")

## Summary

In this notebook, you learned:
- ✓ How to generate embeddings using sentence transformers
- ✓ How to store embeddings in PostgreSQL with pgvector
- ✓ How to perform semantic similarity search
- ✓ How to build a simple RAG system
- ✓ How to chunk text for processing

Next: Move to `03_text2sql_basic.ipynb` to start working with text2sql.