In [None]:
# Path setup to resolve package imports (dynamic project root)
import sys, os

# The notebook is expected to run from <project>/notebook, so the project
# root is taken to be the parent of the current working directory.
notebook_dir = os.getcwd()
project_root = os.path.split(notebook_dir)[0]

# Prepend the root only when it is missing, so re-running the cell does not
# stack duplicate entries on sys.path.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [None]:
# Cell 1: VECTOR DB SETUP + INDEXING
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from src.db.db_schema_wrapper import db_schema_wrapper
from langchain_core.messages import HumanMessage

print("üóÉÔ∏è Setting up Vector DB for schemas...")

# 1. Extract ALL schema info
# NOTE(review): assumes get_usable_table_names() returns a ", "-joined string
# (that is what the .split(", ") implies) — confirm against db_schema_wrapper.
# Blank entries are dropped because "".split(", ") yields [""], which would
# otherwise be indexed as a table with an empty name.
all_tables = [
    name.strip()
    for name in db_schema_wrapper.get_usable_table_names().split(", ")
    if name.strip()
]
if not all_tables:
    # Fail here with a clear message instead of deep inside FAISS/embeddings.
    raise RuntimeError("No usable tables found; cannot build the schema vector DB.")
print(f"üìä Indexing {len(all_tables)} tables...")

# 2. Get schema docs for ALL tables: one Document per table, tagged with the
#    table name so search hits can be mapped back to their table.
schema_docs = [
    Document(
        page_content=db_schema_wrapper.get_table_info([table]),
        metadata={"table": table, "type": "schema"},
    )
    for table in all_tables
]

# 3. Create embeddings + split long schemas
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)

# One table may produce several chunks; each chunk is indexed separately.
split_docs = splitter.split_documents(schema_docs)

# 4. Build vector store
vector_db = FAISS.from_documents(split_docs, embeddings)
print(f"‚úÖ Vector DB ready: {vector_db.index.ntotal} chunks")

# Save for reuse (path is relative to the current working directory)
vector_db.save_local("schema_vector_db")
print("üíæ Saved to schema_vector_db/")

In [None]:
from src.agents.tools.db_tools import db_tool_manager
# Smoke-test the tool manager: fetch its tools, then report how many there
# are and what they are called.
tools = db_tool_manager.get_tools()
print(f"‚úÖ {len(tools)} tools ready")
tool_names = [registered.name for registered in tools]
print("Tool names:", tool_names)

In [None]:
# Cell 2: VECTOR SEARCH TOOLS
from langchain.tools import tool

# NOTE: the docstring below is left unchanged on purpose — the @tool decorator
# uses it as the tool description shown to the model.
#
# query: free-text description of the data being looked for.
# k:     number of schema chunks to retrieve from the vector store.
# Returns a newline-separated listing of "<table> (similarity: <score>)".
@tool
def semantic_table_search(query: str, k: int = 3) -> str:
    """Find most relevant tables by semantic search on schemas"""
    print(f"üîç Semantic search: '{query}'")

    # similarity_search() does not store a score in doc.metadata, so reading
    # metadata['score'] always fell back to the default and reported 0.50 for
    # every hit. Ask the store for scored results instead.
    scored_docs = vector_db.similarity_search_with_relevance_scores(query, k=k)

    # Schemas are chunked before indexing, so several hits can belong to the
    # same table; report each table once with its best score.
    best_by_table = {}
    for doc, score in scored_docs:
        table = doc.metadata['table']
        if table not in best_by_table or score > best_by_table[table]:
            best_by_table[table] = score

    results = [
        f"{table} (similarity: {score:.2f})"
        for table, score in best_by_table.items()
    ]

    return f"Top {k} tables for '{query}':\n" + "\n".join(results)

# NOTE: the docstring below is left unchanged on purpose — the @tool decorator
# uses it as the tool description shown to the model.
#
# query: free-text description of the data being looked for.
# Returns whatever db_schema_wrapper.get_table_info() produces for the tables
# behind the two closest schema chunks.
@tool
def get_schema_by_semantic_search(query: str) -> str:
    """Get schema for semantically relevant tables"""
    print(f"üìã Semantic schema search: '{query}'")

    docs = vector_db.similarity_search(query, k=2)

    # Both chunks can come from the same table; dict.fromkeys removes the
    # duplicate while keeping the ranking order.
    tables = list(dict.fromkeys(doc.metadata['table'] for doc in docs))

    return db_schema_wrapper.get_table_info(tables)

# Add to your existing tools
# VECTOR_TOOLS holds the two semantic-search tools defined in this cell.
VECTOR_TOOLS = [semantic_table_search, get_schema_by_semantic_search]
# ALL_TOOLS is the tool-manager list followed by the vector tools.
# NOTE(review): `+` assumes `tools` (from db_tool_manager.get_tools()) is a list — confirm.
ALL_TOOLS = tools + VECTOR_TOOLS  # Combine with your SQL tools

In [None]:
# Cell 3: VECTOR-POWERED AGENT
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from src.config.prompt import system_prompt

# Vector-powered agent
vector_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
# Checkpointer handed to the agent; runs are addressed by the thread_id
# supplied in the config at call time.
vector_checkpointer = MemorySaver()

vector_agent = create_agent(
    model=vector_llm,
    tools=ALL_TOOLS,
    # A blank line separates the project prompt from the addendum; without it
    # the addendum was fused onto the last line of system_prompt.
    system_prompt=system_prompt + "\n\n" + """ü§ñ VECTOR + SQL AGENT

NEW TOOLS:
1Ô∏è‚É£ semantic_table_search("sales customers") ‚Üí Find relevant tables
2Ô∏è‚É£ get_schema_by_semantic_search("sales customers") ‚Üí Get schemas semantically

Then use regular SQL tools. ALWAYS start with semantic search!""",
    checkpointer=vector_checkpointer
)

In [None]:
# Cell 4: VECTOR SEARCH TEST
print("üß™ Vector Search Examples")
print("=" * 50)

# Run the table-search tool once per sample query, numbering the examples.
for example_no, example_query in enumerate(
    ("sales customers", "employee department"), start=1
):
    print(f"\n{example_no}. üîç '{example_query}':")
    print(semantic_table_search.invoke({"query": example_query, "k": 3}))

# Last example: fetch the schema text itself rather than table names.
print("\n3. üìã Full schema by similarity:")
print(get_schema_by_semantic_search.invoke({"query": "sales customers"}))

In [None]:
# Cell 5: VECTOR AGENT STREAMING TEST
from langchain_core.messages import AIMessage  # Add this import

def stream_vector_agent(question, thread_id="vector_test"):
    """Stream one question through ``vector_agent`` and print its progress.

    Args:
        question: Natural-language question, sent as a single HumanMessage.
        thread_id: Thread identifier placed in the run config. Defaults to
            "vector_test", the value that used to be hard-coded.
            NOTE(review): calls that share a thread_id presumably share
            checkpointed conversation history — pass a fresh id for an
            independent run.

    Returns:
        None. Tool calls and assistant text are printed as they arrive.
    """
    config = {"configurable": {"thread_id": thread_id}}

    print(f"\nüöÄ Vector Agent: '{question}'")
    print("-" * 60)

    for chunk in vector_agent.stream(
        {"messages": [HumanMessage(content=question)]},
        config,
        stream_mode="values"
    ):
        # Skip chunks with no messages (missing key or empty list) instead of
        # indexing [-1] into an empty list.
        if not chunk.get("messages"):
            continue
        msg = chunk["messages"][-1]

        # Only AIMessage is inspected; other message types are skipped.
        if not isinstance(msg, AIMessage):
            continue

        # Report every tool call in the message, not just the first one.
        for call in msg.tool_calls:
            print(f"üü° {call['name']}: {call['args']}")
        if msg.content:
            print(msg.content, end="", flush=True)

    print("\n‚úÖ Complete!")
# Exercise the natural language -> vector -> SQL flow with two sample questions.
for demo_question in (
    "Show top customers by sales",
    "How many employees in sales department?",
):
    stream_vector_agent(demo_question)

In [None]:
# See ALL documents in your vectorstore
# Read the stored documents straight from the FAISS docstore, in index order.
# The previous similarity_search("anything", k=1000) spent an embedding
# request on a dummy query, silently capped the listing at 1000 chunks, and
# ordered the output by similarity to that dummy query.
all_docs = [
    vector_db.docstore.search(doc_id)
    for _, doc_id in sorted(vector_db.index_to_docstore_id.items())
]
for i, doc in enumerate(all_docs):
    # Only the first 500 characters of each chunk are shown.
    print(f"{i+1}. {doc.page_content[:500]}...")
    print(f"   Metadata: {doc.metadata}")
    print()