In [None]:
# Path setup to resolve package imports (dynamic project root)
import sys, os

# The notebook is expected to run from <project>/notebook, so the parent of
# the current working directory is taken as the project root.
notebook_dir = os.getcwd()
project_root = os.path.split(notebook_dir)[0]

# Put the project root at the front of the import path, but only once.
if sys.path.count(project_root) == 0:
    sys.path[:0] = [project_root]

In [None]:
# Cell X: FETCH → BUILD → STORE → QUERY COMPARE (Separate DBs per model)

import time
from langchain_openai import OpenAIEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from src.db.db_schema_wrapper import db_schema_wrapper

print("üîÑ FETCH ‚Üí BUILD ‚Üí STORE ‚Üí QUERY COMPARE (Separate DBs)")
print("=" * 90)

# 1. FETCH (unchanged)
def fetch_schema_documents(table_names=None):
    """Fetch one schema ``Document`` per database table.

    Args:
        table_names: Tables to fetch, either as an iterable of names or as a
            single comma-separated string. When ``None``, the names are taken
            from ``db_schema_wrapper.get_usable_table_names()``.

    Returns:
        A list with one ``Document`` per table. ``page_content`` is whatever
        ``db_schema_wrapper.get_table_info([table])`` returns, and the
        metadata holds the table name plus ``"type": "schema"`` and
        ``"source": "db_wrapper"``.
    """
    if table_names is None:
        table_names = db_schema_wrapper.get_usable_table_names()

    # A comma-separated string is split here; splitting on "," (not ", ")
    # means "a,b" and "a, b" are both handled.
    if isinstance(table_names, str):
        table_names = table_names.split(",")

    # Strip each name and drop blanks, so the name used for the lookup is
    # the same one stored in the metadata and an empty string from the
    # wrapper does not produce a bogus "" table.
    table_names = [name.strip() for name in table_names if name and name.strip()]

    print(f"üì• Fetching schemas for {len(table_names)} tables...")
    schema_docs = []

    for table in table_names:
        schema_text = db_schema_wrapper.get_table_info([table])
        schema_docs.append(
            Document(
                page_content=schema_text,
                metadata={"table": table, "type": "schema", "source": "db_wrapper"},
            )
        )
        print(f"  ‚ûï {table}: {len(schema_text)} chars")

    print(f"‚úÖ Fetched {len(schema_docs)} schema documents")
    return schema_docs

# 2. BUILD & STORE separate DBs for each model
def build_and_store_model_dbs(schema_docs, models_config):
    """Build and save a separate FAISS vector DB for each embedding model.

    The schema documents are split once (``chunk_size=800``,
    ``chunk_overlap=100``) and the same chunks are indexed with every model,
    so the resulting DBs differ only in the embeddings used.

    Args:
        schema_docs: Documents to split and index.
        models_config: Mapping of display name to a
            ``(embeddings_class, model_id, folder)`` tuple. The class is
            instantiated as ``embeddings_class(model=model_id)`` and the
            index is saved to ``folder``.

    Returns:
        A dict mapping each display name to a dict with the keys ``"db"``,
        ``"embeddings"``, ``"folder"``, ``"chunks"`` (``db.index.ntotal``)
        and ``"build_time"`` (seconds spent in ``FAISS.from_documents``).

    Raises:
        ValueError: If there are models to build but splitting produced no
            chunks, since an index cannot be built from nothing.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    split_docs = splitter.split_documents(schema_docs)

    # Fail early with a clear message instead of an opaque error from deep
    # inside the vector store when there is nothing to embed.
    if models_config and not split_docs:
        raise ValueError("No document chunks to index: schema_docs produced no text")

    model_dbs = {}

    print(f"\nüèóÔ∏è  Building {len(models_config)} separate vector DBs...")

    for model_name, (embeddings_class, model_id, folder) in models_config.items():
        print(f"\nüî® {model_name}...")

        # Every supported embeddings class takes the model id the same way,
        # so no per-class branching is needed.
        embeddings = embeddings_class(model=model_id)

        # Time only the index build; perf_counter is monotonic, so the
        # measurement is not affected by system clock adjustments.
        start_time = time.perf_counter()
        db = FAISS.from_documents(split_docs, embeddings)
        build_time = time.perf_counter() - start_time

        # Persist each model's index in its own folder.
        db.save_local(folder)
        chunk_count = db.index.ntotal
        model_dbs[model_name] = {
            "db": db,
            "embeddings": embeddings,
            "folder": folder,
            "chunks": chunk_count,
            "build_time": build_time,
        }

        print(f"   ‚úÖ Saved {chunk_count} chunks ‚Üí {folder}/ ({build_time:.2f}s)")

    return model_dbs

# 3. LOAD & QUERY COMPARE (using saved DBs)
def query_compare_saved_dbs(model_dbs, test_queries, k=3):
    """Run every test query against every model's vector DB and summarise.

    Args:
        model_dbs: Mapping of model name to a dict whose ``"db"`` entry
            exposes ``similarity_search_with_score(query, k=...)`` returning
            ``(document, score)`` pairs.
        test_queries: Sized collection of query strings.
        k: Maximum number of hits requested per query.

    Returns:
        A dict mapping each model name to::

            {
                "avg_query_time": mean seconds per query,
                "queries": {query: {"best_score", "avg_score", "top_tables",
                                    "top_docs", "query_time"}},
                "overall_best": lowest "best_score" over all queries,
                "overall_avg": mean of the per-query "avg_score" values,
            }

        ``avg_score`` is the mean over the hits actually returned, which may
        be fewer than ``k``. A query with no hits gets the placeholder score
        999 for both ``best_score`` and ``avg_score``. With no queries at
        all, ``avg_query_time`` is 0.0 and both overall scores are 999.
    """
    # Placeholder for "no hits". It is deliberately large: results are
    # ranked with lower scores first, so an empty result must not look good.
    no_result_score = 999

    print(f"\nüîç Query comparison ({len(test_queries)} queries)...")

    results = {}

    for model_name, data in model_dbs.items():
        print(f"\n‚ö° Querying {model_name} DB...")

        # NOTE: the in-memory index is queried here. To query the saved copy
        # instead, reload it with FAISS.load_local(data["folder"],
        # data["embeddings"], allow_dangerous_deserialization=True).
        db = data["db"]

        query_results = {}
        total_query_time = 0.0

        for query in test_queries:
            # Time only the search call itself.
            q_start = time.perf_counter()
            docs_scores = db.similarity_search_with_score(query, k=k)
            q_time = time.perf_counter() - q_start

            scores = [score for _, score in docs_scores]
            if scores:
                best_score = scores[0]
                # Divide by the number of hits returned, not by k: a small
                # index can return fewer than k results.
                avg_score = sum(scores) / len(scores)
            else:
                best_score = avg_score = no_result_score

            query_results[query] = {
                "best_score": best_score,
                "avg_score": avg_score,
                "top_tables": [doc.metadata.get("table", "?") for doc, _ in docs_scores],
                "top_docs": [doc.page_content[:60] + "..." for doc, _ in docs_scores],
                "query_time": q_time,
            }
            total_query_time += q_time

        if query_results:
            # Time is averaged over every call made; scores are averaged over
            # the distinct queries stored (duplicates collapse in the dict).
            avg_query_time = total_query_time / len(test_queries)
            per_query = query_results.values()
            overall_best = min(entry["best_score"] for entry in per_query)
            overall_avg = sum(entry["avg_score"] for entry in per_query) / len(query_results)
        else:
            avg_query_time = 0.0
            overall_best = overall_avg = no_result_score

        results[model_name] = {
            "avg_query_time": avg_query_time,
            "queries": query_results,
            "overall_best": overall_best,
            "overall_avg": overall_avg,
        }

        print(f"   ‚è±Ô∏è Avg query: {avg_query_time:.3f}s | Overall: {results[model_name]['overall_avg']:.3f}")

    return results

# 4. MAIN EXECUTION

# Natural-language questions used to probe every vector DB.
test_queries = [
    "Show top customers by sales",
    "employees in sales department",
    "monthly revenue by region",
    "customer lifetime value",
    "active orders this quarter",
]

# Candidate models: display name → (embeddings class, model id, output folder)
models_config = {
    "üîµ OpenAI-3-small": (
        OpenAIEmbeddings,
        "text-embedding-3-small",
        "db_openai_small",
    ),
    "üü¢ Ollama-nomic": (
        OllamaEmbeddings,
        "nomic-embed-text",
        "db_ollama_nomic",
    ),
    # Disabled candidate:
    # "üü¢ Ollama-mxbai": (OllamaEmbeddings, "mxbai-embed-large", "db_ollama_mxbai"),
}

# Stage 1: pull one schema document per table.
schema_docs = fetch_schema_documents()

# Stage 2: index those documents once per model and save each index.
model_dbs = build_and_store_model_dbs(schema_docs, models_config)

# Stage 3: run the test queries against every index that was just built.
query_results = query_compare_saved_dbs(model_dbs, test_queries)

# 5. RANKING TABLE
# Models are listed from lowest to highest overall average score, and the
# first one is reported as the winner.
rule = "=" * 110
print("\n" + rule)
print("üèÜ PRODUCTION QUERY RANKING (Lower scores = BETTER relevance)")
print(rule)

header_cells = (
    "Model".ljust(20),
    "Overall Avg".ljust(12),
    "Best Query".ljust(12),
    "Query Time".ljust(12),
    "Top Tables",
)
print(" ".join(header_cells))
print("-" * 110)

ranked = sorted(query_results.items(), key=lambda entry: entry[1]["overall_avg"])
medals = ["ü•á", "ü•à", "ü•â"]

for position, (model_name, stats) in enumerate(ranked):
    # Only the first three places get a medal.
    if position < 3:
        medal = medals[position]
    else:
        medal = ""

    # Show the two top-ranked tables for the first test query as a sample.
    first_query_hits = stats["queries"][test_queries[0]]["top_tables"]
    sample_tables = ", ".join(first_query_hits[:2])

    row_cells = [
        medal,
        format(model_name, "<18"),
        format(stats["overall_avg"], "<11.3f"),
        format(stats["overall_best"], "<11.3f"),
        format(stats["avg_query_time"], "<11.3f") + "s",
        sample_tables,
    ]
    print(" ".join(row_cells))

winner_name = ranked[0][0]
winner_folder = model_dbs[winner_name]["folder"]
print(f"\nüéâ PRODUCTION WINNER: {winner_name}")
print(f"   Use folder: {winner_folder}/")

# 6. LIST SAVED DB FOLDERS
# Nothing is written here; this only reports each folder and its chunk count.
print("\nüíæ Saved DB folders:")
for name, data in model_dbs.items():
    folder, chunk_count = data["folder"], data["chunks"]
    print(f"   {folder}/ ‚Üê {chunk_count} chunks ({name})")