# Data Preparation Notebook

This notebook handles document loading, preprocessing, and vector index creation for the Maverick RAG system.

## Features
- Document loading from local files or Unity Catalog volumes
- Text chunking and preprocessing
- FAISS index creation (local development)
- Databricks vector search setup (enterprise deployment)
- Delta Lake table creation
- Configuration validation


## Setup and Configuration


In [None]:
# Import required libraries
import os
import sys
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

# Add src to path for imports.
# Notebook cells are routinely re-run; an unconditional append would add the
# same directory to sys.path again on every execution, so only add it once.
src_dir = str(Path.cwd().parent / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Load environment variables (python-dotenv; picks up a .env file when present).
# NOTE(review): presumably this has to run before the project imports below so
# that config_manager sees those values — confirm against utils.
load_dotenv()

# Import our modules.
# These come after the sys.path change above rather than at the top of the
# cell: they are expected to resolve from the "src" directory added there.
from utils import config_manager
from rag.ingest import load_documents, build_index, build_databricks_vector_index

# Only reached when every import above succeeded.
print("✅ Setup complete!")


## Configuration Validation


In [None]:
# Print the active configuration, one section at a time
config_summary = config_manager.get_config_summary()
print("📋 Current Configuration:")
for section_name, settings in config_summary.items():
    print(f"\n{section_name.upper()}:")
    for option, setting in settings.items():
        print(f"  {option}: {setting}")

# Run validation and report the overall verdict
validation = config_manager.validate_config()
verdict = '✅ Valid' if validation['valid'] else '❌ Invalid'
print(f"\n🔍 Configuration Validation: {verdict}")

# Errors first, then warnings; a heading appears only when there is
# something to list under it
for result_key, heading in (('errors', "\n❌ Errors:"), ('warnings', "\n⚠️ Warnings:")):
    messages = validation[result_key]
    if not messages:
        continue
    print(heading)
    for message in messages:
        print(f"  - {message}")


## Document Loading


In [None]:
# Read the source documents from the configured directory
docs_dir = config_manager.data.docs_dir
print(f"📁 Loading documents from: {docs_dir}")

try:
    documents = load_documents(docs_dir)
    print(f"✅ Loaded {len(documents)} documents")

    if not documents:
        print("⚠️ No documents found")
    else:
        # Preview at most the first five documents
        print("\n📄 Document Summary:")
        for position, document in enumerate(documents[:5], start=1):
            origin = document.metadata.get('source', 'Unknown')
            text = document.page_content
            # Mark the preview with an ellipsis only when it was cut short
            if len(text) > 100:
                snippet = text[:100] + "..."
            else:
                snippet = text
            print(f"  {position}. Source: {origin}")
            print(f"     Content: {snippet}")
            print(f"     Length: {len(text)} characters")
            print()

        # Mention how many documents were left out of the preview
        remaining = len(documents) - 5
        if remaining > 0:
            print(f"  ... and {remaining} more documents")

except Exception as exc:
    # Any failure while loading or previewing is reported, not raised
    print(f"❌ Error loading documents: {exc}")


## Local FAISS Index Creation


In [None]:
# Create FAISS index for local development.
# Runs only when Databricks mode is off: builds the index, loads it back from
# disk, and issues one sample query as a smoke test. Any failure along the way
# is printed rather than raised.
if not config_manager.data.use_databricks:
    print("🏗️ Creating FAISS index for local development...")

    try:
        index_path = build_index(
            docs_dir=config_manager.data.docs_dir,
            index_dir=config_manager.data.index_dir,
            use_databricks=False
        )
        print(f"✅ FAISS index created at: {index_path}")

        # Test the index.
        # Imported inside this branch, so these packages are only needed when
        # running in local mode.
        from langchain_community.vectorstores import FAISS
        from langchain_community.embeddings import HuggingFaceEmbeddings

        # NOTE(review): presumably this must be the same embedding model that
        # build_index used, or the query vectors will not match — confirm.
        embeddings = HuggingFaceEmbeddings(
            model_name=config_manager.embedding.model_name
        )

        # SECURITY: allow_dangerous_deserialization=True lets load_local
        # unpickle the stored docstore. That is acceptable only because the
        # index is expected to be the one this cell just built; never point
        # index_dir at an index obtained from an untrusted source.
        store = FAISS.load_local(
            config_manager.data.index_dir,
            embeddings,
            allow_dangerous_deserialization=True
        )

        print(f"📊 Index contains {store.index.ntotal} vectors")

        # Test similarity search
        test_query = "What is the project charter about?"
        top_k = 3  # single source of truth for both the query and the heading
        preview_chars = 150
        results = store.similarity_search(test_query, k=top_k)

        print(f"\n🔍 Test query: '{test_query}'")
        print(f"📄 Top {top_k} results:")
        for rank, result in enumerate(results, start=1):
            text = result.page_content
            # Append an ellipsis only when the text was actually truncated,
            # matching how the document-loading preview behaves.
            if len(text) > preview_chars:
                preview = text[:preview_chars] + "..."
            else:
                preview = text
            print(f"  {rank}. {preview}")

    except Exception as e:
        print(f"❌ Error creating FAISS index: {e}")
else:
    print("⏭️ Skipping FAISS index creation (Databricks mode enabled)")
