# Pipeline Stage 2: Vector Database Population

This notebook is the second stage of the Elyra pipeline. It handles:
1. Loading processed chunks from the previous stage
2. Creating embeddings using local models
3. Populating LanceDB vector database
4. Creating searchable indexes
5. Preparing data for graph database integration

## Environment Setup

In [None]:
# Install required packages for vector processing
!pip install lancedb
!pip install sentence-transformers
!pip install pyarrow
!pip install pandas

import json
import pandas as pd
import lancedb
from sentence_transformers import SentenceTransformer
from pathlib import Path
from datetime import datetime
import numpy as np
from typing import List, Dict, Any

# Log the wall-clock start time of this stage and confirm that every import
# above resolved (this line is only reached if none of them raised).
print(f"Vector processing started at: {datetime.now()}")
print("✅ All required packages imported")

## Load Previous Stage Data

In [None]:
# Load data from previous pipeline stage
INPUT_DIR = Path("pipeline_outputs")
VECTOR_DB_DIR = Path("vector_db")
VECTOR_DB_DIR.mkdir(exist_ok=True)

# The JSON inputs are decoded as UTF-8 explicitly so that reading them does
# not depend on the platform's locale encoding.
# NOTE(review): assumes stage 1 wrote UTF-8 (or the pure-ASCII output that
# json.dump produces by default) - confirm against the stage 1 notebook.

# Load vector-ready chunks
vector_chunks_file = INPUT_DIR / "vector_ready_chunks.json"
with open(vector_chunks_file, 'r', encoding='utf-8') as f:
    vector_chunks = json.load(f)

# Load pipeline metadata
metadata_file = INPUT_DIR / "pipeline_metadata.json"
with open(metadata_file, 'r', encoding='utf-8') as f:
    pipeline_metadata = json.load(f)

print(f"📥 Loaded {len(vector_chunks)} chunks from previous stage")
print(f"📄 Source documents: {pipeline_metadata['successful_documents']}")
print(f"📊 Total tokens: {pipeline_metadata['total_tokens']:,}")

# Show sample chunk (skipped when the previous stage produced no chunks)
if vector_chunks:
    print(f"\n📋 Sample chunk structure:")
    sample = vector_chunks[0]
    print(f"  ID: {sample['id']}")
    print(f"  Content length: {len(sample['content'])} chars")
    print(f"  Document type: {sample['metadata']['document_type']}")
    print(f"  Element path: {sample['metadata']['element_path']}")

## Initialize Embedding Model

In [None]:
# Embedding model setup.
# A lightweight sentence-transformer is used; it works well for technical documents.
model_name = "all-MiniLM-L6-v2"  # 384 dimensions, good performance/size ratio

print(f"🤖 Loading embedding model: {model_name}")
embedding_model = SentenceTransformer(model_name)

# Smoke test: encode one sentence so a broken model fails here, and so the
# embedding size can be reported from a real output vector.
test_embedding = embedding_model.encode("This is a test sentence.")
print("✅ Model loaded successfully")
print(f"📏 Embedding dimension: {len(test_embedding)}")
print(f"🎯 Model max sequence length: {embedding_model.max_seq_length}")

## Create Embeddings

In [None]:
def create_embeddings_batch(chunks: List[Dict], batch_size: int = 32) -> List[Dict]:
    """Create embeddings for chunks in batches for efficiency.

    Reads the notebook-level ``embedding_model`` (used to encode the text) and
    ``model_name`` (recorded on every output chunk).

    Args:
        chunks: Chunk dicts; each must have a 'content' key holding the text
            to embed. The input dicts are not modified.
        batch_size: Number of chunks encoded per model call. Must be >= 1.

    Returns:
        A new list with one dict per input chunk, in input order. Each dict
        carries all keys of the original chunk plus 'vector' (the embedding
        as a plain list), 'embedding_model' and 'vector_created_at' (ISO
        timestamp).

    Raises:
        ValueError: If ``batch_size`` is less than 1. A negative step would
            otherwise make the batch loop empty and silently return no
            embeddings at all.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"🔄 Creating embeddings for {len(chunks)} chunks...")

    enriched_chunks = []
    # Ceiling division; computed once because it does not change per batch.
    total_batches = (len(chunks) + batch_size - 1) // batch_size

    for batch_number, start in enumerate(range(0, len(chunks), batch_size), start=1):
        batch = chunks[start:start + batch_size]
        batch_texts = [chunk['content'] for chunk in batch]

        # Forward batch_size so the model processes the whole slice in one
        # pass instead of re-splitting it with its own default batch size.
        batch_embeddings = embedding_model.encode(
            batch_texts,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True
        )

        # Pair each chunk with its embedding row and build the enriched copy.
        for chunk, embedding in zip(batch, batch_embeddings):
            enriched_chunks.append({
                **chunk,
                'vector': embedding.tolist(),  # Convert to list for JSON serialization
                'embedding_model': model_name,
                'vector_created_at': datetime.now().isoformat()
            })

        print(f"  ✅ Processed batch {batch_number}/{total_batches}")

    return enriched_chunks

# Create embeddings for all chunks loaded from the previous stage, using the
# function's default batch size. The result is kept in `enriched_chunks`,
# which the later cells read.
enriched_chunks = create_embeddings_batch(vector_chunks)
print(f"\n🎉 Created embeddings for {len(enriched_chunks)} chunks")

## Initialize LanceDB

In [None]:
# Initialize LanceDB
# The database lives in a sub-directory of VECTOR_DB_DIR; the path is passed
# to lancedb.connect as a string.
db_path = VECTOR_DB_DIR / "xml_analysis_db"
db = lancedb.connect(str(db_path))

print(f"🗃️ Connected to LanceDB at: {db_path}")

# Prepare data for LanceDB (requires specific format)
def prepare_lancedb_data(chunks: List[Dict]) -> List[Dict]:
    """Flatten enriched chunks into one flat record per chunk for LanceDB.

    Each chunk's nested 'metadata' dict is lifted to top-level fields so the
    values can be queried directly. The 'ai_use_cases' list is collapsed into
    a single comma-separated string.

    Args:
        chunks: Enriched chunk dicts with 'id', 'content', 'vector',
            'metadata', 'embedding_model' and 'vector_created_at' keys.

    Returns:
        A list of flat dicts, one per chunk, in input order.
    """

    def flatten(item: Dict) -> Dict:
        """Build the flat record for a single enriched chunk."""
        meta = item['metadata']
        return {
            'id': item['id'],
            'content': item['content'],
            'vector': item['vector'],

            # Document-level fields
            'source_file': meta['source_file'],
            'document_type': meta['document_type'],
            'confidence': meta['confidence'],

            # Chunk-level fields
            'chunk_id': meta['chunk_id'],
            'element_path': meta['element_path'],
            'token_estimate': meta['token_estimate'],

            # Stored as a plain string rather than a list, for simplicity
            'ai_use_cases': ', '.join(meta['ai_use_cases']),

            # Provenance of the embedding
            'embedding_model': item['embedding_model'],
            'vector_created_at': item['vector_created_at']
        }

    return [flatten(item) for item in chunks]

# Flatten the enriched chunks into LanceDB records
lancedb_data = prepare_lancedb_data(enriched_chunks)
print(f"📊 Prepared {len(lancedb_data)} records for LanceDB")

# Preview the first record's field names and vector size (if there is one)
if lancedb_data:
    sample_record = lancedb_data[0]
    print(f"\n📋 Sample record keys: {list(sample_record)}")
    print(f"🔢 Vector dimension: {len(sample_record['vector'])}")

## Populate Vector Database

In [None]:
# Create table in LanceDB
table_name = "xml_documents"

# mode="overwrite" replaces a table left over from an earlier run, so no
# separate drop_table call (and no blanket exception swallowing) is needed.
print(f"📝 Creating table: {table_name}")
table = db.create_table(table_name, lancedb_data, mode="overwrite")

print(f"✅ Table created with {len(lancedb_data)} records")
print(f"📊 Table schema: {table.schema}")

# Create vector index for faster similarity search.
# The first positional parameter of Table.create_index is the distance
# metric, not the column, so the column is named by keyword here.
# NOTE(review): the default IVF_PQ index is expected to need at least 256
# vectors to train; below that the index is skipped and searches use
# LanceDB's exhaustive scan - confirm the threshold for the installed version.
MIN_ROWS_FOR_VECTOR_INDEX = 256
if len(lancedb_data) >= MIN_ROWS_FOR_VECTOR_INDEX:
    print(f"🔍 Creating vector index...")
    table.create_index(vector_column_name="vector")
    print(f"✅ Vector index created")
else:
    print(
        f"ℹ️ Skipping vector index: {len(lancedb_data)} records is fewer than "
        f"{MIN_ROWS_FOR_VECTOR_INDEX}; searches will use an exhaustive scan"
    )

## Test Vector Search

In [None]:
def test_vector_search(query: str, limit: int = 5) -> pd.DataFrame:
    """Run a similarity search for a free-text query against the table.

    Uses the notebook-level ``embedding_model`` to embed the query and the
    notebook-level ``table`` to search.

    Args:
        query: Free-text query to embed.
        limit: Maximum number of rows to return.

    Returns:
        The search results converted to a pandas DataFrame.
    """
    # The query is embedded with the same model used for the stored chunks.
    embedded_query = embedding_model.encode(query)
    search = table.search(embedded_query).limit(limit)
    return search.to_pandas()

# Sample queries used to smoke-test the search
test_queries = [
    "geographic coordinates and location data",
    "security vulnerabilities and compliance",
    "XML schema and document structure"
]

print("🔍 Testing vector search functionality:")
print("=" * 50)

for query in test_queries:
    print(f"\n🔎 Query: '{query}'")
    results = test_vector_search(query, limit=3)

    # Nothing matched: report it and move on to the next query
    if len(results) == 0:
        print("❌ No results found")
        continue

    print(f"📊 Found {len(results)} results:")
    for i, row in results.iterrows():
        print(f"  {i+1}. {row['source_file']} | {row['document_type']} | Score: {row['_distance']:.3f}")
        print(f"     Content: {row['content'][:100]}...")

print("\n✅ Vector search testing completed")

## Prepare Graph Database Data

In [None]:
def extract_graph_relationships(chunks: List[Dict]) -> Dict[str, List]:
    """Derive graph nodes and relationships from enriched chunks.

    Produces one 'Document' node per distinct source file and one 'Chunk'
    node per chunk, a 'CONTAINS' relationship from each document to each of
    its chunks, and a 'SIMILAR_TYPE' relationship for every pair of documents
    that share a document type.

    Document-level properties are taken from the first chunk seen for that
    source file.

    Args:
        chunks: Chunk dicts with 'id', 'content' and a 'metadata' dict.

    Returns:
        A dict with two lists: 'nodes' and 'relationships'.
    """
    # Bucket chunks by source file; dict order keeps first-seen file order.
    chunks_by_source = {}
    for item in chunks:
        chunks_by_source.setdefault(item['metadata']['source_file'], []).append(item)

    nodes = []
    relationships = []
    documents_by_type = {}

    for source_name, members in chunks_by_source.items():
        head_meta = members[0]['metadata']
        document = {
            'id': f"doc_{source_name}",
            'type': 'Document',
            'properties': {
                'name': source_name,
                'document_type': head_meta['document_type'],
                'confidence': head_meta['confidence'],
                'ai_use_cases': head_meta['ai_use_cases'],
                'total_chunks': len(members)
            }
        }
        nodes.append(document)
        # Remember the document under its type for the pairing pass below.
        documents_by_type.setdefault(head_meta['document_type'], []).append(document)

        for item in members:
            meta = item['metadata']
            nodes.append({
                'id': item['id'],
                'type': 'Chunk',
                'properties': {
                    'chunk_id': meta['chunk_id'],
                    'element_path': meta['element_path'],
                    'token_estimate': meta['token_estimate'],
                    'content_length': len(item['content'])
                }
            })
            relationships.append({
                'from': document['id'],
                'to': item['id'],
                'type': 'CONTAINS',
                'properties': {
                    'element_path': meta['element_path']
                }
            })

    # Link every unordered pair of documents that share a type (earlier -> later).
    for shared_type, peers in documents_by_type.items():
        for position, left in enumerate(peers):
            for right in peers[position + 1:]:
                relationships.append({
                    'from': left['id'],
                    'to': right['id'],
                    'type': 'SIMILAR_TYPE',
                    'properties': {
                        'document_type': shared_type
                    }
                })

    return {
        'nodes': nodes,
        'relationships': relationships
    }

# Build the graph nodes/relationships from the enriched chunks
graph_data = extract_graph_relationships(enriched_chunks)

print("🕸️ Graph database preparation:")
print(f"  📊 Nodes: {len(graph_data['nodes'])}")
print(f"  🔗 Relationships: {len(graph_data['relationships'])}")

# Tally nodes per type (kept as a plain dict; it is saved to JSON later)
node_types = {}
for node in graph_data['nodes']:
    node_types[node['type']] = node_types.get(node['type'], 0) + 1

print("\n📋 Node types:")
for type_label, type_total in node_types.items():
    print(f"  • {type_label}: {type_total}")

# Tally relationships per type (same plain-dict form)
rel_types = {}
for rel in graph_data['relationships']:
    rel_types[rel['type']] = rel_types.get(rel['type'], 0) + 1

print("\n🔗 Relationship types:")
for type_label, type_total in rel_types.items():
    print(f"  • {type_label}: {type_total}")

## Save Pipeline Outputs

In [None]:
# Persist the enriched chunks (including their vectors)
enriched_chunks_file = INPUT_DIR / "enriched_chunks_with_vectors.json"
with open(enriched_chunks_file, 'w') as f:
    json.dump(enriched_chunks, f, indent=2)

print(f"💾 Saved enriched chunks to: {enriched_chunks_file}")

# Persist the graph nodes/relationships for the next stage
graph_data_file = INPUT_DIR / "graph_data.json"
with open(graph_data_file, 'w') as f:
    json.dump(graph_data, f, indent=2)

print(f"💾 Saved graph data to: {graph_data_file}")

# Vector size is read from the first record; 0 when there are no records
if lancedb_data:
    vector_dimension = len(lancedb_data[0]['vector'])
else:
    vector_dimension = 0

# Describe where the vector database lives and what it contains
vector_db_info = {
    'database_path': str(db_path),
    'table_name': table_name,
    'total_records': len(lancedb_data),
    'embedding_model': model_name,
    'vector_dimension': vector_dimension,
    'created_at': datetime.now().isoformat(),
    # The schema is only recorded if the table-creation cell has been run
    'schema': str(table.schema) if 'table' in locals() else None
}

vector_db_info_file = INPUT_DIR / "vector_db_info.json"
with open(vector_db_info_file, 'w') as f:
    json.dump(vector_db_info, f, indent=2)

print(f"💾 Saved vector DB info to: {vector_db_info_file}")

# Extend the stage-1 metadata with this stage's results
pipeline_metadata['vector_processing'] = {
    'processed_at': datetime.now().isoformat(),
    'embedding_model': model_name,
    'vector_dimension': vector_dimension,
    'total_vectors': len(enriched_chunks),
    'database_path': str(db_path),
    'table_name': table_name
}
pipeline_metadata['graph_preparation'] = {
    'total_nodes': len(graph_data['nodes']),
    'total_relationships': len(graph_data['relationships']),
    'node_types': node_types,
    'relationship_types': rel_types
}

# Written to a new file, so the stage-1 metadata file is left untouched
updated_metadata_file = INPUT_DIR / "pipeline_metadata_updated.json"
with open(updated_metadata_file, 'w') as f:
    json.dump(pipeline_metadata, f, indent=2)

print(f"💾 Updated pipeline metadata: {updated_metadata_file}")

print("\n🎉 Vector database population stage completed successfully!")
print(f"📊 Vector database contains {len(lancedb_data)} searchable documents")
print(f"🕸️ Graph data prepared with {len(graph_data['nodes'])} nodes")
print("Ready for next pipeline stage: Graph Database Population & RAG")