# Pipeline Stage 3: Graph Database & RAG System

This notebook is the final stage of the Elyra pipeline. It handles:
1. Loading vector and graph data from previous stages
2. Populating Memgraph database with relationships
3. Creating a unified RAG system combining vector and graph search
4. Demonstrating intelligent document Q&A
5. Generating insights and recommendations

## Environment Setup

In [None]:
# Install required packages for graph and RAG processing
%pip install pymgclient --upgrade -q --force-reinstall --no-cache-dir
%pip install lancedb --upgrade -q --force-reinstall --no-cache-dir
%pip install sentence-transformers --upgrade -q --force-reinstall --no-cache-dir
%pip install pandas --upgrade -q --force-reinstall --no-cache-dir

import json
import pandas as pd
import lancedb
from sentence_transformers import SentenceTransformer
from pathlib import Path
from datetime import datetime
import numpy as np
from typing import List, Dict, Any, Tuple
import mgclient

# Record when this stage started. These lines are only reached if every
# import above succeeded.
print(f"Graph and RAG processing started at: {datetime.now()}")
print("✅ All required packages imported")

## Load Previous Stage Data

In [None]:
# Load data from previous pipeline stages
INPUT_DIR = Path("pipeline_outputs")
# NOTE(review): VECTOR_DB_DIR is not referenced in this notebook section; the
# database path actually used comes from vector_db_info['database_path'] below.
VECTOR_DB_DIR = Path("vector_db")

# The JSON artifacts are read explicitly as UTF-8 so decoding does not depend
# on the platform's default locale encoding.

# Load graph data (expects top-level 'nodes' and 'relationships' lists)
graph_data_file = INPUT_DIR / "graph_data.json"
with open(graph_data_file, 'r', encoding='utf-8') as f:
    graph_data = json.load(f)

# Load vector database info (path, table name, embedding model, record count)
vector_db_info_file = INPUT_DIR / "vector_db_info.json"
with open(vector_db_info_file, 'r', encoding='utf-8') as f:
    vector_db_info = json.load(f)

# Load updated pipeline metadata
metadata_file = INPUT_DIR / "pipeline_metadata_updated.json"
with open(metadata_file, 'r', encoding='utf-8') as f:
    pipeline_metadata = json.load(f)

print(f"📊 Loaded graph data: {len(graph_data['nodes'])} nodes, {len(graph_data['relationships'])} relationships")
print(f"🗃️ Vector database: {vector_db_info['total_records']} records")
print(f"📄 Source documents: {pipeline_metadata['successful_documents']}")

# Reconnect to vector database using the location recorded by the previous stage
db_path = vector_db_info['database_path']
table_name = vector_db_info['table_name']
db = lancedb.connect(db_path)
vector_table = db.open_table(table_name)

print(f"✅ Reconnected to vector database: {table_name}")

# Reload the embedding model named in the vector database info, so queries are
# encoded with the model recorded there
model_name = vector_db_info['embedding_model']
embedding_model = SentenceTransformer(model_name)
print(f"🤖 Reloaded embedding model: {model_name}")

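## Initialize Memgraph (with Simulation Fallback)

**Note**: The connector first tries to reach a Memgraph instance at `localhost:7687`. If the connection fails, it falls back to simulation mode: nodes and relationships are kept in memory and Cypher queries are only printed, not executed.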

In [None]:
class MemgraphConnector:
    """Wrapper for Memgraph operations with fallback simulation.

    If the connection attempt fails for any reason the instance switches to
    simulation mode: created nodes and relationships are appended to
    ``simulated_data`` and Cypher queries are only printed, never executed.

    Instance attributes (set in ``__init__``):
        host, port: Connection target.
        connection: The ``mgclient`` connection, or ``None`` when simulated.
        simulated: ``True`` when no database connection is available.
        simulated_data: ``{'nodes': [...], 'relationships': [...]}``; only
            present in simulation mode.
    """

    def __init__(self, host="localhost", port=7687):
        self.host = host
        self.port = port
        self.connection = None
        self.simulated = False

        try:
            self.connection = mgclient.connect(host=host, port=port)
            # Without autocommit every statement stays inside a transaction
            # that nothing in this class ever commits.
            self.connection.autocommit = True
            print(f"✅ Connected to Memgraph at {host}:{port}")
        except Exception as e:
            # Deliberately broad: any failure to connect means "simulate".
            self.connection = None
            print(f"⚠️ Could not connect to Memgraph: {e}")
            print("🔄 Running in simulation mode")
            self.simulated = True
            self.simulated_data = {'nodes': [], 'relationships': []}

    @staticmethod
    def _quote_name(name) -> str:
        """Return *name* as a backtick-escaped Cypher identifier.

        Labels, relationship types and property keys cannot be passed as
        query parameters, so they are escaped instead of interpolated raw.
        """
        return "`" + str(name).replace("`", "``") + "`"

    @classmethod
    def _property_map(cls, properties: Dict) -> Tuple[str, Dict]:
        """Build the body of a Cypher map literal for *properties*.

        Returns:
            ``(map_body, parameters)`` where ``map_body`` is e.g.
            ``"`title`: $prop_0, `size`: $prop_1"`` (empty string for no
            properties) and ``parameters`` maps the generated placeholder
            names to the property values. Generated names keep property keys
            from clashing with ``id`` / ``from_id`` / ``to_id``.
        """
        entries = []
        parameters = {}
        for index, (key, value) in enumerate(properties.items()):
            placeholder = f"prop_{index}"
            entries.append(f"{cls._quote_name(key)}: ${placeholder}")
            parameters[placeholder] = value
        return ", ".join(entries), parameters

    def _run(self, query: str, parameters: Dict = None) -> List[Dict]:
        """Execute *query* on the live connection and return rows as dicts.

        Raises whatever the driver raises; callers decide how to report it.
        """
        cursor = self.connection.cursor()
        cursor.execute(query, parameters or {})
        rows = cursor.fetchall()
        # Rows come back as plain tuples; column names live on the cursor.
        columns = [column.name for column in (cursor.description or [])]
        return [dict(zip(columns, row)) for row in rows]

    def _execute_write(self, query: str, parameters: Dict = None) -> bool:
        """Run a statement whose rows are irrelevant; return whether it succeeded."""
        try:
            self._run(query, parameters)
        except Exception as e:
            print(f"❌ Query failed: {e}")
            return False
        return True

    def execute_query(self, query: str, parameters: Dict = None) -> List[Dict]:
        """Execute Cypher query.

        Args:
            query: Cypher text.
            parameters: Optional query parameters.

        Returns:
            One dict per result row keyed by column name. An empty list is
            returned in simulation mode and when the query fails (the error
            is printed, not raised).
        """
        if self.simulated:
            print(f"🔄 [SIMULATED] Query: {query[:100]}...")
            return []

        try:
            return self._run(query, parameters)
        except Exception as e:
            print(f"❌ Query failed: {e}")
            return []

    def create_node(self, node: Dict) -> bool:
        """Create a node in the graph.

        Args:
            node: Dict with ``'id'``, ``'type'`` (used as the node label) and
                ``'properties'`` (stored as node properties).

        Returns:
            ``True`` if the node was stored (always in simulation mode),
            ``False`` if the database rejected the statement.

        Raises:
            KeyError: If *node* lacks ``'id'``, ``'type'`` or ``'properties'``.
        """
        # Built before the simulation check so malformed nodes raise KeyError
        # in both modes. node['id'] is authoritative for the ``id`` property
        # because relationships are matched on it.
        extra = {k: v for k, v in node['properties'].items() if k != 'id'}
        property_map, parameters = self._property_map(extra)
        parameters['id'] = node['id']
        fields = "id: $id" + (f", {property_map}" if property_map else "")
        query = f"CREATE (n:{self._quote_name(node['type'])} {{{fields}}})"

        if self.simulated:
            self.simulated_data['nodes'].append(node)
            return True

        return self._execute_write(query, parameters)

    def create_relationship(self, rel: Dict) -> bool:
        """Create a relationship in the graph.

        Args:
            rel: Dict with ``'from'`` and ``'to'`` (node ids), ``'type'``
                (relationship type) and ``'properties'``.

        Returns:
            ``True`` if the statement ran (always in simulation mode),
            ``False`` if the database rejected it.

        Raises:
            KeyError: If *rel* lacks one of the keys above.
        """
        property_map, parameters = self._property_map(rel['properties'])
        parameters['from_id'] = rel['from']
        parameters['to_id'] = rel['to']
        # NOTE(review): if either endpoint id matches no node, the MATCH is
        # empty, nothing is created and the statement still succeeds.
        query = (
            "MATCH (a), (b) "
            "WHERE a.id = $from_id AND b.id = $to_id "
            f"CREATE (a)-[r:{self._quote_name(rel['type'])} {{{property_map}}}]->(b)"
        )

        if self.simulated:
            self.simulated_data['relationships'].append(rel)
            return True

        return self._execute_write(query, parameters)

    def clear_database(self):
        """Clear all data from the database.

        In simulation mode the in-memory store is reset. Against a live
        database every node is detached and deleted; success is only
        reported if the statement actually ran.
        """
        if self.simulated:
            self.simulated_data = {'nodes': [], 'relationships': []}
            print("🔄 [SIMULATED] Database cleared")
            return

        if self._execute_write("MATCH (n) DETACH DELETE n"):
            print("🗑️ Database cleared")

# Initialize graph database connection (defaults to localhost:7687; the
# connector switches to simulation mode if the connection attempt fails)
graph_db = MemgraphConnector()

# Clear database for clean start
# NOTE: against a live Memgraph instance this issues `MATCH (n) DETACH DELETE n`,
# i.e. it removes every node and relationship in that database.
graph_db.clear_database()

## Populate Graph Database

In [None]:
def _describe_record(record, *keys: str) -> str:
    """Join ``record[key]`` for each key with '->' for use in error messages.

    Missing keys (or a record that is not a dict) are rendered as
    ``<missing>`` so that building the message can never raise.
    """
    if not isinstance(record, dict):
        return "->".join("<missing>" for _ in keys)
    return "->".join(str(record.get(key, "<missing>")) for key in keys)


def _create_all(records, create, describe, singular, counter_key, stats) -> None:
    """Call ``create(record)`` for every record and tally the outcome in *stats*.

    A falsy return value or a raised exception counts as one error; progress
    is printed after every tenth record.
    """
    total = len(records)
    for position, record in enumerate(records, start=1):
        try:
            if create(record):
                stats[counter_key] += 1
            else:
                stats['errors'] += 1
        except Exception as e:
            # One bad record must not abort the whole population run.
            print(f"❌ Error creating {singular} {describe(record)}: {e}")
            stats['errors'] += 1

        if position % 10 == 0:
            print(f"  ✅ Created {position}/{total} {singular}s")


def populate_graph_database(graph_db: "MemgraphConnector", graph_data: Dict) -> Dict[str, int]:
    """Populate the graph database with nodes and relationships.

    Nodes are created before relationships so that relationship endpoints
    already exist when they are looked up.

    Args:
        graph_db: Object providing ``create_node(node)`` and
            ``create_relationship(rel)``, each returning a success flag.
        graph_data: Dict with ``'nodes'`` and ``'relationships'`` lists.

    Returns:
        Counters ``{'nodes_created', 'relationships_created', 'errors'}``.
        Failed or malformed records are counted under ``'errors'`` instead
        of stopping the run.
    """
    stats = {'nodes_created': 0, 'relationships_created': 0, 'errors': 0}

    # Create nodes
    print(f"📝 Creating {len(graph_data['nodes'])} nodes...")
    _create_all(
        graph_data['nodes'],
        graph_db.create_node,
        lambda node: _describe_record(node, 'id'),
        'node',
        'nodes_created',
        stats,
    )

    # Create relationships
    print(f"\n🔗 Creating {len(graph_data['relationships'])} relationships...")
    _create_all(
        graph_data['relationships'],
        graph_db.create_relationship,
        lambda rel: _describe_record(rel, 'from', 'to'),
        'relationship',
        'relationships_created',
        stats,
    )

    return stats

# Populate the database
# `graph_db` and `graph_data` are the connector and the loaded graph JSON
# created in the earlier cells.
population_stats = populate_graph_database(graph_db, graph_data)

# Report the counters returned by populate_graph_database
print(f"\n📊 Graph database population completed:")
print(f"  ✅ Nodes created: {population_stats['nodes_created']}")
print(f"  ✅ Relationships created: {population_stats['relationships_created']}")
print(f"  ❌ Errors: {population_stats['errors']}")

## Create Unified RAG System

In [None]:
class UnifiedRAGSystem:
    """RAG system combining vector similarity and graph relationships.

    Args:
        vector_table: Table object supporting
            ``search(vector).limit(n).to_pandas()``.
        graph_db: Connector exposing ``simulated``, ``simulated_data`` (in
            simulation mode) and ``execute_query(query, parameters)``.
        embedding_model: Object with ``encode(text)`` producing the query
            vector.
    """

    def __init__(self, vector_table, graph_db, embedding_model):
        self.vector_table = vector_table
        self.graph_db = graph_db
        self.embedding_model = embedding_model

    @staticmethod
    def _unique_in_order(values) -> List[str]:
        """Return the non-empty strings in *values*, de-duplicated, first seen first.

        Used instead of ``set()`` so that generated text and the choice of
        document types to search do not vary between runs.
        """
        return list(dict.fromkeys(v for v in values if isinstance(v, str) and v))

    @staticmethod
    def _similarity(result: Dict) -> float:
        """Return ``1 - _distance`` for a search hit as a plain ``float``.

        A hit without a usable ``_distance`` scores 0.0.
        """
        distance = result.get('_distance')
        if distance is None:
            return 0.0
        return float(1 - distance)

    def vector_search(self, query: str, limit: int = 5) -> List[Dict]:
        """Perform vector similarity search.

        Encodes *query* with the embedding model and returns up to *limit*
        rows of the vector table as a list of row dicts.
        """
        query_vector = self.embedding_model.encode(query)
        results = self.vector_table.search(query_vector).limit(limit).to_pandas()
        return results.to_dict('records')

    def graph_search(self, document_type: str = None, relationship_type: str = None) -> List[Dict]:
        """Perform graph-based search.

        Args:
            document_type: If given, only nodes whose ``document_type``
                property equals it are returned (in simulation mode the node
                must additionally have type ``'Document'``).
            relationship_type: Accepted for interface compatibility but
                currently ignored.

        Returns:
            At most five nodes (simulation) or result rows (live database).
        """
        if self.graph_db.simulated:
            # Simulate graph search results
            nodes = self.graph_db.simulated_data['nodes']
            if document_type:
                nodes = [
                    node for node in nodes
                    if node['type'] == 'Document'
                    and node['properties'].get('document_type') == document_type
                ]
            return nodes[:5]  # Limit results

        # Real graph query; the value is passed as a parameter so quotes in
        # it cannot break or alter the Cypher statement.
        query = "MATCH (n)"
        parameters = {}
        if document_type:
            query += " WHERE n.document_type = $document_type"
            parameters['document_type'] = document_type
        query += " RETURN n LIMIT 5"

        return self.graph_db.execute_query(query, parameters)

    def find_related_documents(self, document_id: str) -> List[Dict]:
        """Find documents related to a given document.

        Returns up to three neighbours of the node whose id is
        *document_id*, following relationships in either direction.
        """
        if self.graph_db.simulated:
            # Simulate related document search
            data = self.graph_db.simulated_data
            # Index nodes once (first node wins for a duplicated id) instead
            # of rescanning the node list for every relationship.
            first_node_by_id = {}
            for node in data['nodes']:
                first_node_by_id.setdefault(node.get('id'), node)

            related = []
            for rel in data['relationships']:
                if rel['from'] == document_id:
                    other_id = rel['to']
                elif rel['to'] == document_id:
                    other_id = rel['from']
                else:
                    continue
                neighbour = first_node_by_id.get(other_id)
                if neighbour is not None:
                    related.append(neighbour)
            return related[:3]

        # The connector stores a node's type as its label and a
        # relationship's type as the relationship type, so both are read
        # with label matching / type() rather than as properties.
        query = """
        MATCH (d1)-[r]-(d2:Document)
        WHERE d1.id = $document_id
        RETURN d2, type(r) AS relationship_type
        LIMIT 3
        """

        return self.graph_db.execute_query(query, {'document_id': document_id})

    def unified_search(self, query: str, context: Dict = None) -> Dict[str, Any]:
        """Perform unified search combining vector and graph approaches.

        Args:
            query: Free-text query.
            context: Currently unused.

        Returns:
            Dict with ``query``, ``vector_results`` (top 3 hits),
            ``graph_results`` (graph search for the first two document types
            among the hits), ``related_documents`` (neighbours of the top
            hit's document) and ``search_timestamp``.
        """
        # Vector search for content similarity
        vector_results = self.vector_search(query, limit=3)

        # Extract document types from vector results for graph search
        doc_types = self._unique_in_order(r.get('document_type') for r in vector_results)

        # Graph search for structural relationships
        graph_results = []
        for doc_type in doc_types[:2]:  # Limit to 2 document types
            graph_results.extend(self.graph_search(document_type=doc_type))

        # Find related documents
        related_docs = []
        if vector_results:
            # Use the top vector result to find related documents
            source_file = vector_results[0].get('source_file', '')
            # NOTE(review): assumes document node ids have the form
            # "doc_<source_file>" — confirm against the stage that builds
            # graph_data.json.
            related_docs = self.find_related_documents(f"doc_{source_file}")

        return {
            'query': query,
            'vector_results': vector_results,
            'graph_results': graph_results,
            'related_documents': related_docs,
            'search_timestamp': datetime.now().isoformat()
        }

    def answer_question(self, question: str) -> Dict[str, Any]:
        """Generate an answer to a question using RAG approach.

        Returns:
            Dict with ``question``, ``answer``, ``sources`` (one entry per
            vector hit; ``similarity_score`` is ``1 - _distance``),
            ``related_documents`` (a count) and ``confidence`` in [0, 1].
        """
        # Perform unified search
        search_results = self.unified_search(question)

        # Extract relevant content
        contexts = []
        sources = []

        for result in search_results['vector_results']:
            # Coerce to str so the slicing in the answer builder cannot fail
            # on a missing/None content value.
            contexts.append(str(result.get('content') or ''))
            sources.append({
                'file': result.get('source_file', ''),
                'document_type': result.get('document_type', ''),
                'chunk_id': result.get('chunk_id', ''),
                'similarity_score': self._similarity(result)
            })

        # Generate answer (simulated - in real implementation, use LLM)
        answer = self._generate_simulated_answer(question, contexts, search_results)

        return {
            'question': question,
            'answer': answer,
            'sources': sources,
            'related_documents': len(search_results['related_documents']),
            'confidence': self._calculate_confidence(search_results)
        }

    def _generate_simulated_answer(self, question: str, contexts: List[str], search_results: Dict) -> str:
        """Generate a simulated answer (replace with actual LLM in production).

        The template is chosen from keywords in the question ("what is/are",
        "how many"/"count", "where"/"location", otherwise a generic one).
        """
        if not contexts:
            return "I couldn't find relevant information to answer your question."

        hits = search_results['vector_results']

        # Analyze question type
        question_lower = question.lower()

        if "what is" in question_lower or "what are" in question_lower:
            unique_types = self._unique_in_order(h.get('document_type') for h in hits)

            answer = f"Based on the analyzed documents, I found information related to {', '.join(unique_types)} documents. "
            answer += f"The most relevant content includes: {contexts[0][:200]}..."

        elif "how many" in question_lower or "count" in question_lower:
            answer = f"I found {len(hits)} relevant document chunks. "
            answer += f"These span across {len(search_results['related_documents'])} related documents."

        elif "where" in question_lower or "location" in question_lower:
            sources = self._unique_in_order(h.get('source_file') for h in hits)
            answer = f"The information can be found in: {', '.join(sources)}. "
            answer += "These documents contain geographic or location-related data."

        else:
            answer = "Based on my analysis of the document collection, here's what I found: "
            answer += f"The most relevant information comes from {len(hits)} document sections. "
            answer += f"Key content: {contexts[0][:150]}..."

        return answer

    def _calculate_confidence(self, search_results: Dict) -> float:
        """Calculate confidence score for the answer.

        The score is the mean ``1 - _distance`` of the vector hits plus a
        boost of 0.05 per related document (capped at 0.1), clamped to
        [0, 1]. Returns 0.0 when there are no vector hits.
        """
        vector_results = search_results['vector_results']

        if not vector_results:
            return 0.0

        # Base confidence on similarity scores and number of results
        similarity_scores = [self._similarity(r) for r in vector_results]
        avg_similarity = sum(similarity_scores) / len(similarity_scores)

        # Boost confidence if we have related documents
        related_boost = min(0.1, len(search_results['related_documents']) * 0.05)

        # Distances above 1 would otherwise produce a negative confidence.
        return max(0.0, min(1.0, avg_similarity + related_boost))

# Initialize the unified RAG system
# Wires together the vector table, the graph connector and the embedding
# model created in the earlier cells.
rag_system = UnifiedRAGSystem(vector_table, graph_db, embedding_model)
print("🤖 Unified RAG system initialized")

## Test RAG System

In [None]:
# Test the RAG system with various questions
# The wording matters: the simulated answer generator picks its template from
# keywords such as "what is/are", "how many" and "where".
test_questions = [
    "What geographic information is available in the documents?",
    "How many document types were analyzed?",
    "What are the main AI use cases for these documents?",
    "Where can I find location or coordinate data?",
    "What XML schemas are represented in the collection?"
]

print("🤖 Testing RAG System:")
print("=" * 60)

# Collected answers; written to rag_test_results.json by the save cell
rag_results = []

for i, question in enumerate(test_questions, 1):
    print(f"\n❓ Question {i}: {question}")
    
    # Get answer from RAG system
    result = rag_system.answer_question(question)
    rag_results.append(result)
    
    print(f"💡 Answer: {result['answer']}")
    print(f"📊 Confidence: {result['confidence']:.2f}")
    print(f"📚 Sources: {len(result['sources'])} document chunks")
    
    # Sources follow vector-search order, so index 0 is the top hit
    if result['sources']:
        print(f"📁 Top source: {result['sources'][0]['file']} ({result['sources'][0]['document_type']})")
    
    print("-" * 40)

print(f"\n✅ RAG system testing completed: {len(rag_results)} questions processed")

## Generate Insights and Recommendations

In [None]:
def _hit_similarity(hit: Dict) -> float:
    """Return ``1 - _distance`` for a search hit as a plain float (0.0 if absent)."""
    distance = hit.get('_distance')
    if distance is None:
        return 0.0
    return float(1 - distance)


def generate_collection_insights(rag_system: "UnifiedRAGSystem", pipeline_metadata: Dict) -> Dict[str, Any]:
    """Generate insights about the entire document collection.

    Runs a fixed set of probe queries through ``rag_system.vector_search``
    and aggregates the hits.

    Args:
        rag_system: Object providing ``vector_search(query, limit=...)``.
        pipeline_metadata: Must contain ``successful_documents``,
            ``total_chunks``, ``total_tokens`` and a ``vector_processing``
            dict with ``processed_at`` and ``vector_dimension``.

    Returns:
        A JSON-serializable dict with the keys ``collection_overview``,
        ``document_types`` (per type: ``count`` of hits, sorted
        ``ai_use_cases``, ``avg_confidence`` = mean ``1 - _distance`` of the
        hits clamped to [0, 1]), ``ai_applications`` (sorted),
        ``technical_recommendations`` and ``search_performance``.

    Raises:
        KeyError: If a required metadata key is missing.
    """
    vector_meta = pipeline_metadata['vector_processing']
    collection_overview = {
        'total_documents': pipeline_metadata['successful_documents'],
        'total_chunks': pipeline_metadata['total_chunks'],
        'total_tokens': pipeline_metadata['total_tokens'],
        'processing_date': vector_meta['processed_at']
    }

    # Analyze document types through vector search
    type_queries = [
        "geographic data coordinates",
        "security configuration",
        "XML schema structure",
        "data format specification"
    ]

    tallies = {}
    all_use_cases = set()
    for query in type_queries:
        for result in rag_system.vector_search(query, limit=5):
            raw_type = result.get('document_type')
            # Missing, empty or non-string types are grouped under 'Unknown'.
            doc_type = raw_type if isinstance(raw_type, str) and raw_type else 'Unknown'
            tally = tallies.setdefault(
                doc_type, {'count': 0, 'use_cases': set(), 'similarity_sum': 0.0}
            )
            tally['count'] += 1
            tally['similarity_sum'] += _hit_similarity(result)

            # Extract AI use cases (comma-separated string; anything else is
            # treated as "no use cases" rather than crashing on .split)
            raw_use_cases = result.get('ai_use_cases')
            if isinstance(raw_use_cases, str):
                use_cases = {part.strip() for part in raw_use_cases.split(',') if part.strip()}
                tally['use_cases'] |= use_cases
                all_use_cases |= use_cases

    # Sets become sorted lists: JSON-serializable and stable between runs.
    document_types = {
        doc_type: {
            'count': tally['count'],
            'ai_use_cases': sorted(tally['use_cases']),
            'avg_confidence': max(0.0, min(1.0, tally['similarity_sum'] / tally['count']))
        }
        for doc_type, tally in tallies.items()
    }
    ai_applications = sorted(all_use_cases)

    # Generate recommendations
    technical_recommendations = [
        {
            'category': 'Vector Search Optimization',
            'recommendation': 'Consider using domain-specific embedding models for better semantic understanding',
            'priority': 'Medium',
            'rationale': f'Current model handles {len(ai_applications)} different AI use cases'
        },
        {
            'category': 'Graph Relationships',
            'recommendation': 'Expand graph schema to include semantic similarity relationships',
            'priority': 'High',
            'rationale': 'Would improve cross-document discovery and related content suggestions'
        },
        {
            'category': 'RAG Enhancement',
            'recommendation': 'Implement query expansion using document type ontologies',
            'priority': 'Medium',
            'rationale': f'Could improve recall across {len(document_types)} document types'
        }
    ]

    # Analyze search performance: similarity of the single best hit per query
    performance_queries = [
        "geographic coordinates",
        "security settings",
        "XML structure"
    ]

    performance_scores = []
    for query in performance_queries:
        results = rag_system.vector_search(query, limit=1)
        if results:
            performance_scores.append(_hit_similarity(results[0]))

    search_performance = {
        'avg_similarity_score': sum(performance_scores) / len(performance_scores) if performance_scores else 0,
        'test_queries': len(performance_queries),
        'vector_dimension': vector_meta['vector_dimension']
    }

    return {
        'collection_overview': collection_overview,
        'document_types': document_types,
        'ai_applications': ai_applications,
        'technical_recommendations': technical_recommendations,
        'search_performance': search_performance
    }

# Generate comprehensive insights
# NOTE: this issues several vector searches (the probe queries inside
# generate_collection_insights), so it depends on the vector table and
# embedding model loaded earlier.
print("📊 Generating collection insights...")
collection_insights = generate_collection_insights(rag_system, pipeline_metadata)

# Headline numbers copied from the pipeline metadata
print(f"\n🎯 Collection Insights Summary:")
print(f"📄 Documents: {collection_insights['collection_overview']['total_documents']}")
print(f"📝 Chunks: {collection_insights['collection_overview']['total_chunks']:,}")
print(f"🔤 Tokens: {collection_insights['collection_overview']['total_tokens']:,}")

# Per document type: number of probe-query hits and up to three use cases
print(f"\n📋 Document Types:")
for doc_type, info in collection_insights['document_types'].items():
    print(f"  • {doc_type}: {info['count']} occurrences")
    print(f"    AI use cases: {', '.join(info['ai_use_cases'][:3])}")

# Only the first five applications are listed; the header shows the full count
print(f"\n🤖 AI Applications ({len(collection_insights['ai_applications'])}):")
for app in collection_insights['ai_applications'][:5]:
    print(f"  • {app}")

print(f"\n💡 Top Recommendations:")
for rec in collection_insights['technical_recommendations']:
    print(f"  • {rec['category']}: {rec['recommendation']}")
    print(f"    Priority: {rec['priority']} | {rec['rationale']}")

print(f"\n⚡ Search Performance:")
perf = collection_insights['search_performance']
print(f"  • Average similarity: {perf['avg_similarity_score']:.3f}")
print(f"  • Vector dimension: {perf['vector_dimension']}")

## Save Final Pipeline Results

In [None]:
# Save RAG test results
# All outputs of this stage are written into INPUT_DIR, the same directory the
# previous stages' artifacts were read from.
# NOTE(review): default=str turns any value json cannot encode into its string
# form instead of raising; the graph-stats and final-summary dumps below do
# not pass it, so such a value would raise TypeError there.
rag_results_file = INPUT_DIR / "rag_test_results.json"
with open(rag_results_file, 'w') as f:
    json.dump(rag_results, f, indent=2, default=str)

print(f"💾 Saved RAG test results to: {rag_results_file}")

# Save collection insights
insights_file = INPUT_DIR / "collection_insights.json"
with open(insights_file, 'w') as f:
    json.dump(collection_insights, f, indent=2, default=str)

print(f"💾 Saved collection insights to: {insights_file}")

# Save graph database stats
# total_nodes / total_relationships are the sizes of the input graph data;
# the counts actually created are in population_stats.
graph_stats = {
    'population_stats': population_stats,
    'simulated_mode': graph_db.simulated,
    'total_nodes': len(graph_data['nodes']),
    'total_relationships': len(graph_data['relationships']),
    'node_types': pipeline_metadata['graph_preparation']['node_types'],
    'relationship_types': pipeline_metadata['graph_preparation']['relationship_types']
}

graph_stats_file = INPUT_DIR / "graph_database_stats.json"
with open(graph_stats_file, 'w') as f:
    json.dump(graph_stats, f, indent=2)

print(f"💾 Saved graph database stats to: {graph_stats_file}")

# Create final pipeline summary
# Combines the pipeline metadata, the graph stats and the RAG test results
# into a single summary document.
final_summary = {
    'pipeline_completed_at': datetime.now().isoformat(),
    'stages_completed': ['data_ingestion', 'vector_population', 'graph_rag'],
    'input_files': pipeline_metadata['input_files'],
    'processing_summary': {
        'documents_processed': pipeline_metadata['successful_documents'],
        'chunks_created': pipeline_metadata['total_chunks'],
        'vectors_generated': pipeline_metadata['vector_processing']['total_vectors'],
        'graph_nodes': graph_stats['total_nodes'],
        'graph_relationships': graph_stats['total_relationships']
    },
    'rag_system': {
        'questions_tested': len(rag_results),
        # Mean confidence over the test questions; 0 when none were run
        'avg_confidence': sum(r['confidence'] for r in rag_results) / len(rag_results) if rag_results else 0,
        'search_performance': collection_insights['search_performance']
    },
    'recommendations': collection_insights['technical_recommendations'],
    'output_files': {
        'vector_database': vector_db_info['database_path'],
        'rag_results': str(rag_results_file),
        'insights': str(insights_file),
        'graph_stats': str(graph_stats_file)
    }
}

final_summary_file = INPUT_DIR / "pipeline_final_summary.json"
with open(final_summary_file, 'w') as f:
    json.dump(final_summary, f, indent=2)

print(f"💾 Saved final pipeline summary to: {final_summary_file}")

# Closing report, read back from final_summary
print(f"\n🎉 Complete pipeline execution finished successfully!")
print(f"📊 Processed {final_summary['processing_summary']['documents_processed']} documents")
print(f"🔍 Created searchable vector database with {final_summary['processing_summary']['vectors_generated']} embeddings")
print(f"🕸️ Built knowledge graph with {final_summary['processing_summary']['graph_nodes']} nodes")
print(f"🤖 Tested RAG system with {final_summary['rag_system']['questions_tested']} questions")
print(f"💡 Generated {len(final_summary['recommendations'])} improvement recommendations")

print(f"\n📁 All outputs saved to: {INPUT_DIR}")
print(f"✨ Ready for production deployment and further development!")