# Visual RAG with Knowledge Graph Demo

This notebook demonstrates the new Visual RAG capabilities that extend the original RAG-knowledgegraph project with:

1. **Multimodal Visual RAG**: Image ingestion, OCR, captioning, and visual question answering
2. **Interactive Web UI**: Streamlit-based interface with knowledge graph visualization
3. **Enhanced Provenance**: Detailed tracking of text and image sources

## Features Demonstrated

- 🖼️ **Image Processing**: OCR text extraction, image captioning, object detection
- 🧠 **Multimodal Knowledge Graph**: Combining text documents with visual information
- 🔍 **Visual Question Answering**: Queries that leverage both text and image content
- 📊 **Interactive Visualization**: Knowledge graph visualization with provenance
- 💬 **Feedback Collection**: User feedback system for continuous improvement

Let's start by setting up the environment and exploring these new capabilities!

In [None]:
# Import required packages (this cell only imports; it does not install anything)
import os
import sys

# Add parent directory to path for imports.
# NOTE: '__file__' is a quoted string literal, not the module attribute, so
# abspath() resolves it against the current working directory. The two
# dirname() calls therefore yield the parent of the working directory.
# This must run before the project imports further down.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex, Settings, StorageContext
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.llms.huggingface import HuggingFaceLLM
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
import networkx as nx
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
from datetime import datetime

# Import our custom modules
# (presumably resolved through the sys.path entry added above — verify the
# notebook is launched from a direct child of the project root)
from pipeline.visual_rag import VisualRAGPipeline, VisualRAGResult
from ingest.image_ingest import ImageIngestor
from tools.knowledge_graph_visualizer import KnowledgeGraphVisualizer

print("✅ All imports successful!")

## 1. Setup Configuration

First, let's configure our models and set up the necessary tokens. You'll need a HuggingFace token for this demo.

In [None]:
# Configuration
# Central place for the token placeholder, model identifiers and data paths
# used by the rest of the notebook.
HF_TOKEN = 'your_huggingface_token_here'  # Replace with your HuggingFace token

# Model configurations
LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta"
EMBEDDING_MODEL = "thenlper/gte-large"
CAPTION_MODEL = "Salesforce/blip-image-captioning-base"

# Paths
DOCUMENTS_PATH = "../documents/sample_ai"
IMAGES_PATH = "../documents"
STORAGE_PATH = "../storage"

# Display configuration
plt.style.use('seaborn-v0_8')
# The "husl" palette is provided by seaborn, which the import cell does not
# load. Import it here and degrade gracefully when it is not installed, so
# the cell no longer fails with a NameError on `sns`.
try:
    import seaborn as sns
except ImportError:
    print("⚠️ seaborn is not installed; keeping the default matplotlib colors")
else:
    sns.set_palette("husl")

print("Configuration loaded!")
print(f"📄 Documents path: {DOCUMENTS_PATH}")
print(f"🖼️ Images path: {IMAGES_PATH}")
print(f"💾 Storage path: {STORAGE_PATH}")

# Check if paths exist (warn only; later cells report their own errors)
if not os.path.exists(DOCUMENTS_PATH):
    print(f"⚠️ Warning: Documents path does not exist: {DOCUMENTS_PATH}")
if not os.path.exists(IMAGES_PATH):
    print(f"⚠️ Warning: Images path does not exist: {IMAGES_PATH}")

# Remind the user when the placeholder token has not been replaced
if HF_TOKEN == 'your_huggingface_token_here':
    print("⚠️ Please set your HuggingFace token in the configuration above!")

## 2. Initialize Models and Load Documents

Let's set up our LLM and embedding model, then load the existing documents to create our knowledge graph.

In [None]:
# Initialize the LLM and the embedding model
import os

# Security: never embed a real access token in the notebook. Prefer the
# HUGGINGFACEHUB_API_TOKEN environment variable and otherwise fall back to
# the value entered in the configuration cell.
# NOTE(review): a literal token used to be committed on this line; it should
# be treated as leaked and revoked.
HF_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN') or HF_TOKEN

# Only forward the token when it is a real value, not the placeholder.
hf_token_is_set = bool(HF_TOKEN) and HF_TOKEN != 'your_huggingface_token_here'
if not hf_token_is_set:
    print("⚠️ No HuggingFace token configured; only public models can be downloaded")

llm = HuggingFaceLLM(
    model_name=LLM_MODEL,
    # Use the tokenizer that belongs to the model instead of the library default.
    tokenizer_name=LLM_MODEL,
    model_kwargs={"use_auth_token": HF_TOKEN} if hf_token_is_set else {},
)

# The knowledge-graph cell passes `embed_model` to the index, so it has to be
# created here.
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL))

# Register both models as the llama-index defaults so index construction and
# querying use them.
Settings.llm = llm
Settings.embed_model = embed_model


In [None]:
# Load Documents and Create Knowledge Graph
# Reads every file under DOCUMENTS_PATH, builds a KnowledgeGraphIndex over it
# and prints node/edge counts. Defines `documents`, `kg_index` and `graph`
# for the following cells.
print("📚 Loading documents and creating knowledge graph...")

try:
    # Load documents
    documents = SimpleDirectoryReader(DOCUMENTS_PATH).load_data()
    print(f"✅ Loaded {len(documents)} documents")

    # Display document information
    for i, doc in enumerate(documents, 1):
        print(f"  📄 Document {i}: {len(doc.text)} characters")
        if hasattr(doc, 'metadata') and 'file_name' in doc.metadata:
            print(f"     File: {doc.metadata['file_name']}")

    # Setup storage context
    graph_store = SimpleGraphStore()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # Create Knowledge Graph Index
    # NOTE: `embed_model` is not defined in this cell; it must already exist
    # in the notebook namespace when this cell runs.
    print("🔨 Building knowledge graph...")
    kg_index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=3,
        storage_context=storage_context,
        embed_model=embed_model,
        include_embeddings=True
    )

    print("✅ Knowledge graph created successfully!")

    # Get basic statistics
    graph = kg_index.get_networkx_graph()
    print("📊 Graph statistics:")
    print(f"   • Nodes: {len(graph.nodes)}")
    print(f"   • Edges: {len(graph.edges)}")

except Exception as e:
    # Best-effort demo cell: report the failure instead of aborting, but show
    # the traceback as well (same handling as the visual-pipeline cell) so
    # the root cause is not hidden behind a one-line message.
    print(f"❌ Error creating knowledge graph: {e}")
    import traceback
    traceback.print_exc()

## 3. Add Visual Capabilities

Now let's initialize our Visual RAG pipeline and add images to the knowledge graph!

In [None]:
# Initialize Visual RAG Pipeline
# Wraps the existing `kg_index` in a VisualRAGPipeline, collects the image
# files directly under IMAGES_PATH and adds them to the knowledge graph.
# Defines `visual_rag`, `image_ingestor` and `available_images`.
print("🖼️ Initializing Visual RAG pipeline...")

try:
    # Create Visual RAG Pipeline
    visual_rag = VisualRAGPipeline(kg_index)
    print("✅ Visual RAG pipeline created")

    # Initialize Image Ingestor
    image_ingestor = ImageIngestor(
        caption_model_name=CAPTION_MODEL
    )
    print("✅ Image ingestor initialized")

    # Find available images (non-recursive). A tuple lets str.endswith test
    # all extensions in one call; sorting makes the processing order stable.
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')
    available_images = []

    if os.path.isdir(IMAGES_PATH):
        available_images = [
            os.path.join(IMAGES_PATH, filename)
            for filename in sorted(os.listdir(IMAGES_PATH))
            if filename.lower().endswith(image_extensions)
        ]

    print(f"🔍 Found {len(available_images)} images:")
    for img_path in available_images:
        print(f"   📸 {os.path.basename(img_path)}")

    if available_images:
        # Snapshot the graph right before ingestion so the deltas below do
        # not depend on a `graph` variable left behind by an earlier cell.
        baseline_graph = kg_index.get_networkx_graph()

        # Process images and add to knowledge graph
        print("🔨 Processing images and adding to knowledge graph...")
        num_nodes_added = visual_rag.add_images_to_kg(available_images)
        print(f"✅ Added {num_nodes_added} image-related nodes to knowledge graph")

        # Update graph statistics
        updated_graph = kg_index.get_networkx_graph()
        node_delta = len(updated_graph.nodes) - len(baseline_graph.nodes)
        edge_delta = len(updated_graph.edges) - len(baseline_graph.edges)
        print("📊 Updated graph statistics:")
        print(f"   • Nodes: {len(updated_graph.nodes)} (+{node_delta})")
        print(f"   • Edges: {len(updated_graph.edges)} (+{edge_delta})")
    else:
        print("⚠️ No images found to process")

except Exception as e:
    # Best-effort demo cell: report the failure with its traceback instead
    # of aborting the notebook run.
    print(f"❌ Error initializing visual capabilities: {e}")
    import traceback
    traceback.print_exc()

## 4. Test Multimodal Queries

Now let's test our Visual RAG system with various types of queries that combine text and visual information.

In [None]:
# Define test queries
test_queries = [
    "What is artificial intelligence and how is it represented visually?",
    "Who coined the term AI? Are there any images related to this topic?",
    "Explain machine learning with any visual diagrams or charts available",
    "What visual information is available about AI research?",
    "How many PhD researchers are in India according to the documents and images?"
]


def truncate_text(text, limit):
    """Return `text` shortened to at most `limit` characters.

    An ellipsis is appended only when characters were actually dropped, so a
    short answer is not displayed as if it had been cut off.
    """
    if len(text) > limit:
        return text[:limit] + "..."
    return text


print("🔍 Testing Visual RAG with multimodal queries...")
print("=" * 60)

# Exactly one (query, result) pair per query; result is None when the query
# or the display of its result failed.
results = []

# Test each query
for i, query in enumerate(test_queries, 1):
    print(f"\n🔸 Query {i}: {query}")
    print("-" * 50)

    outcome = None
    try:
        # Execute visual RAG query
        result = visual_rag.query_with_visual_context(
            query=query,
            include_images=True,
            max_text_results=3,
            max_image_results=2
        )

        # Display results
        print(f"💡 Answer: {truncate_text(result.answer, 200)}")
        print(f"📊 Confidence: {result.confidence_score:.2%}")
        print(f"📄 Text sources: {len(result.text_sources)}")
        print(f"🖼️ Image sources: {len(result.image_sources)}")
        print(f"🕸️ Graph nodes: {len(result.graph_context.get('nodes', {}))}")

        # Show source breakdown (first two of each kind)
        if result.text_sources:
            print("   Text sources:")
            for source in result.text_sources[:2]:
                print(f"     • {truncate_text(source['text'], 80)}")

        if result.image_sources:
            print("   Image sources:")
            for source in result.image_sources[:2]:
                img_type = source['metadata'].get('type', 'unknown')
                print(f"     • {img_type}: {truncate_text(source['text'], 60)}")

        # Only keep the result once it has been read successfully; the
        # later cells access the same attributes without a try block.
        outcome = result

    except Exception as e:
        print(f"❌ Error processing query: {e}")

    # Record once per query, so a failure after the query call cannot leave
    # two entries for the same query.
    results.append((query, outcome))

print(f"\n✅ Completed testing {len(test_queries)} queries")

## 5. Visualize Results and Provenance

Let's create visualizations to better understand our results and their provenance.

In [None]:
# Visualize Query Performance
# Turns the (query, result) pairs in `results` into a metrics table `df` and
# draws a 2x2 figure: confidence per query, source-type split, sources per
# query, and graph nodes vs confidence.
if results:
    print("📊 Creating performance visualizations...")

    # Extract metrics from results; entries whose result is falsy (e.g. None)
    # are skipped.
    query_data = [
        {
            'Query': query[:30] + "..." if len(query) > 30 else query,
            'Confidence': result.confidence_score,
            'Text Sources': len(result.text_sources),
            'Image Sources': len(result.image_sources),
            'Graph Nodes': len(result.graph_context.get('nodes', {})),
            'Total Sources': len(result.text_sources) + len(result.image_sources)
        }
        for query, result in results
        if result
    ]

    if query_data:
        df = pd.DataFrame(query_data)

        # Create subplots
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Visual RAG Performance Analysis', fontsize=16, fontweight='bold')

        # 1. Confidence scores
        ax1 = axes[0, 0]
        bars1 = ax1.bar(range(len(df)), df['Confidence'], color='skyblue', alpha=0.7)
        ax1.set_title('Confidence Scores by Query')
        ax1.set_xlabel('Query Index')
        ax1.set_ylabel('Confidence Score')
        ax1.set_ylim(0, 1)

        # Add value labels on bars
        for bar in bars1:
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{height:.2f}', ha='center', va='bottom')

        # 2. Source distribution
        ax2 = axes[0, 1]
        source_data = df[['Text Sources', 'Image Sources']].sum()
        if source_data.sum() > 0:
            ax2.pie(source_data, labels=source_data.index, autopct='%1.1f%%',
                    colors=['lightcoral', 'lightgreen'])
        else:
            # An all-zero series cannot be normalised into pie wedges, so
            # show a note instead of calling pie().
            ax2.text(0.5, 0.5, 'No sources retrieved', ha='center', va='center',
                     transform=ax2.transAxes)
            ax2.set_xticks([])
            ax2.set_yticks([])
        ax2.set_title('Distribution of Source Types')

        # 3. Sources per query (grouped bars, offset by half a bar width)
        ax3 = axes[1, 0]
        x = range(len(df))
        width = 0.35
        ax3.bar([i - width/2 for i in x], df['Text Sources'], width,
                label='Text Sources', color='lightcoral', alpha=0.7)
        ax3.bar([i + width/2 for i in x], df['Image Sources'], width,
                label='Image Sources', color='lightgreen', alpha=0.7)
        ax3.set_title('Sources per Query')
        ax3.set_xlabel('Query Index')
        ax3.set_ylabel('Number of Sources')
        ax3.legend()

        # 4. Graph nodes vs confidence (colour encodes total source count)
        ax4 = axes[1, 1]
        scatter = ax4.scatter(df['Graph Nodes'], df['Confidence'],
                             c=df['Total Sources'], cmap='viridis', alpha=0.7, s=100)
        ax4.set_title('Graph Nodes vs Confidence')
        ax4.set_xlabel('Number of Graph Nodes')
        ax4.set_ylabel('Confidence Score')
        plt.colorbar(scatter, ax=ax4, label='Total Sources')

        plt.tight_layout()
        plt.show()

        # Display summary table
        print("\n📋 Query Results Summary:")
        print(df.to_string(index=False))

    else:
        print("⚠️ No valid results to visualize")
else:
    print("⚠️ No results available for visualization")

In [None]:
# Detailed Analysis of Best Result
# Picks the successful query with the highest confidence score and prints its
# answer, sources, graph context and provenance counts.
if not results:
    print("⚠️ No results available for analysis")
else:
    # Keep only the pairs whose result is present
    valid_results = [pair for pair in results if pair[1] is not None]

    if not valid_results:
        print("⚠️ No valid results to analyze")
    else:
        best_query, best_result = max(
            valid_results, key=lambda pair: pair[1].confidence_score
        )
        divider = "=" * 60

        print("🏆 Best Result Analysis")
        print(divider)
        print(f"🔸 Query: {best_query}")
        print(f"📊 Confidence: {best_result.confidence_score:.2%}")
        print(f"💡 Answer: {best_result.answer}")
        print(f"\n{divider}")

        # Text sources, numbered from 1
        text_hits = best_result.text_sources
        print(f"\n📄 Text Sources ({len(text_hits)}):")
        for rank, hit in enumerate(text_hits, start=1):
            print(f"   {rank}. Score: {hit.get('score', 0):.3f}")
            print(f"      Text: {hit['text'][:150]}...")
            print(f"      Type: {hit['metadata'].get('type', 'text')}")
            print()

        # Image sources, numbered from 1; the file name is shown when known
        image_hits = best_result.image_sources
        print(f"\n🖼️ Image Sources ({len(image_hits)}):")
        for rank, hit in enumerate(image_hits, start=1):
            print(f"   {rank}. Score: {hit.get('score', 0):.3f}")
            print(f"      Text: {hit['text'][:100]}...")
            print(f"      Type: {hit['metadata'].get('type', 'image')}")
            if 'image_path' in hit['metadata']:
                image_name = os.path.basename(hit['metadata']['image_path'])
                print(f"      Image: {image_name}")
            print()

        # Size of the graph neighbourhood attached to the answer
        graph_context = best_result.graph_context
        context_nodes = graph_context.get('nodes', {})
        print("\n🕸️ Knowledge Graph Context:")
        print(f"   • Nodes in context: {len(context_nodes)}")
        context_edges = graph_context.get('edges', [])
        print(f"   • Edges in context: {len(context_edges)}")
        print(f"   • Total graph nodes: {graph_context.get('total_graph_nodes', 0)}")

        # Counts of the identifiers recorded as provenance
        print("\n🔍 Provenance Information:")
        provenance = best_result.provenance
        for label, key in (
            ("Text node IDs", 'text_node_ids'),
            ("Image node IDs", 'image_node_ids'),
            ("Graph nodes used", 'graph_nodes'),
        ):
            print(f"   • {label}: {len(provenance.get(key, []))}")

## 6. Demonstrate Web UI Capabilities

Let's show how to launch the interactive web interface for visual exploration.

In [None]:
# Web UI Launch Instructions
# Purely informational cell: prints how to start the Streamlit app, a few
# sample queries, and a short feature list.
launch_steps = (
    "🌐 Interactive Web UI",
    "=" * 40,
    "",
    "To launch the interactive Streamlit web interface:",
    "",
    "1. Open a terminal in the project root directory",
    "2. Run the following command:",
    "",
    "   streamlit run webapp/app.py",
    "",
    "3. The web interface will open in your browser with features:",
    "   • 🔧 Model configuration",
    "   • 🔍 Interactive querying",
    "   • 📊 Real-time results visualization",
    "   • 🕸️ Knowledge graph exploration",
    "   • 💬 Feedback collection",
    "   • 📈 Performance analytics",
    "",
    "4. Configure your settings in the sidebar:",
    "   • Add your HuggingFace token",
    "   • Select models",
    "   • Set document and image paths",
    "   • Click 'Initialize System'",
    "",
    "5. Start querying with natural language!",
    "",
)
# One print call; joining with newlines yields the same text as one print
# per line.
print("\n".join(launch_steps))

# Show sample web UI interactions (query text paired with its purpose)
sample_interactions = [
    {"query": query_text, "description": purpose}
    for query_text, purpose in (
        ("What is artificial intelligence?",
         "Basic text-based query to test the system"),
        ("Show me visual information about AI",
         "Image-focused query to retrieve visual content"),
        ("Explain machine learning with diagrams",
         "Multimodal query combining text and visual sources"),
        ("What images are available in the knowledge base?",
         "Query to explore available visual content"),
    )
]

print("💡 Sample queries to try in the web interface:")
print("-" * 50)
for number, entry in enumerate(sample_interactions, start=1):
    # Two lines per entry followed by a blank separator line
    print(f'{number}. Query: "{entry["query"]}"')
    print(f"   Purpose: {entry['description']}")
    print()

ui_highlights = (
    "✨ The web interface provides:",
    "• Real-time visualization of knowledge graph relationships",
    "• Interactive feedback collection for continuous improvement",
    "• Detailed provenance tracking for transparency",
    "• Side-by-side comparison of text and image sources",
    "• Performance metrics and confidence scoring",
)
print("\n".join(ui_highlights))

## 7. Summary and Next Steps

Congratulations! You've successfully explored the enhanced Visual RAG capabilities. Here's what we accomplished:

In [None]:
# Demo Summary
# Prints the closing report: achievements, metrics from `df` when that table
# exists, next steps, usage ideas and the files belonging to the extension.
print("🎉 Visual RAG Demo Summary")
print("=" * 50)
print()

achievements = [
    "✅ Successfully integrated image processing capabilities",
    "✅ Created multimodal knowledge graph with text and visual nodes",
    "✅ Implemented OCR, captioning, and object detection",
    "✅ Built Visual RAG pipeline for multimodal queries",
    "✅ Demonstrated provenance tracking and visualization",
    "✅ Created interactive web interface with Streamlit",
    "✅ Tested various query types and analyzed performance",
]
print("\n".join(achievements))

print()
print("📊 Key Metrics from this Demo:")
# `df` only exists when the visualization cell produced a table
if 'df' in locals():
    if not df.empty:
        print(f"   • Queries tested: {len(df)}")
        print(f"   • Average confidence: {df['Confidence'].mean():.2%}")
        print(f"   • Total sources used: {df['Total Sources'].sum()}")
        if 'available_images' in locals():
            images_processed = len(available_images)
        else:
            images_processed = 0
        print(f"   • Images processed: {images_processed}")

print()
print("🚀 Next Steps:")
next_steps = [
    "1. Experiment with more complex multimodal queries",
    "2. Add more diverse image types (diagrams, charts, screenshots)",
    "3. Fine-tune confidence scoring and relevance ranking",
    "4. Implement advanced object detection models",
    "5. Add support for video content",
    "6. Enhance the web UI with more visualization options",
    "7. Collect user feedback to improve the system",
    "8. Scale to larger document and image collections",
]
print("\n".join(next_steps))

print()
print("💡 Advanced Usage Ideas:")
advanced_ideas = [
    "• Medical document analysis with X-ray/MRI integration",
    "• Technical manual understanding with diagram support",
    "• Research paper analysis with figure comprehension",
    "• Educational content with visual learning materials",
    "• Legal document review with evidence images",
    "• Scientific literature with data visualizations",
]
print("\n".join(advanced_ideas))

print()
print("📚 Files Created in this Extension:")
files_created = [
    "ingest/image_ingest.py - Image processing and node creation",
    "pipeline/visual_rag.py - Multimodal RAG pipeline",
    "webapp/app.py - Streamlit web interface",
    "webapp/provenance.py - Provenance tracking system",
    "webapp/feedback_sink.py - User feedback collection",
    "tools/visual_utils.py - Image utilities and visualization",
    "notebooks/visual_rag_demo.ipynb - This demonstration notebook",
]
print("\n".join(f"   📄 {entry}" for entry in files_created))

closing_lines = (
    "",
    "🎯 This enhanced RAG system now provides:",
    "   • Multimodal understanding (text + images)",
    "   • Interactive web interface",
    "   • Detailed provenance tracking",
    "   • User feedback collection",
    "   • Visual knowledge graph exploration",
    "   • Comprehensive testing and evaluation",
    "",
    "Ready to revolutionize your document understanding! 🚀",
)
print("\n".join(closing_lines))