# Local RAG Pipeline - Interactive Notebook

This notebook demonstrates how to use the RAG pipeline interactively.

## 1. Setup and Initialization

In [None]:
from rag_pipeline import LocalRAGPipeline
import json
from pathlib import Path

# Initialize the RAG pipeline.
# All later cells use this `rag` object, so run this cell first.
# NOTE(review): chunk_size / chunk_overlap units (characters vs. tokens) are
# defined by LocalRAGPipeline and are not visible here — confirm before tuning.
rag = LocalRAGPipeline(
    embedding_model="all-MiniLM-L6-v2",
    llm_model="microsoft/phi-2",
    chunk_size=500,
    chunk_overlap=50,
    storage_path="./notebook_rag_storage"
)

# Status emoji restored: the original literal was UTF-8 mis-decoded into "‚úÖ".
print("✅ RAG Pipeline initialized!")

## 2. Load Documents

In [None]:
# Directory holding the documents to ingest; point this at your own data.
DOCUMENTS_PATH = "./my_documents"

# Load documents. The resulting `documents` collection is what the
# index-building cell passes to the vector index and knowledge graph builders.
documents = rag.load_documents(DOCUMENTS_PATH)

# Emoji restored from its mis-decoded form.
print(f"📚 Loaded {len(documents)} document chunks")
# Plain string: there is nothing to interpolate here, so no f-prefix is needed.
print("\nFirst chunk preview:")
# Guard against an empty result so the preview cannot raise IndexError.
print(documents[0].content[:200] if documents else "No documents loaded")

## 3. Build Indices

In [None]:
# Build the vector index from the loaded chunks.
# (Progress emoji below were restored from their mis-decoded UTF-8 form.)
print("🔧 Building vector index...")
rag.build_vector_index(documents)

# Build the knowledge graph from the same chunks.
print("🕸️  Building knowledge graph...")
rag.build_knowledge_graph(documents)

print("\n✅ Indices built successfully!")

## 4. Explore Pipeline Statistics

In [None]:
# Summarize what the pipeline currently holds: document count, embedding
# dimension, and the size of the knowledge graph.
# (Header emoji restored from its mis-decoded UTF-8 form.)
print("📊 Pipeline Statistics")
print("=" * 50)
print(f"Total documents: {len(rag.documents)}")
print(f"Embedding dimension: {rag.embedding_dim}")
print(f"Knowledge graph nodes: {rag.knowledge_graph.number_of_nodes()}")
print(f"Knowledge graph edges: {rag.knowledge_graph.number_of_edges()}")

## 5. Vector Search Example

In [None]:
# This `query` is also reused by the graph-search cell that follows.
query = "What are the main topics in the documents?"

# Perform vector search; each result is unpacked below as a (document, score) pair.
results = rag.vector_search(query, top_k=3)

# Header emoji restored from its mis-decoded UTF-8 form.
print(f"🔍 Vector Search Results for: '{query}'\n")
for i, (doc, score) in enumerate(results, 1):
    print(f"{i}. Score: {score:.4f}")
    # Fall back to 'Unknown' when a chunk carries no source metadata.
    print(f"   Source: {doc.metadata.get('source', 'Unknown')}")
    # Show only the first 150 characters to keep the output readable.
    print(f"   Content: {doc.content[:150]}...\n")

## 6. Graph Search Example

In [None]:
# Perform graph search.
# NOTE: `query` is the one defined in the vector-search cell above; run that
# cell first (or set `query` here) so both searches answer the same question.
results = rag.graph_search(query, top_k=3)

# Header emoji restored from its mis-decoded UTF-8 form.
print(f"🕸️  Graph Search Results for: '{query}'\n")
# The results are iterated as bare documents here, with no score to unpack.
for i, doc in enumerate(results, 1):
    print(f"{i}. Source: {doc.metadata.get('source', 'Unknown')}")
    print(f"   Content: {doc.content[:150]}...\n")

## 7. Hybrid Search with Answer Generation

In [None]:
# Ask a question and get an answer.
query = "What are the key concepts discussed?"

# The returned mapping is read below via its 'question', 'answer' and
# 'retrieved_documents' keys.
result = rag.query(query, search_type="hybrid", top_k=5)

print("="*80)
print(f"Question: {result['question']}")
print("="*80)
# Emoji below restored from their mis-decoded UTF-8 form.
print(f"\n🤖 Answer:\n{result['answer']}")
print("\n" + "="*80)
print("📚 Retrieved Documents:")
print("="*80)
# Only the first three retrieved documents are shown, even though top_k=5.
for i, doc in enumerate(result['retrieved_documents'][:3], 1):
    # Use .get with a default, as the search cells do, so a chunk without a
    # 'source' entry prints 'Unknown' instead of raising KeyError.
    print(f"\n{i}. {doc['metadata'].get('source', 'Unknown')}")
    print(f"   {doc['content'][:200]}...")

## 8. Compare Search Types

In [None]:
query = "Explain the important concepts"

# Run the same question through each retrieval mode and keep every response,
# keyed by mode, so the answers can be compared side by side.
search_types = ['vector', 'graph', 'hybrid']
results = {}
divider = '=' * 80

for search_type in search_types:
    results[search_type] = result = rag.query(query, search_type=search_type, top_k=3)
    # Only the first 200 characters of each answer are printed.
    answer_preview = result['answer'][:200]

    print(f"\n{divider}")
    print(f"{search_type.upper()} SEARCH")
    print(divider)
    print(f"Answer: {answer_preview}...\n")

## 9. Visualize Knowledge Graph

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Tunables for the visualization.
TOP_N_NODES = 20   # how many of the highest-degree nodes to draw
LAYOUT_SEED = 42   # fixes the spring layout's random start so re-runs draw the same figure

# Get a subgraph of the most connected nodes: rank nodes by degree, keep the
# top TOP_N_NODES, and take the induced subgraph.
degrees = dict(rag.knowledge_graph.degree())
top_nodes = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:TOP_N_NODES]
top_node_ids = [node for node, _ in top_nodes]

subgraph = rag.knowledge_graph.subgraph(top_node_ids)

# Draw the graph
plt.figure(figsize=(15, 10))
# spring_layout starts from random positions; seeding it makes the figure
# reproducible under Restart & Run All.
pos = nx.spring_layout(subgraph, k=2, iterations=50, seed=LAYOUT_SEED)

# Color nodes by type: nodes whose 'type' attribute is 'document' are blue,
# everything else (including nodes with no 'type') is green.
node_colors = [
    'lightblue' if subgraph.nodes[node].get('type') == 'document' else 'lightgreen'
    for node in subgraph.nodes()
]

nx.draw(
    subgraph,
    pos,
    node_color=node_colors,
    node_size=500,
    with_labels=True,
    font_size=8,
    font_weight='bold',
    arrows=True,
    edge_color='gray',
    alpha=0.7
)

# The title follows TOP_N_NODES so it cannot drift from the slice above.
plt.title(f"Knowledge Graph - Top {TOP_N_NODES} Connected Nodes\n(Blue=Documents, Green=Entities)")
plt.tight_layout()
plt.show()

## 10. Save the Pipeline

In [None]:
# Save for later use, then report the pipeline's storage_path.
rag.save()
# Emoji restored from its mis-decoded UTF-8 form.
print("💾 Pipeline saved successfully!")
print(f"Location: {rag.storage_path}")

## 11. Interactive Query Cell

Run this cell multiple times with different questions!

In [None]:
# Change this question and re-run the cell.
my_question = "What are the main topics?"

result = rag.query(my_question, search_type="hybrid", top_k=5)

print("Question:", my_question)
print("\nAnswer:", result['answer'])
print("\nSources:")
# Only the first three retrieved documents are listed, even though top_k=5.
for i, doc in enumerate(result['retrieved_documents'][:3], 1):
    # Use .get with a default, as the search cells do, so a chunk without a
    # 'source' entry prints 'Unknown' instead of raising KeyError.
    print(f"{i}. {doc['metadata'].get('source', 'Unknown')}")