# GraphRAG Complete

## Overview

Next-generation RAG: build a knowledge graph, generate embeddings, store them in a vector database, implement hybrid RAG, and integrate with an LLM.

## Workflow: Build KG → Generate Embeddings → Vector Store → Hybrid RAG → LLM Integration


In [None]:
from semantica.ingest import FileIngestor, WebIngestor
from semantica.parse import DocumentParser, WebParser
from semantica.semantic_extract import NERExtractor, RelationExtractor
from semantica.kg import GraphBuilder
from semantica.embeddings import EmbeddingGenerator
from semantica.vector_store import VectorStore, HybridSearch
from semantica.context import ContextRetriever
import numpy as np
import os
import tempfile


## Step 1: Build Knowledge Graph


In [None]:
# Instantiate the ingestion, parsing, extraction and graph-building components.
file_ingestor = FileIngestor()
web_ingestor = WebIngestor()
document_parser = DocumentParser()
web_parser = WebParser()
ner_extractor = NERExtractor()
relation_extractor = RelationExtractor()
builder = GraphBuilder()

# Fresh scratch directory that will hold the three sample documents.
temp_dir = tempfile.mkdtemp()

doc1_file, doc2_file, doc3_file = (
    os.path.join(temp_dir, sample_name)
    for sample_name in ("ai_intro.txt", "ml_basics.txt", "dl_guide.txt")
)

# Sample corpus: target path -> text written to that path (insertion order
# is the write order).
sample_texts = {
    doc1_file: "Introduction to AI: Artificial Intelligence is transforming industries. Neural Networks are key components.",
    doc2_file: "Machine Learning Basics: ML algorithms learn from data patterns. Neural Networks enable complex learning.",
    doc3_file: "Deep Learning Guide: Deep neural networks enable complex learning. Backpropagation is used for training.",
}

for sample_path, sample_text in sample_texts.items():
    with open(sample_path, 'w') as handle:
        handle.write(sample_text)

# Ingest each sample file lazily and keep only the truthy results.
ingest_results = (
    file_ingestor.ingest_file(sample_path, read_content=True)
    for sample_path in (doc1_file, doc2_file, doc3_file)
)
file_objects = [ingested for ingested in ingest_results if ingested]


def _to_parsed_record(source):
    """Extract the text of one ingested file and bundle it with its name and metadata."""
    text = document_parser.extract_text(source.path)
    return {
        "file": source.name,
        "content": text,
        "metadata": source.metadata
    }


# One record per successfully ingested file, in ingestion order.
parsed_documents = [_to_parsed_record(source) for source in file_objects]

# Accumulators for the knowledge graph:
#   all_entities      - one dict per Document node and per distinct concept
#   all_relationships - "mentions" edges (document -> concept) plus the
#                       extracted concept -> concept relations
#   entity_map        - entity surface text -> generated concept id, shared
#                       across all documents so repeated concepts reuse a node
all_entities = []
all_relationships = []
entity_map = {}

for i, doc in enumerate(parsed_documents, 1):
    doc_id = f"doc{i}"
    # "ai_intro.txt" -> "Ai Intro"
    doc_name = doc["file"].replace(".txt", "").replace("_", " ").title()

    all_entities.append({
        "id": doc_id,
        "type": "Document",
        "name": doc_name,
        "properties": {"content": doc["content"][:100]}
    })

    extracted_entities = ner_extractor.extract(doc["content"])
    extracted_relations = relation_extractor.extract(doc["content"], extracted_entities)

    # Concept ids already linked to THIS document, so an entity that occurs
    # twice in one document yields a single "mentions" edge.
    mentioned_ids = set()

    # Only the first five extracted entities per document are considered.
    for entity in extracted_entities[:5]:
        entity_text = entity.get("text", entity.get("entity", ""))
        if not entity_text:
            continue

        entity_id = entity_map.get(entity_text)
        if entity_id is None:
            # First time this concept is seen anywhere: create its node.
            entity_id = f"concept_{len(entity_map) + 1}"
            entity_map[entity_text] = entity_id
            all_entities.append({
                "id": entity_id,
                "type": entity.get("type", "Concept"),
                "name": entity_text,
                "properties": {}
            })

        # Link the document to the concept even when the concept node was
        # created by an earlier document; otherwise a concept shared by
        # several documents would only be connected to the first one.
        if entity_id not in mentioned_ids:
            mentioned_ids.add(entity_id)
            all_relationships.append({
                "source": doc_id,
                "target": entity_id,
                "type": "mentions"
            })

    # Only the first three extracted relations per document are considered,
    # and only when both endpoints are known concepts.
    for rel in extracted_relations[:3]:
        source_text = rel.get("source", "")
        target_text = rel.get("target", "")
        if source_text in entity_map and target_text in entity_map:
            all_relationships.append({
                "source": entity_map[source_text],
                "target": entity_map[target_text],
                "type": rel.get("type", "related_to")
            })

# Hand the collected entity and relationship dicts to the graph builder.
knowledge_graph = builder.build(all_entities, all_relationships)

# Every entity that is not a Document node counts as an extracted concept.
concept_count = sum(1 for node in all_entities if node['type'] != 'Document')
entity_count = len(all_entities)
relationship_count = len(all_relationships)

print(f"Ingested {len(file_objects)} documents")
print(f"Extracted {concept_count} concepts")
print(f"Built knowledge graph with {entity_count} entities and {relationship_count} relationships")


## Step 2: Generate Embeddings


In [None]:
# Text of every parsed document, in the same order as parsed_documents.
documents = [doc["content"] for doc in parsed_documents]

generator = EmbeddingGenerator()
embeddings = generator.generate(documents)

# Use an explicit None/length check rather than truthiness: if generate()
# returns a NumPy array (TODO confirm the return type), `if embeddings`
# raises "truth value of an array is ambiguous" for more than one element.
# For plain lists this check is equivalent to the truthiness test.
has_embeddings = embeddings is not None and len(embeddings) > 0
embedding_dim = len(embeddings[0]) if has_embeddings else 0

print(f"Generated embeddings for {len(documents)} parsed documents")
print(f"Embedding dimension: {embedding_dim}")


## Step 3: Store in Vector Store


In [None]:
vector_store = VectorStore()

# Ids of the form "doc_1", "doc_2", ... (one per document).
# NOTE(review): this list is not passed to store_vectors below.
vector_ids = [f"doc_{position}" for position in range(1, len(documents) + 1)]

# Hand-written metadata for the three sample documents, in document order.
topic_and_kind = (
    ("AI", "introduction"),
    ("ML", "tutorial"),
    ("DL", "guide"),
)
metadata = [
    {"doc_id": f"doc{position}", "topic": topic, "type": kind}
    for position, (topic, kind) in enumerate(topic_and_kind, 1)
]

vector_ids_stored = vector_store.store_vectors(embeddings, metadata)

stored_count = len(vector_ids_stored)
print(f"Stored {stored_count} vectors in vector store")


## Step 4: Hybrid RAG


In [None]:
# NOTE(review): hybrid_search is instantiated here but not used in this cell.
hybrid_search = HybridSearch()
context_retriever = ContextRetriever(knowledge_graph=knowledge_graph, vector_store=vector_store)

query = "What is deep learning?"
# generate() takes a batch; embed the single query and take its only row.
query_embedding = generator.generate([query])[0]

# Vector side of the retrieval: the three nearest stored vectors.
vector_results = vector_store.search_vectors(query_embedding, k=3)

# Graph side of the retrieval, with expansion up to two hops.
graph_context_results = context_retriever.retrieve(
    query=query, max_results=5, use_graph_expansion=True, max_hops=2
)

# Flatten each retrieved item into a plain dict: its content, its "type"
# metadata, and the names (falling back to ids) of up to three related entities.
graph_context = [
    {
        "entity": hit.content,
        "type": hit.metadata.get("type", "unknown"),
        "related": [
            neighbor.get("name", neighbor.get("id"))
            for neighbor in hit.related_entities[:3]
        ]
    }
    for hit in graph_context_results
]

vector_hit_count = len(vector_results)
graph_hit_count = len(graph_context)
print(f"Retrieved {vector_hit_count} vector search results")
print(f"Found {graph_hit_count} relevant graph entities from ContextRetriever")


In [None]:
def format_context(vector_results, graph_context):
    """Render vector-search hits and graph context as one plain-text block.

    Args:
        vector_results: Sequence of dicts; each may carry "id", "score" and
            "metadata" (a dict that may carry "topic"). Only the first three
            entries are rendered. Missing or ``None`` values fall back to
            "unknown", 0 and "N/A" respectively.
        graph_context: Sequence of dicts with "entity", "type" and "related"
            keys, where "related" is a list of related-entity names. ``None``
            names are skipped and non-string names are converted with
            ``str`` so they cannot break the join.

    Returns:
        A newline-joined string that starts with a "Retrieved Documents:"
        section and, when ``graph_context`` is non-empty, continues with a
        "Knowledge Graph Context:" section.
    """
    context_parts = ["Retrieved Documents:"]

    for i, result in enumerate(vector_results[:3], 1):
        doc_id = result.get("id", "unknown")
        # `or` also covers an explicit None, which would otherwise crash
        # the ".3f" format spec / the nested .get() below.
        score = result.get("score") or 0
        meta = result.get("metadata") or {}
        context_parts.append(
            f"{i}. Document {doc_id} (relevance: {score:.3f}, topic: {meta.get('topic', 'N/A')})"
        )

    if graph_context:
        # Leading newline leaves a blank line between the two sections.
        context_parts.append("\nKnowledge Graph Context:")
        for ctx in graph_context:
            context_parts.append(f"- {ctx['entity']} ({ctx['type']})")
            # Related names may be None or non-string ids (the caller falls
            # back from "name" to "id"); str.join only accepts strings.
            related = [str(name) for name in (ctx.get('related') or []) if name is not None]
            if related:
                context_parts.append(f"  Related: {', '.join(related)}")

    return "\n".join(context_parts)

def generate_response(query, context):
    """Fill a fixed answer template with the query and the formatted context.

    No model is called: the returned text is the query, the context block,
    and a canned "Response:" sentence that embeds the lower-cased query.
    The result starts and ends with a newline.
    """
    template_lines = [
        "",
        f"Query: {query}",
        "",
        "Context:",
        f"{context}",
        "",
        f"Response: Based on the retrieved context, {query.lower()} is a topic covered in the knowledge base. ",
        "The relevant documents and graph entities provide comprehensive information about this subject.",
        "",
    ]
    return "\n".join(template_lines)

# Render the retrieval output as text, then wrap it in the templated answer.
context = format_context(vector_results, graph_context)
response = generate_response(query, context)

# Header and response on consecutive lines, followed by the result count.
print("Generated Response:", response, sep="\n")
print(f"\nUsed {len(vector_results)} graph-enhanced results")


## Summary

Next-generation RAG workflow:
- Knowledge graph built
- Embeddings generated
- Vectors stored
- Hybrid RAG implemented
- LLM integration ready (the response step above is a fixed template; replace `generate_response` with a real LLM call)
