# Graph RAG
## 1. Data Ingestion and Chunking

In [3]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.schema import Document

# Step 1: Load PDF
loader = PyMuPDFLoader("data/RAMAYANA.pdf")
documents = loader.load()




In [42]:
# Example: define chapters by page ranges (you can adjust this)
chapter_map = {
    "Introduction".upper(): range(0, 2),
    "THE BIRTH OF RAMA".upper(): range(2, 3),
    "The Valiant Princes".upper(): range(3, 6),
    "SITA'S SWAYAMVAR".upper(): range(6, 8),
    "KAIKEYI AND HER WISHES".upper(): range(7, 21),
    "The demons in the forests".upper(): range(21, 24),
    "The Kidnapping of Sita".upper(): range(24, 27),
    "Rama searches for Sita".upper(): range(27, 29),
    "The land of the monkeys".upper(): range(29, 33),
    "Hanuman meets Sita - Lanka is destroyed".upper(): range(33, 37),
    "The War".upper(): range(37, 46),  
}

# Assign chapter metadata
tagged_documents = []
for i, doc in enumerate(documents):
    # print(f"Processing page {i + 1} of {doc.page_content[0:50]}")
    for chapter, pages in chapter_map.items():
        pages = list(pages)
        # print("Pages:",pages)
        if i in pages:
            chapter_name = chapter
            break
        else:
            chapter_name = "Unknown Chapter"
    
    # print(f"Chapter: {chapter_name} for page {i + 1}")
    new_doc = Document(page_content=doc.page_content, metadata={"chapter": chapter_name, **doc.metadata})
    tagged_documents.append(new_doc)

In [None]:
chapter_map = {}
for doc in tagged_documents:
    page_number = doc.metadata.get('page', 'N/A')
    title = doc.metadata.get('chapter', 'Unknown Chapter')
    page_content = doc.page_content
    # print(f"📄 Page {page_number} - Chapter: {title}")
    # print(f"{page_content[:50]}...\n")
    if title in chapter_map:
        chapter_map[title] = chapter_map[title] + "\n" + page_content
    else:
        chapter_map[title] = page_content

In [None]:
# Step 2: Create a map of chapters with their content
for title, content in chapter_map.items():
    print(f"📖 Chapter: {title[:30]}")
    print(f"{content[:20]}...\n")

📖 Chapter: INTRODUCTION
SRI RAMA JAYAM 
RAMA...

📖 Chapter: THE BIRTH OF RAMA
2 
 
1 RAMAYANA FOR ...

📖 Chapter: THE VALIANT PRINCES
3 
 
1.2 The Valiant...

📖 Chapter: SITA'S SWAYAMVAR
6 
 
for Rama, Laksh...

📖 Chapter: KAIKEYI AND HER WISHES
8 
 
invited to the ...

📖 Chapter: THE DEMONS IN THE FORESTS
21 
 
A thatched hut...

📖 Chapter: THE KIDNAPPING OF SITA
24 
 
When Maricha’s...

📖 Chapter: RAMA SEARCHES FOR SITA
27 
 
hurried back t...

📖 Chapter: THE LAND OF THE MONKEYS
29 
 
against a demo...

📖 Chapter: HANUMAN MEETS SITA - LANKA IS 
33 
 
1.9 Hanuman me...

📖 Chapter: THE WAR
37 
 
among us who i...



## Perform Entity Linking

In [62]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Extract named entities per chapter
entity_map = {}
for title, content in chapter_map.items():
    doc = nlp(content)
    entities = set(ent.text for ent in doc.ents if ent.label_ in ["PERSON", "GPE"])
    entity_map[title] = list(entities)

# Print linked entities
for title, ents in entity_map.items():
    print(f"{title}: {ents}")

INTRODUCTION: ['Visalakshi Gopalan', 'Sita- Lanka']
THE BIRTH OF RAMA: ['Agni', 'Kaushalya', 'Shatrugna', 'Sarayu', 'Kaikeyi', 'Dasharatha', 'Dasahratha', 'Lakshmana', 'Rama']
THE VALIANT PRINCES: ['Sri \nRama', 'Lanka', 'Shatrugna', 'Gautama', 'Vishwamitra', 'Rakshasas', 'Indra', 'Rakshass', 'Subahu', 'Dasharatha', 'Vashistha', 'Rama', 'Sita', 'Bharatha', 'Lakshmana', 'Maithili', 'Yagna', 'Mareecha', 'Gurus', 'Sarayu', 'Ganga', 'Vashishta', 'Taraka', 'Ashram', 'Janaka']
SITA'S SWAYAMVAR: ['Shatrugna', 'Ram', 'Brahmin', 'Sri Rama', 'Dasharatha', 'Ramas', 'Rama', 'Ayodhya', 'Parushurama', 'Rishi', 'Mondovi', 'Kshatriyas', 'Kling Janaka', 'Brahmins', 'Lakshmana', 'Bharata', 'Mithila', 'Vashishta', 'Janaka']
KAIKEYI AND HER WISHES: ['Parnakuti', 'Kosala', 'Kaushalya', 'Vedas', 'Shatrugna', 'Dasahratha', 'Sumnathara', 'Sri Rama', 'Sumanthara', 'Sumitra', 'Dasharatha', 'Vashistha', 'Kaika', 'Bharadwaj', 'Chitrkut', 'Rama', 'Yamuna', 'Asuras', 'Rama Spoke', 'Ayodhya', 'Dandakaranya', 'Mantha

In [68]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
!pip install pyvis


Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-4.1.1-py3-none-any.whl.metadata (8.1 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jsonpickle-4.1.1-py3-none-any.whl (47 kB)
Installing collected packages: jsonpickle, pyvis
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pyvis]32m1/2[0m [pyvis]
[1A[2KSuccessfully installed jsonpickle-4.1.1 pyvis-0.3.2


## Construct the Knowledge Graph

In [63]:
import networkx as nx

# Create graph
G = nx.Graph()

# Add nodes
for title in chapter_map.keys():
    G.add_node(title, content=chapter_map[title], entities=entity_map[title])

# Add edges if chapters share common entities
titles = list(chapter_map.keys())
for i in range(len(titles)):
    for j in range(i+1, len(titles)):
        common = set(entity_map[titles[i]]) & set(entity_map[titles[j]])
        if common:
            G.add_edge(titles[i], titles[j], shared_entities=list(common))

# Print graph structure
print("Graph edges:")
for edge in G.edges(data=True):
    print(edge)


Graph edges:
('INTRODUCTION', 'HANUMAN MEETS SITA - LANKA IS DESTROYED', {'shared_entities': ['Sita- Lanka']})
('THE BIRTH OF RAMA', 'THE VALIANT PRINCES', {'shared_entities': ['Shatrugna', 'Sarayu', 'Dasharatha', 'Lakshmana', 'Rama']})
('THE BIRTH OF RAMA', "SITA'S SWAYAMVAR", {'shared_entities': ['Lakshmana', 'Rama', 'Shatrugna', 'Dasharatha']})
('THE BIRTH OF RAMA', 'KAIKEYI AND HER WISHES', {'shared_entities': ['Kaushalya', 'Shatrugna', 'Dasharatha', 'Kaikeyi', 'Dasahratha', 'Lakshmana', 'Rama']})
('THE BIRTH OF RAMA', 'THE DEMONS IN THE FORESTS', {'shared_entities': ['Lakshmana', 'Rama']})
('THE BIRTH OF RAMA', 'THE KIDNAPPING OF SITA', {'shared_entities': ['Lakshmana', 'Rama']})
('THE BIRTH OF RAMA', 'RAMA SEARCHES FOR SITA', {'shared_entities': ['Lakshmana', 'Rama']})
('THE BIRTH OF RAMA', 'THE LAND OF THE MONKEYS', {'shared_entities': ['Lakshmana', 'Rama']})
('THE BIRTH OF RAMA', 'HANUMAN MEETS SITA - LANKA IS DESTROYED', {'shared_entities': ['Kaushalya', 'Rama', 'Shatrugna']})

In [74]:
from pyvis.network import Network
import networkx as nx

G_nx = G.copy()
net = Network(height="900px", width="1000px", notebook=True, directed=False)

# Enable physics and set options for better spacing
net.set_options("""
var options = {
  "nodes": {
    "font": {
      "size": 24
    },
    "size": 30
  },
  "edges": {
    "font": {
      "size": 18,
      "align": "middle"
    },
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "physics": {
    "enabled": true,
    "barnesHut": {
      "gravitationalConstant": -30000,
      "centralGravity": 0.3,
      "springLength": 250,
      "springConstant": 0.04,
      "damping": 0.09,
      "avoidOverlap": 1
    },
    "minVelocity": 0.75
  }
}
""")

for node in G_nx.nodes():
    net.add_node(node, label=node)

for u, v, data in G_nx.edges(data=True):
    label = ', '.join(data.get('shared_entities', [])[:2])
    if len(data.get('shared_entities', [])) > 2:
        label += '...'
    net.add_edge(u, v, title=label, label=label)

net.show("graph_rag_ramayana.html")
print("Graph saved as graph_rag_ramayana.html")

graph_rag_ramayana.html
Graph saved as graph_rag_ramayana.html


## Query-Time Path-Based Retrieval

In [83]:
from difflib import get_close_matches

query_entity = "Kaikeyi"
connected_nodes = []

def entity_match(entity_list, query):
    return any(
        query.lower() == ent.lower() or query.lower() in ent.lower().split()
        for ent in entity_list
    )

# Search nodes containing Kaikeyi exactly
for node, data in G.nodes(data=True):
    
    if entity_match(data["entities"], query_entity):
        connected_nodes.append(node)
        # connected_nodes.extend(nx.neighbors(G, node))
        # print(f"Checking node: {node} with entities: {data['entities']}")

# Deduplicate and print
unique_nodes = sorted(set(connected_nodes))
for node in unique_nodes:
    content_preview = G.nodes[node]['content'][:300].replace('\n', ' ')
    print(f"🔗 Node: {node}\n{content_preview}...\n")


🔗 Node: KAIKEYI AND HER WISHES
8    invited to the city to provide entertainment to the people. The chiefs  of the city were instructed to decorate it on a very grand scale. Even  while these preparations were going on. Dasahratha had a talk with  Rama and advised him about how king should conduct himself. "A  king should never f...

🔗 Node: THE BIRTH OF RAMA
2    1 RAMAYANA FOR CHILDREN    1.1 THE BIRTH OF RAMA    Ayodhya was a magnificent city on the banks of the river Sarayu in  Kosala Country .It had wide roads, huge buildings, beautiful parks  and glittering shops. The people of the city lived a happy and  contented life as they were ruled by a wond...

🔗 Node: THE WAR
37    among us who is capable of flying across the ocean. So, Please tell  us how we can get there.” Rama also asked his trusted friend about  Lanka’s city plan, about its main gates, about trenches built around  the fort and many more such information to plan the attack: Though  Hanuman had burnt d...



## RAG Answer Generation

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

# Load OpenAI API key from a file
file = open('data/key.txt', 'r')
# Read the API key from the file
api_key = file.read().strip()
# Close the file
file.close()
# 🧠 Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = api_key

llm = ChatOpenAI(temperature=0)

# Concatenate context
context = "\n".join(G.nodes[node]["content"] for node in unique_nodes)
print("Number of nodes in context:", len(unique_nodes))
print("Number of words in context:", len(context.split()))
print("-------------------------------------------------------------")
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="Answer the question based on the context below:\n\n{context}\n\nQuestion: {question}\nAnswer:"
)

# Query
query = "What role did Kaikeyi play in Rama's exile?"
final_prompt = prompt.format(context=context, question=query)
response = llm.invoke(final_prompt)
print("🧠 Answer:\n", response.content)


Number of nodes in context: 3
Number of words in context: 7707
-------------------------------------------------------------
🧠 Answer:
 Kaikeyi played a negative role in Rama's exile. She was manipulated by her maid, Manthara, to demand that Rama be banished to the forest for 14 years and that Bharatha be crowned king instead. This led to Rama, Sita, and Lakshmana going into exile in the forest.


In [91]:
print("🧠 Answer:\n", '\n'.join([' '.join(response.content.split()[i:i + 7]) for i in range(0, len(response.content.split()), 7)]))

🧠 Answer:
 Kaikeyi played a negative role in Rama's
exile. She was manipulated by her maid,
Manthara, to demand that Rama be banished
to the forest for 14 years and
that Bharatha be crowned king instead. This
led to Rama, Sita, and Lakshmana going
into exile in the forest.
