## Graph based Chunking
## Data Ingestion

In [15]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.schema import Document

# Step 1: Load PDF
loader = PyMuPDFLoader("data/RAMAYANA.pdf")
documents = loader.load()




## Data Preparation

In [16]:
# Example: define chapters by page ranges (you can adjust this)
chapter_map = {
    "Introduction".upper(): range(0, 2),
    "THE BIRTH OF RAMA".upper(): range(2, 3),
    "The Valiant Princes".upper(): range(3, 6),
    "SITA'S SWAYAMVAR".upper(): range(6, 8),
    "KAIKEYI AND HER WISHES".upper(): range(7, 21),
    "The demons in the forests".upper(): range(21, 24),
    "The Kidnapping of Sita".upper(): range(24, 27),
    "Rama searches for Sita".upper(): range(27, 29),
    "The land of the monkeys".upper(): range(29, 33),
    "Hanuman meets Sita - Lanka is destroyed".upper(): range(33, 37),
    "The War".upper(): range(37, 46),  
}

# Assign chapter metadata
tagged_documents = []
for i, doc in enumerate(documents):
    # print(f"Processing page {i + 1} of {doc.page_content[0:50]}")
    for chapter, pages in chapter_map.items():
        pages = list(pages)
        # print("Pages:",pages)
        if i in pages:
            chapter_name = chapter
            break
        else:
            chapter_name = "Unknown Chapter"
    
    # print(f"Chapter: {chapter_name} for page {i + 1}")
    new_doc = Document(page_content=doc.page_content, metadata={"chapter": chapter_name, **doc.metadata})
    tagged_documents.append(new_doc)

In [17]:
chapter_map = {}
for doc in tagged_documents:
    page_number = doc.metadata.get('page', 'N/A')
    title = doc.metadata.get('chapter', 'Unknown Chapter')
    page_content = doc.page_content
    # print(f"📄 Page {page_number} - Chapter: {title}")
    # print(f"{page_content[:50]}...\n")
    if title in chapter_map:
        chapter_map[title] = chapter_map[title] + "\n" + page_content
    else:
        chapter_map[title] = page_content

In [21]:
# Step 2: Create a map of chapters with their content
for title, content in chapter_map.items():
    print(f"📖 Chapter: {title[:30]}")
    # print(f"{content[:20]}...\n")

📖 Chapter: INTRODUCTION
📖 Chapter: THE BIRTH OF RAMA
📖 Chapter: THE VALIANT PRINCES
📖 Chapter: SITA'S SWAYAMVAR
📖 Chapter: KAIKEYI AND HER WISHES
📖 Chapter: THE DEMONS IN THE FORESTS
📖 Chapter: THE KIDNAPPING OF SITA
📖 Chapter: RAMA SEARCHES FOR SITA
📖 Chapter: THE LAND OF THE MONKEYS
📖 Chapter: HANUMAN MEETS SITA - LANKA IS 
📖 Chapter: THE WAR


# Paragraph chunking
# Connecting entities and paragraphs

In [105]:
import networkx as nx
import re
from collections import defaultdict
import spacy

# Load spaCy NER model (you can use 'en_core_web_sm' or a larger model)
nlp = spacy.load("en_core_web_sm")

# Initialize graph
G = nx.Graph()

# Helper: split into paragraphs
def split_paragraphs(text):
    return re.split(r'\n\s*\n|(?<=[.!?])\s+(?=[A-Z])', text.strip())

# Entity co-occurrence mapping
edge_chunks = defaultdict(list)

# Build Graph: Entity co-occurrence
for chapter, content in chapter_map.items():
    for para in split_paragraphs(content):
        doc = nlp(para)
        entities = list(set(ent.text.strip() for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE"]))
        # if len(entities)==0:
        #     print(f"⚠️ No entities found in paragraph: {para[:10]}...")
        for i in range(len(entities)):
            for j in range(i+1, len(entities)):
                ent1, ent2 = sorted([entities[i], entities[j]])
                G.add_node(ent1)
                G.add_node(ent2)
                if G.has_edge(ent1, ent2):
                    G[ent1][ent2]["content"].append(para)
                    G[ent1][ent2]["chapters"].add(chapter)
                else:
                    G.add_edge(ent1, ent2, content=[para], chapters={chapter})

# Remove isolated nodes (optional)
isolated = list(nx.isolates(G))
G.remove_nodes_from(isolated)

# Show nodes and edges
print("📌 Nodes:")
nodes= G.nodes(data=True)
print(''.join([f"{item[0]}{'\n' if (i + 1) % 10 == 0 and (i + 1) != len(nodes) else ', '}" for i, item in enumerate(nodes)]))

print("\n🔗 Edges and shared chunks:")
for u, v, data in G.edges(data=True):
    print(f"{u} ↔ {v}")
    for para in data["content"]:
        print(f"  🧩 Chunk: {para[:100]}...")


📌 Nodes:
Kosala Country, Sarayu, Kaushalya, Sumitra, King Dasahratha, Kaikeyi, Agni, Bharata, Shatrugna, Lakshmana
Rama, Dasharatha, Bharatha, Vishwamitra, Subahu, Yagna, Mareecha, Vashishta, Rakshasas, Ganga
Maithili, Janaka, Siva, Ashram, Gautama, Mithila, Indra, Sri 
Rama, Sita, Lanka
Ravana, Ayodhya, Rishi, Sri Rama, King Dasharatha, Mondovi, Brahmins, Kling Janaka, Dasahratha, Parushurama
Kshatriyas, Brahmin, Ramas, Manthara, Kaika, Kaushalaya, Asuras, Dashratha, Sumnathara, Please
Sumantra, Sumanthara, Utterly, Kosala, Guha, Welcoming Rama, Chitrakut, Yamuna, Parnakuti, Chitrkut
Vashistha, Enraged, kingdom, Surpanaka, Khara, Dushana, Dandakaranya, Maricha, Panchavati, Viman
Rakshasa, Sanyasini, Jatayu, Rakshasis, Kabandha, Pampa, Rishyamukha, Varanas*, Hanuman, Vanara
Sugriva, Vali, Kishkinda, Rishyamuka, Sugriva’s, Vanaras, Mother Sita, Gunuman, Angada, Jatayu’s
Sampati, Vayu, Sita- Lanka, Suras, Surasa, Devas, Lankini, Mandodari, Ashoka, Rakshasi
Trijata, Ravenna, Indrajit, Dev

## Path Based Retrieval

In [None]:
query = "What are the whishes of Kaikeyi to king Dasahratha ?"

print(f"\n🔍 Query: {query}")
doc = nlp(query)
query_entities = list(set(ent.text.strip() for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE"]))
print("Entities:",query_entities)
connected_nodes = []

def entity_match(entity_list, query_entities):
    count = 0
    for entity in query_entities:
        if entity in entity_list:
            count += 1
    return count == len(query_entities)

# Search nodes containing Kaikeyi exactly
for u, v, data in G.edges(data=True):
    
    if entity_match([u,v], query_entities):
        # print(f"Found exact match for {query_entities} in edge: {u} ↔ {v}")
        connected_nodes.append([[u,[v, data["chapters"], data["content"]]]])
        # connected_nodes.extend()
        # print(f"Checking node: {node} with entities: {data['entities']}")

# Deduplicate and print
# unique_nodes = sorted(set(connected_nodes))
print(f"Found {len(connected_nodes)} connected nodes for the query.")
for node in connected_nodes:
    # print(node[0][0], "is connected to:")
    for neighbor, chapter, content in node[0][1:]:
        # print(f"  - {neighbor}, content: {content[0]}...")
        print(f"  - {content[0]}...")




🔍 Query: What are the whishes of Kaikeyi to king Dasahratha ?
Entities: ['Dasahratha', 'Kaikeyi']
Found 1 connected nodes for the query.
  - Rama wiped his father’s tears softly, “Father, I am duty bound to 
fulfill your wish, Please bless us.” 
Dasahratha realized that he could not make Kaikeyi change her mind 
nor could he make Rama go back on what he had said....


## Path based retrieval - approach-2

In [87]:
import networkx as nx

def path_based_retrieval(G, source_entity, target_entity, max_hops=4):
    if not G.has_node(source_entity) or not G.has_node(target_entity):
        return f"One or both entities ('{source_entity}', '{target_entity}') not found in the graph."

    try:
        # Find shortest path
        paths = [nx.shortest_path(G, source=source_entity, target=target_entity)]
        # paths = nx.all_simple_paths(G, source=source_entity, target=target_entity, cutoff=max_hops)

        # print(paths)
        # Collect all paragraphs (chunks) from edges in path
        chunks = []
        for path in paths:
            # print(f"🔗 Path found: {' → '.join(path)}")
            for i in range(len(path)-1):
                u, v = path[i], path[i+1]
                edge_data = G.get_edge_data(u, v)
                if edge_data:
                    chunks.extend(edge_data["content"])  # or use set() to deduplicate
                    # print(edge_data["content"])

        # Deduplicate and return
        unique_chunks = list(set(chunks))
        return unique_chunks

    except nx.NetworkXNoPath:
        return f"No path found between '{source_entity}' and '{target_entity}' within {max_hops} hops."

# Example usage
chunks = path_based_retrieval(G, source_entity="Hanuman", target_entity="Sita")
for i, chunk in enumerate(chunks, 1):
    print(f"\n📄 Chunk {i}:\n{chunk}")



📄 Chunk 1:
Hanuman who was watching Sita said softly “Rama, Rama,” Sita 
stopped and looked up.


In [22]:
import networkx as nx
import spacy
from collections import defaultdict

# Step 1: Extract entities and create initial graph
G = nx.Graph()
chunk_map = defaultdict(list)

for i, (chapter, page) in enumerate(chapter_map.items()):
    text = page.strip()
    doc_nlp = nlp(text)
    entities = [ent.text for ent in doc_nlp.ents if ent.label_ in {"PERSON", "GPE", "ORG"}]

    # Add to graph
    for ent in entities:
        G.add_node(ent, type="entity")

    for j in range(len(entities)-1):
        G.add_edge(entities[j], entities[j+1], source_page=i)

    # Save paragraph to each node it's linked to
    for ent in set(entities):
        chunk_map[ent].append((i, text))

In [30]:
print(f"Total entities found: {len(G.nodes)}")
print(f"Total chunks generated: {len(chunk_map)}")

for node in G.nodes:
    # print(f"Node: {node}, Type: {G.nodes[node]['type']}, Chunks: {chunk_map[node]}...")  # Show first two chunks for brevity
    for chunk in chunk_map[node][:2]:
        print(f"Node: {node}, Chunk Size: {len(chunk[1])}\nChunk: {chunk[1][:50]}...")
    # print(f"Chunks Size: {len(chunk_map[node])}...")  # Show first two chunks for brevity


Total entities found: 155
Total chunks generated: 155
Node: Visalakshi Gopalan, Chunk Size: 1502
Chunk: SRI RAMA JAYAM 
RAMAYANA FOR CHILDREN 
Compiled by...
Node: Sita- Lanka, Chunk Size: 1502
Chunk: SRI RAMA JAYAM 
RAMAYANA FOR CHILDREN 
Compiled by...
Node: Sita- Lanka, Chunk Size: 8421
Chunk: 33 
 
1.9 Hanuman meets Sita- Lanka is destroyed 
...
Node: Sarayu, Chunk Size: 1910
Chunk: 2 
 
1 RAMAYANA FOR CHILDREN 
 
1.1 THE BIRTH OF R...
Node: Sarayu, Chunk Size: 6824
Chunk: 3 
 
1.2 The Valiant Princes 
 
The four princes g...
Node: Kosala Country, Chunk Size: 1910
Chunk: 2 
 
1 RAMAYANA FOR CHILDREN 
 
1.1 THE BIRTH OF R...
Node: Dasharatha, Chunk Size: 1910
Chunk: 2 
 
1 RAMAYANA FOR CHILDREN 
 
1.1 THE BIRTH OF R...
Node: Dasharatha, Chunk Size: 6824
Chunk: 3 
 
1.2 The Valiant Princes 
 
The four princes g...
Node: Kaushalya, Chunk Size: 1910
Chunk: 2 
 
1 RAMAYANA FOR CHILDREN 
 
1.1 THE BIRTH OF R...
Node: Kaushalya, Chunk Size: 26503
Chunk: 8 
 
invited to the city to provide

In [34]:
# Step 2: Generate Graph-based Chunks
chunks = []
visited = set()

for entity, refs in chunk_map.items():
    print(f"Processing entity: {entity} with {len(refs)} references, {refs[:2]}...")  # Show first two references for brevity
    if entity not in visited:
        neighbors = nx.node_connected_component(G, entity)
        visited.update(neighbors)

        related_texts = []
        for neighbor in neighbors:
            related_texts.extend([text for _, text in chunk_map.get(neighbor, [])])
        
        combined_chunk = "\n".join(set(related_texts))
        chunks.append(combined_chunk)

Processing entity: Visalakshi Gopalan with 1 references, [(0, "SRI RAMA JAYAM \nRAMAYANA FOR CHILDREN \nCompiled by  \n \nVisalakshi Gopalan \n14-Apr-13 \n \n \n \nFor children’s reading \n\n1 \n \nContents \n \n1 RAMAYANA FOR CHILDREN ............................................................................................... 2 \n1.1 THE BIRTH OF RAMA ..................................................................................................... 2 \n1.2 The Valiant Princes ........................................................................................................ 3 \n1.3 SITA'S SWAYAMVAR ..................................................................................................... 5 \n1.4 KAIKEYI AND HER WISHES ....................................................................................... 7 \n1.5 The demons in the forests ........................................................................................ 20 \n1.6 The Kidnapping of Sita ......

In [32]:
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\nChunk Size: {len(chunk)} Characters\n{chunk[0:50]}\n")
    
print(f"Total entities found: {len(G.nodes)}")
print(f"Total chunks generated: {len(chunks)}")

Chunk 1:
Chunk Size: 89212 Characters
29 
 
against a demon and chased him into a cave, 

Total entities found: 155
Total chunks generated: 1
