### Check for isolated nodes

In [1]:
import sys
from pathlib import Path

# Add project root to path
PROJECT_ROOT = Path.cwd().parent.parent  # Go up 2 levels from KnowledgeGraph/NodeRag/
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Add vendor NodeRAG to path for PPR import
VENDOR_NODERAG = PROJECT_ROOT / "vendor" / "NodeRAG"
if str(VENDOR_NODERAG) not in sys.path:
    sys.path.insert(0, str(VENDOR_NODERAG))

import networkx as nx
from KnowledgeGraph.NodeRag import load_cached_outputs

# Load the cached NodeRAG outputs; `graph` is the graph inspected by every cell below.
outputs = load_cached_outputs()
graph = outputs.graph
total_nodes = graph.number_of_nodes()

# Find all isolated nodes (degree 0)
isolated_nodes = [node for node in graph.nodes() if graph.degree(node) == 0]
# Guard the ratio so an empty graph reports 0.0% instead of raising ZeroDivisionError.
isolation_pct = len(isolated_nodes) / total_nodes * 100 if total_nodes else 0.0
print("=== Isolated Nodes Analysis ===")
print(f"Total nodes: {total_nodes}")
print(f"Isolated nodes (degree 0): {len(isolated_nodes)}")
print(f"Isolation rate: {isolation_pct:.1f}%")

# Break down isolated nodes by their 'type' attribute ('unknown' when the attribute is missing)
isolated_by_type = {}
for node_id in isolated_nodes:
    node_type = graph.nodes[node_id].get('type', 'unknown')
    isolated_by_type[node_type] = isolated_by_type.get(node_type, 0) + 1

print("\nIsolated nodes by type:")
# Most frequent type first.
for node_type, count in sorted(isolated_by_type.items(), key=lambda item: item[1], reverse=True):
    print(f"  {node_type}: {count}")

# Sample some isolated nodes to see what they are
print("\n=== Sample Isolated Nodes ===")
for rank, node_id in enumerate(isolated_nodes[:5], start=1):
    data = graph.nodes[node_id]
    node_type = data.get('type', 'unknown')
    # str() before slicing: a context (or node id) that is not a string would
    # otherwise make the slice raise; the neighbor listing cell does the same.
    context = str(data.get('context', 'N/A'))[:150]
    print(f"\n{rank}. Type: {node_type}")
    print(f"   ID: {str(node_id)[:30]}...")
    print(f"   Content: {context}...")

# Check connectivity components
# NOTE(review): nx.connected_components is documented for undirected graphs only —
# this assumes `graph` is undirected; confirm against load_cached_outputs().
components = list(nx.connected_components(graph))
component_sizes = [len(comp) for comp in components]

# Bucket the component sizes in a single pass.
singleton_count = small_count = large_count = 0
for size in component_sizes:
    if size == 1:
        singleton_count += 1
    elif size <= 5:
        small_count += 1
    else:
        large_count += 1

print("\n=== Graph Connectivity ===")
print(f"Number of connected components: {len(components)}")
# default=0 keeps the report working on an empty graph, where max()/min()
# of an empty list would raise ValueError.
print(f"Largest component: {max(component_sizes, default=0)} nodes")
print(f"Smallest component: {min(component_sizes, default=0)} nodes")
print(f"Components with only 1 node: {singleton_count}")
print(f"Components with 2-5 nodes: {small_count}")
print(f"Components with >5 nodes: {large_count}")

# Check what appears isolated in the visualization subgraph (like the HTML viz does)
print("\n=== Visualization Subgraph Analysis ===")
print("(This simulates what you see in the HTML visualization)")
# Imported here because it relies on the vendor NodeRAG path added to sys.path above.
from NodeRAG.utils.PPR import sparse_PPR

# NOTE(review): presumably PR() returns (node, score) pairs ordered by descending
# score, so the slice below keeps the highest-ranked nodes — confirm in NodeRAG.
page_rank = sparse_PPR(graph).PR()
top_n = 200  # Same as your -n 200 visualization
top_nodes = [node for node, _ in page_rank[:top_n]]
viz_subgraph = graph.subgraph(top_nodes).copy()

# Degree is measured inside the subgraph: a node whose neighbors all fall outside
# the top-N set shows up here even if it is well connected in the full graph.
viz_isolated = [node for node in viz_subgraph.nodes() if viz_subgraph.degree(node) == 0]
# nx.is_connected raises on a graph with no nodes, so report False in that case.
viz_connected = viz_subgraph.number_of_nodes() > 0 and nx.is_connected(viz_subgraph)
print(f"Top {top_n} nodes by PageRank:")
print(f"  Nodes in subgraph: {viz_subgraph.number_of_nodes()}")
print(f"  Edges in subgraph: {viz_subgraph.number_of_edges()}")
print(f"  Nodes that appear isolated in subgraph: {len(viz_isolated)}")
print(f"  Subgraph is connected: {viz_connected}")

if viz_isolated:
    print("\n  Sample nodes that appear isolated in visualization:")
    for rank, node_id in enumerate(viz_isolated[:5], start=1):
        attrs = graph.nodes[node_id]
        node_type = attrs.get('type', 'unknown')
        degree_in_full = graph.degree(node_id)
        # str() before slicing so a non-string context cannot break the preview.
        context_preview = str(attrs.get('context', 'N/A'))[:80]
        print(f"    {rank}. {node_type} (degree in full graph: {degree_in_full})")
        print(f"       Context: {context_preview}...")

Config file already exists at /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/.noderag/Node_config.yaml
Config file already exists at /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/.noderag/Node_config.yaml
=== Isolated Nodes Analysis ===
Total nodes: 8361
Isolated nodes (degree 0): 0
Isolation rate: 0.0%

Isolated nodes by type:

=== Sample Isolated Nodes ===

=== Graph Connectivity ===
Number of connected components: 7
Largest component: 8072 nodes
Smallest component: 25 nodes
Components with only 1 node: 0
Components with 2-5 nodes: 0
Components with >5 nodes: 7

=== Visualization Subgraph Analysis ===
(This simulates what you see in the HTML visualization)
Top 200 nodes by PageRank:
  Nodes in subgraph: 200
  Edges in subgraph: 118
  Nodes that appear isolated in subgraph: 78
  Subgraph is connected: False

  Sample nodes that appear isolated in visualization:
    1. entity (degree in full graph: 13)
       Context: N/A...
    2. sem

### Investigate specific isolated nodes

In [2]:
# Pick a specific isolated node and investigate
# (prints nothing when the graph has no isolated nodes)
if isolated_nodes:
    sample_id = isolated_nodes[0]
    sample_attrs = graph.nodes[sample_id]
    print("\n=== Investigating Isolated Node ===")
    print(f"Node ID: {sample_id}")
    print(f"Type: {sample_attrs.get('type')}")
    # str() before slicing so a non-string context cannot raise; the neighbor
    # listing cell formats contexts the same way.
    print(f"Context: {str(sample_attrs.get('context', 'N/A'))[:300]}")

    # Look the node up by hash_id in each cached table and show the first matching row.
    for table_name in ("entities", "semantic_units"):
        table = getattr(outputs, table_name)
        if table.empty:
            continue
        matching_rows = table[table['hash_id'] == sample_id]
        if not matching_rows.empty:
            print(f"\nFound in {table_name} table:")
            print(matching_rows.iloc[0].to_dict())

### Health Check

In [3]:
def check_detached_nodes_health(
    graph,
    max_isolation_rate=0.3,
    max_semantic_unit_isolation=0.2,
    max_entity_isolation=0.5,
) -> bool:
    """Assess whether detached (degree-0) nodes are a problem and print a report.

    Args:
        graph: Graph exposing ``number_of_nodes()``, ``degree(node)`` and
            ``nodes(data=True)``; node attributes may carry a ``'type'`` key.
        max_isolation_rate: Largest acceptable fraction of isolated nodes
            over the whole graph.
        max_semantic_unit_isolation: Largest acceptable fraction of isolated
            nodes among nodes of type ``'semantic_unit'``.
        max_entity_isolation: Largest acceptable fraction of isolated nodes
            among nodes of type ``'entity'``.

    Returns:
        True when no threshold is exceeded, False otherwise. An empty graph
        is reported as healthy with a 0.0% isolation rate.
    """
    total_nodes = graph.number_of_nodes()

    # One pass over the nodes collects both the per-type totals and the
    # per-type isolated counts, instead of rescanning the graph per type.
    total_by_type = {}
    isolated_by_type = {}
    for node_id, attrs in graph.nodes(data=True):
        node_type = attrs.get('type', 'unknown')
        total_by_type[node_type] = total_by_type.get(node_type, 0) + 1
        if graph.degree(node_id) == 0:
            isolated_by_type[node_type] = isolated_by_type.get(node_type, 0) + 1

    isolated_count = sum(isolated_by_type.values())
    # Guard the division: an empty graph has nothing to be isolated.
    isolation_rate = isolated_count / total_nodes if total_nodes else 0.0

    issues = []

    if isolation_rate > max_isolation_rate:
        issues.append(
            f"High isolation rate: {isolation_rate*100:.1f}% "
            f"(>{max_isolation_rate:.0%} is concerning)"
        )

    # Semantic units should usually be connected; entities should often
    # connect to other entities, hence the separate per-type limits.
    per_type_checks = (
        ('semantic_unit', 'semantic units', max_semantic_unit_isolation),
        ('entity', 'entities', max_entity_isolation),
    )
    for node_type, label, limit in per_type_checks:
        isolated_of_type = isolated_by_type.get(node_type, 0)
        total_of_type = total_by_type.get(node_type, 0)
        # A non-zero isolated count implies a non-zero total for that type.
        if isolated_of_type and isolated_of_type / total_of_type > limit:
            issues.append(
                f"{isolated_of_type}/{total_of_type} {label} are isolated "
                f"(>{limit:.0%} is concerning)"
            )

    if not issues:
        print("Detached nodes look reasonable")
        print(f"   {isolated_count} isolated nodes ({isolation_rate*100:.1f}%)")
    else:
        print("Issues with detached nodes:")
        for issue in issues:
            print(f"  {issue}")

    return len(issues) == 0

# Run the health check on the loaded graph; the returned boolean is shown as the cell's output.
check_detached_nodes_health(graph)

Detached nodes look reasonable
   0 isolated nodes (0.0%)


True

### Investigate Specific Node That Appears Isolated with n = 200

In [4]:
from KnowledgeGraph.NodeRag import load_cached_outputs

# Reload the cached outputs so this cell can be run on its own.
outputs = load_cached_outputs()
graph = outputs.graph

snippet = "various individuals with their names in both English and Arabic"

# Find the semantic_unit row whose context contains that snippet.
# regex=False makes this a literal substring match, so a snippet containing
# characters such as "(" or "." is not interpreted as a regular expression;
# na=False treats rows with a missing context as non-matching.
mask = outputs.semantic_units["context"].str.contains(snippet, regex=False, na=False)
match = outputs.semantic_units[mask]

print("Matches:", len(match))
display(match.head(3))

# Fail with an explicit message rather than an opaque IndexError from iloc[0]
# when the snippet is not found.
if match.empty:
    raise LookupError(f"No semantic unit context contains the snippet: {snippet!r}")

# Take the first match's hash_id as the node id
sem_id = str(match.iloc[0]["hash_id"])
print("Semantic unit node id:", sem_id)

Config file already exists at /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/.noderag/Node_config.yaml
Config file already exists at /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/.noderag/Node_config.yaml
Matches: 1


Unnamed: 0,hash_id,human_readable_id,type,context,text_hash_id,weight,embedding,insert
10,56bf5399c57f5a3aa03aeda9bdd7c304ce54572e6d8aa4...,9404,semantic_unit,The text lists various individuals with their ...,2afd96c75556d927cbfaa6c9ef9cceeff17ad983c72966...,1,done,True


Semantic unit node id: 56bf5399c57f5a3aa03aeda9bdd7c304ce54572e6d8aa439a72c4bb52dac447f


In [5]:
# Show how connected the selected semantic unit is in the full graph,
# then describe each of its direct neighbors.
print("Degree in full graph:", graph.degree(sem_id))
print("\nNeighbors of this semantic_unit:")

for neighbor_id in graph.neighbors(sem_id):
    neighbor_attrs = graph.nodes[neighbor_id]
    # Stringify before truncating so any context value can be previewed.
    context_preview = str(neighbor_attrs.get("context", "N/A"))[:200]
    neighbor_type = neighbor_attrs.get("type", "unknown")
    print(f"\n- Neighbor id: {neighbor_id}")
    print(f"  Type: {neighbor_type}")
    print(f"  Context: {context_preview}...")

Degree in full graph: 27

Neighbors of this semantic_unit:

- Neighbor id: a84cfcd293b1c844964a9eb6f5b6819628d017837d250a3c1e40f5ab4b37fed3
  Type: entity
  Context: N/A...

- Neighbor id: a78cd05cb7542addef386123a607420122fbdeecd752b8b0ed5962e55801a281
  Type: entity
  Context: N/A...

- Neighbor id: 9e9aa327dd19c9b4c6e4517afc131fb19c80f12f6432b1efe0661eac366f3dab
  Type: entity
  Context: N/A...

- Neighbor id: 70c9438d13447d10c2de477341e4c0d5a7be485382b4375701670209306a6ec3
  Type: entity
  Context: N/A...

- Neighbor id: 91337978c386628b91b02fa9a7e32a01aa798e4db5223e21358457ccce9e82c6
  Type: entity
  Context: N/A...

- Neighbor id: 5f8d5ab9c36a652c69514dc29119dd114766b943aea741cd0fc47cd976ab5424
  Type: entity
  Context: N/A...

- Neighbor id: ab3bc175e631b149f2df72ed065bc50183b1ff4a94a9180e8565cea6f05ed8d3
  Type: entity
  Context: N/A...

- Neighbor id: 34565e2fbce33c285f4bdcf6e18f69778e153bc9d30df4315a8f0ad723fdf5ea
  Type: entity
  Context: N/A...

- Neighbor id: bcdf87bfb1e3d