# Comprehensive Implementation of Graph RAG for Job Profile Analysis Using Local Ollama Models

This notebook demonstrates a complete Graph RAG implementation for analyzing job profile documents using local Ollama models.

## 1. Environment Setup

In [None]:
# One-time setup (uncomment to run):
# !pip install -qU langchain langchain_community langchain-experimental neo4j pyvis ollama python-dotenv
# !ollama pull llama3.1  # 8B parameter model recommended
# !ollama pull nomic-embed-text  # Embedding model

import os
import sys
# Put the parent directory on the import path so `notebooks.utils` can be imported by later cells.
sys.path.append(os.path.abspath('..'))

## 2. Document Processing Pipeline

In [None]:

# Run against the small built-in toy data set instead of the real job profiles.
TESTING = False

# True: convert and store one chunk at a time (results are saved after each chunk).
# False: convert all chunks in a single call to get entity connections across them.
RUN_IN_BATCH = True

# True: use the local Ollama model; False: use the Azure API.
USE_OLLAMA = False

# Maximum number of chunks to process; set to None to process every chunk.
LIMIT_CHUNKS = 1

# Target chunk size handed to the text splitter.
CHUNK_SIZE = 1000

# Location of the job-profile export.
csvPath = "../data/job profiles/2025-02-07_profiles.csv"

Load data:

In [None]:
# tokenizer = MistralTokenizer.from_model("mistral-small", strict=True)
# text = "Your text here"
# tokens = tokenizer.encode_chat_completion(text)
# token_count = len(tokens)

In [None]:
from typing import List, Optional
from pydantic import BaseModel, Field
import pandas as pd
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.protocol.instruct.messages import UserMessage

class JobProfile(BaseModel):
    """Structured representation of one job profile.

    The list-valued fields hold one entry per requirement/statement. Fields
    typed ``Optional`` default to ``None`` so that they can really be omitted:
    without an explicit default, Pydantic v2 treats an ``Optional[...]`` field
    as required (it merely allows ``None`` as a value).
    """

    title: str = Field(description="Official job title")
    classifications: List[str] = Field(description="Classification codes")
    organizations: List[str]
    behavioural_competencies: List[str]
    education: List[str] = Field(description="Education requirements")
    job_experience: List[str]
    knowledge_skills_abilities: List[str]
    security_screenings: List[str]
    accountabilities: List[str]
    # Optional fields: may be left out entirely.
    role_type: Optional[str] = Field(default=None, description="Role category")
    scopes: Optional[List[str]] = Field(default=None, description="Areas of responsibility")
    professional_registration: Optional[List[str]] = None

if not TESTING:
    from notebooks.utils import get_job_profile_documents
    # Same package the toy-data cell imports from; `langchain.text_splitter`
    # is only a legacy re-export of it.
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    from transformers import AutoTokenizer

    documents = get_job_profile_documents(csvPath, include_org_class_sections=False)
    tokenizer = MistralTokenizer.from_model("mistral-small", strict=True)

    def count_mistral_tokens(text: str) -> int:
        """Return the number of tokens `text` occupies as a single user message.

        The text is wrapped in a chat-completion request because that is the
        encoding entry point used here, so the count is for the whole encoded
        request, not the bare text.
        """
        request = ChatCompletionRequest(
            messages=[UserMessage(content=text)],
            model="mistral-small-latest",
        )
        return len(tokenizer.encode_chat_completion(request).tokens)

    # Chunk sizes and overlaps are measured with the token counter above,
    # so CHUNK_SIZE and the overlap of 200 are token counts, not characters.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=200,
        separators=["\n\n", "\n", "•", " ", ""],
        length_function=count_mistral_tokens,
    )

    chunks = splitter.split_documents(documents)


In [None]:
# Keep only the first LIMIT_CHUNKS chunks of the real data.
# In TESTING mode the toy chunks are created by a later cell, so `chunks`
# does not exist yet at this point; skip the trim instead of raising NameError.
if not TESTING and LIMIT_CHUNKS is not None:
    chunks = chunks[:LIMIT_CHUNKS]

In [None]:
# Peek at the raw CSV columns. `csvPath` comes from the configuration cell;
# it is deliberately not redefined here so the input file is set in one place.
df = pd.read_csv(csvPath)
df.columns

In [None]:
if TESTING:
    import textwrap

    from langchain_community.document_loaders import TextLoader
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # Custom job profile document format.
    # The literal is indented to line up with this block, so it is dedented
    # before use: otherwise every heading would be preceded by four spaces and
    # separators such as "\n\n[Job Profile:" (heading directly after a blank
    # line) could never match.
    job_profiles = textwrap.dedent("""
    [Job Profile: Data Scientist]
    Accountabilities:
    - Develop ML models for customer segmentation
    - Collaborate with engineering teams on deployment

    Knowledge:
    - Advanced statistics
    - Python programming

    Skills:
    - TensorFlow/PyTorch
    - SQL optimization

    [Job Profile: Cloud Architect]
    Accountabilities:
    - Design AWS infrastructure
    - Implement security protocols

    Knowledge:
    - Networking fundamentals
    - IaaS/PaaS/SaaS models

    Skills:
    - Terraform infrastructure as code
    - Cost optimization techniques
    """)

    # Round-trip through a file so the toy data goes through the same loader
    # API as file-based documents; the encoding is explicit on both sides so
    # the result does not depend on the platform default.
    with open("job_profiles.txt", "w", encoding="utf-8") as f:
        f.write(job_profiles)

    loader = TextLoader("job_profiles.txt", encoding="utf-8")
    docs = loader.load()

    # Split on the section headings of the toy format.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n[Job Profile:", "\n\nAccountabilities:", "\n\nKnowledge:"]
    )
    chunks = text_splitter.split_documents(docs)

## 3. Graph Construction with LLM Extraction

List the models available in the local Ollama installation (this cell only does something when `USE_OLLAMA` is `True`):

In [None]:
if USE_OLLAMA:
    import ollama
    from datetime import datetime

    divider = "-" * 80

    # Ask the local Ollama server which models it has
    installed_models = ollama.Client().list()['models']

    print("Available Models:")
    print(divider)
    for entry in installed_models:
        # Gather the headline fields first, then print them
        name = entry['model']
        size_in_gb = entry['size'] / 1_000_000_000  # bytes -> GB (decimal)
        last_modified = entry['modified_at'].strftime("%Y-%m-%d %H:%M:%S %Z")

        print(f"Model: {name}")
        print(f"Modified: {last_modified}")
        print(f"Size: {size_in_gb:.2f} GB")

        # Extra details are printed only when the entry carries them
        if 'details' in entry:
            info = entry['details']
            print("Details:")
            print(f"  Format: {info.format}")
            print(f"  Family: {info.family}")
            for attribute, caption in (
                ("parameter_size", "Parameter Size"),
                ("quantization_level", "Quantization"),
            ):
                if hasattr(info, attribute):
                    print(f"  {caption}: {getattr(info, attribute)}")

        print(divider)


Set up the LLM and the `graph_transformer`, which prompts the LLM to extract entities and relationships from each chunk:

In [None]:
# Updated code
from langchain_ollama import OllamaLLM  # New import path
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_azure_ai.chat_models import AzureAIChatCompletionsModel

# Choose the LLM backend: the Azure-hosted model by default, or a local Ollama model.
if not USE_OLLAMA:
    # Endpoint and key are read from the environment (AZURE_ENDPOINT / AZURE_API_KEY).
    azure_settings = dict(
        endpoint=os.getenv('AZURE_ENDPOINT'),
        credential=os.getenv('AZURE_API_KEY'),
        model_name="Mistral-small",
        api_version="2024-05-01-preview",
        model_kwargs={"max_tokens": 4000},
        temperature=0.5,
        top_p=0.4,
    )
    llm = AzureAIChatCompletionsModel(**azure_settings)
else:
    llm = OllamaLLM(
        model="hf.co/bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF:latest",
        temperature=0,
    )

# Plain (non-debug) transformer built on the selected LLM.
graph_transformer = LLMGraphTransformer(llm=llm)

In [None]:
from langchain.globals import set_debug
# import logging

# Enable verbose logging for all components via LangChain's global debug flag.
# This makes every later LLM call print its inputs and outputs; set to False for quieter runs.
set_debug(True)

# import logging
# logging.basicConfig(level=logging.INFO) # DEBUG

In [None]:
## Check LLM connectivity
# from langchain.callbacks.base import BaseCallbackHandler

# class CleanOutputHandler(BaseCallbackHandler):
#     def on_llm_start(self, serialized, prompts, **kwargs):
#         # print("\n=== LLM Start ===")
#         print(f"====== LLM INPUT ====== \n\n: {prompts[0]}")
#         # print(f"Serialized data: {serialized}")
#         # print(f"Additional kwargs: {kwargs}")
        
#     def on_llm_end(self, response, **kwargs):
#         # print("\n=== LLM End ===")
#         if hasattr(response, 'generations'):
#             for i, generation_list in enumerate(response.generations):
#                 for j, generation in enumerate(generation_list):
#                     print(f"====== LLM OUTPUT ====== \n\n {i}.{j}: {generation.text}")
#         # print(f"Additional kwargs: {kwargs}")

# test_response = llm.invoke("Say 'hello' if you can read this.", config={"callbacks": [CleanOutputHandler()]})
# print("Connection successful. Response:", test_response)

# Non-debug configuration
# graph_transformer = LLMGraphTransformer(llm=llm)

# # Transform documents into graph nodes/relationships
# graph_documents = graph_transformer.convert_to_graph_documents(chunks)


In [None]:
import hashlib
from langchain_core.documents import Document
from langchain.graphs.graph_document import Node, Relationship

class DebugLLMGraphTransformer(LLMGraphTransformer):
    """LLMGraphTransformer that prints every input chunk and the graph extracted from it.

    Only ``process_response`` is overridden; the extraction itself is done by
    the parent class.
    """

    def process_response(self, document: Document, config=None):
        """Echo the document, delegate to the parent implementation, echo and return its result."""
        print(f"Processing document: {document.page_content}")

        extracted = super().process_response(document, config)
        print(f"Transformed result: {extracted}")

        # Disabled experiment, kept for reference: attach a Document node and
        # one Classification node per code found in the chunk metadata.
        #
        # if 'classifications' in document.metadata:
        #     # Create source document node with consistent ID
        #     doc_id = document.metadata.get("id") or hashlib.md5(document.page_content.encode()).hexdigest()
        #     doc_node = Node(id=doc_id, type="Document")
        #     # Alternative with properties:
        #     # Node(id=doc_id, type="Document",
        #     #      properties={"text": document.page_content, **document.metadata})
        #     extracted.nodes.append(doc_node)
        #
        #     classifications = document.metadata['classifications'].split(',')
        #
        #     # Create Classification nodes and relationships
        #     for classification in classifications:
        #         classification = classification.strip()
        #         if classification:
        #             class_node = Node(id=classification, type="Classification",
        #                               properties={"code": classification})
        #             extracted.nodes.append(class_node)
        #             # Link it to the source Document (not individual entities);
        #             # the source will become a Document node in Neo4j
        #             extracted.relationships.append(
        #                 Relationship(source=doc_node, target=class_node, type="HAS_CLASSIFICATION")
        #             )

        return extracted

# Initialize the debug transformer.
# The TESTING, Azure and local-Ollama configurations are currently identical
# (no strict_mode, no schema restrictions), so one construction serves all three.
debug_transformer = DebugLLMGraphTransformer(llm=llm)

# Options that were tried per mode and are kept for reference:
#
# - TESTING:      strict_mode=False  # Disable strict filtering during debugging
# - Azure:        strict_mode=True   # Keep strict filtering if needed
# - Local Ollama: restrictions must be removed - the text is then processed as
#                 unstructured data and arbitrary entities and relationships
#                 are extracted.
#
# Schema-restricted variant for the Azure model:
# debug_transformer = DebugLLMGraphTransformer(
#     llm=llm,
#     node_properties={
#         "Jobprofile": ["id"],  # Matches database label and property
#         "Classification": ["id"],
#         "Organization": ["id"],
#         "Behaviouralcompetency": ["id"],  # Matches exact label spelling
#         "Document": ["title"],  # Title exists on Document nodes
#         "__Entity__": ["id"],  # From constraint in visualization
#         "Education": ["requirement"],
#         "Experience": ["requirement"],
#         "SecurityScreening": ["requirement"],
#         "Accountability": ["description"]
#     },
#     relationship_properties={
#         # All observed relationships
#         "HAS_CLASSIFICATION": {},
#         "BELONGS_TO_ORGANIZATION": {},
#         "REQUIRES_COMPETENCY": {},
#         "MENTIONS": {},  # Critical missing relationship
#         "HAS_EDUCATION_REQUIREMENT": {},
#         "REQUIRES_EXPERIENCE": {},
#         "REQUIRES_SCREENING": {},
#         "HAS_ACCOUNTABILITY": {}
#     }
# )


## 4. Neo4j Graph Database Integration

Initialize db connection

In [None]:
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
import os

# Neo4jGraph() is constructed without arguments, so it takes its connection
# settings from these environment variables.
# setdefault only fills in values that are missing: credentials already
# exported in the environment (e.g. a real password) are no longer overwritten
# by the local placeholders below.
os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")
os.environ.setdefault("NEO4J_PASSWORD", "your_password")

graph = Neo4jGraph()
# Quick connectivity check: print basic database information
print(graph.query("CALL db.info()"))

### Testing of neo4j db

In [None]:
# # Create nodes and relationships
# graph.query(
#     """
# MERGE (m:Movie {name:"Top Gun", runtime: 120})
# WITH m
# UNWIND ["Tom Cruise", "Val Kilmer", "Anthony Edwards", "Meg Ryan"] AS actor
# MERGE (a:Actor {name:actor})
# MERGE (a)-[:ACTED_IN]->(m)
# """
# )
# graph.refresh_schema()
# print(graph.schema)
# chain = GraphCypherQAChain.from_llm(
#     llm=llm, graph=graph, verbose=True, allow_dangerous_requests=True
# )
# chain.invoke({"query": "Who played in Top Gun?"})

### Load structured data into neo4j

In [None]:
import sys
import os
# Put the parent directory on the import path so `notebooks.utils` can be imported.
sys.path.append(os.path.abspath('..'))

from notebooks.utils import get_clean_job_profiles_df


# Rebinds `df`: from here on it holds the cleaned job profiles, not the raw CSV frame read earlier.
df = get_clean_job_profiles_df()

In [None]:
# Show the first cleaned profile to check which fields are available
print(df.iloc[0])

In [None]:
from neo4j import GraphDatabase
import pandas as pd
import json
from tqdm import tqdm

# (DataFrame column, node label, relationship type) for every field that is
# linked to a JobProfile node. The per-label indexes are derived from this
# table as well, so labels and relationships cannot drift apart.
_JOB_PROFILE_RELATIONSHIPS = [
    ('behavioural_competencies', 'BehavioralCompetency', 'HAS_COMPETENCY'),
    ('accountabilities', 'Accountability', 'HAS_ACCOUNTABILITY'),
    ('education', 'Education', 'REQUIRES_EDUCATION'),
    ('job_experience', 'Experience', 'REQUIRES_EXPERIENCE'),
    ('professional_registration_requirements', 'Registration', 'REQUIRES_REGISTRATION'),
    ('preferences', 'Preference', 'HAS_PREFERENCE'),
    ('knowledge_skills_abilities', 'KSA', 'REQUIRES_KSA'),
    ('willingness_statements', 'WillingnessStatement', 'HAS_WILLINGNESS'),
    ('optional_requirements', 'OptionalRequirement', 'HAS_OPTIONAL_REQUIREMENT'),
    ('security_screenings', 'SecurityScreening', 'REQUIRES_SCREENING'),
    ('role', 'Role', 'HAS_ROLE'),
    ('role_type', 'RoleType', 'HAS_ROLE_TYPE'),
    ('classifications', 'Classification', 'HAS_CLASSIFICATION'),
    ('organizations', 'Organization', 'BELONGS_TO_ORGANIZATION'),
    ('scopes', 'Scope', 'HAS_SCOPE'),
    ('job_families', 'JobFamily', 'BELONGS_TO_JOB_FAMILY'),
    ('streams', 'Stream', 'BELONGS_TO_STREAM'),
    ('reports_to', 'ReportsTo', 'REPORTS_TO'),
]


def _iso_or_none(row, column):
    """Return ``row[column].isoformat()``, or None when the value is missing/NaN."""
    value = row.get(column)
    return value.isoformat() if pd.notna(value) else None


def _value_or_default(row, column, cast=None, default=None):
    """Return ``row[column]`` (passed through ``cast`` if given), or ``default`` when missing/NaN."""
    value = row.get(column)
    if not pd.notna(value):
        return default
    return cast(value) if cast is not None else value


def load_job_profiles_to_neo4j(df, uri, username, password):
    """
    Load job profiles data from DataFrame to Neo4j.

    WARNING: every existing node and relationship in the target database is
    deleted before loading.

    For each row a ``JobProfile`` node is merged on ``id`` and its scalar
    properties are set; then, for every entry of ``_JOB_PROFILE_RELATIONSHIPS``,
    ``create_relationships`` links the profile to one node per list item.

    Parameters:
    df: pandas DataFrame containing job profiles. ``id`` is required; the
        timestamp columns must hold values with an ``isoformat()`` method
        (presumably pandas Timestamps - confirm against the cleaning step).
    uri: Neo4j connection URI (e.g., "neo4j://localhost:7687")
    username: Neo4j username
    password: Neo4j password

    Raises:
    Whatever the Neo4j driver raises on connection or query failure; the
    driver is closed in that case as well.
    """
    create_job_profile_query = """
    MERGE (j:JobProfile {id: $id})
    SET j.version = $version,
        j.title = $title,
        j.number = $number,
        j.overview = $overview,
        j.program_overview = $program_overview,
        j.state = $state,
        j.type = $type,
        j.context = $context,
        j.is_archived = $is_archived,
        j.all_reports_to = $all_reports_to,
        j.valid_from = datetime($valid_from),
        j.valid_to = CASE WHEN $valid_to IS NULL THEN NULL ELSE datetime($valid_to) END,
        j.views = $views,
        j.created_at = datetime($created_at),
        j.updated_at = datetime($updated_at),
        j.published_at = CASE WHEN $published_at IS NULL THEN NULL ELSE datetime($published_at) END
    RETURN j
    """

    # Connect to Neo4j
    driver = GraphDatabase.driver(uri, auth=(username, password))
    try:
        with driver.session() as session:
            print("Deleting all existing data...")
            session.run("MATCH (n) DETACH DELETE n")

            # Create constraints and indexes for better performance
            session.run("CREATE CONSTRAINT job_profile_id IF NOT EXISTS FOR (j:JobProfile) REQUIRE j.id IS UNIQUE")
            session.run("CREATE INDEX job_profile_title IF NOT EXISTS FOR (j:JobProfile) ON (j.title)")

            # One name index per related node label
            for _, label, _ in _JOB_PROFILE_RELATIONSHIPS:
                session.run(f"CREATE INDEX {label.lower()}_name IF NOT EXISTS FOR (n:{label}) ON (n.name)")

            # Process each job profile
            for _, row in tqdm(df.iterrows(), total=len(df), desc="Loading job profiles"):
                # Create/update the job profile node. Timestamps are sent as ISO
                # strings and converted with datetime() on the Cypher side.
                session.run(
                    create_job_profile_query,
                    id=int(row['id']),
                    version=_value_or_default(row, 'version', int),
                    title=_value_or_default(row, 'title'),
                    number=_value_or_default(row, 'number', int),
                    overview=_value_or_default(row, 'overview'),
                    program_overview=_value_or_default(row, 'program_overview'),
                    state=_value_or_default(row, 'state'),
                    type=_value_or_default(row, 'type'),
                    context=_value_or_default(row, 'context'),
                    is_archived=_value_or_default(row, 'is_archived', bool, False),
                    all_reports_to=_value_or_default(row, 'all_reports_to', bool, False),
                    valid_from=_iso_or_none(row, 'valid_from'),
                    valid_to=_iso_or_none(row, 'valid_to'),
                    views=_value_or_default(row, 'views', int, 0),
                    created_at=_iso_or_none(row, 'created_at'),
                    updated_at=_iso_or_none(row, 'updated_at'),
                    published_at=_iso_or_none(row, 'published_at'),
                ).single()

                # Create relationships for list fields
                for field_name, label, relationship_type in _JOB_PROFILE_RELATIONSHIPS:
                    create_relationships(session, row['id'], field_name, label, relationship_type, row)
    finally:
        # Release the connection pool even when a query above fails
        driver.close()

    print("Data loading completed successfully!")

def create_relationships(session, job_id, field_name, node_label, relationship_type, row):
    """Link one job profile to a node for each item of a list-valued field.

    For every item in ``row[field_name]`` a ``node_label`` node is merged on
    its ``name`` and connected to the ``JobProfile`` with id ``int(job_id)``
    by a ``relationship_type`` relationship (one query per item).

    Nothing is done when the field is absent, empty, or not a list - scalar
    values such as a plain string are skipped as well.
    """
    items = row.get(field_name)
    if not isinstance(items, list) or not items:
        return

    profile_id = int(job_id)
    for entry in items:
        # Label and relationship type are interpolated into the statement;
        # the id and the item value travel as query parameters.
        session.run(
            f"""
            MATCH (j:JobProfile {{id: $job_id}})
            MERGE (n:{node_label} {{name: $name}})
            MERGE (j)-[r:{relationship_type}]->(n)
            RETURN j, n
            """,
            job_id=profile_id,
            name=entry,
        )

In [None]:
# Use the connection settings exported for Neo4jGraph earlier in the notebook
# so every cell talks to the same database; the literals are only fallbacks
# for when those variables are not set.
URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "your_password")

# Load data to Neo4j (the loader deletes all existing data first)
load_job_profiles_to_neo4j(df, URI, USERNAME, PASSWORD)

### Transform documents to graph objects with LLM

In [None]:
# Progress note from the last long run:
#   Inserted 1 graph elements from batch 188
#   Processing document batch 189/1130
#
# Start from an empty database.
# NOTE(review): this also removes the structured job-profile graph if
# load_job_profiles_to_neo4j was run against the same database - confirm that is intended.
graph.query("MATCH (n) DETACH DELETE n")

if RUN_IN_BATCH:
    # Convert and store one chunk at a time, so results are saved after each chunk.
    total_chunks = len(chunks)
    for batch_number, chunk in enumerate(chunks, start=1):
        print(f"Processing document batch {batch_number}/{total_chunks}")

        # The transformer expects a list, hence the single-element wrapper
        batch_graph_docs = debug_transformer.convert_to_graph_documents([chunk])

        # Write to the database immediately
        graph.add_graph_documents(
            batch_graph_docs,
            baseEntityLabel=True,
            include_source=True,
        )

        print(f"Inserted {len(batch_graph_docs)} graph elements from batch {batch_number}")
else:
    # Convert every chunk in one call, then store the result in one go.
    graph_documents = debug_transformer.convert_to_graph_documents(chunks)
    print(graph_documents)

    graph.add_graph_documents(
        graph_documents,
        baseEntityLabel=True,    # Adds __Entity__ label for better indexing
        include_source=True,     # Maintains document source relationships
    )

In [None]:
# Ask Neo4j for its schema graph (labels and the relationship types between them)
graph.query("CALL db.schema.visualization()")

In [None]:
# Re-read the schema from the database, then print the schema held by the graph object
graph.refresh_schema()
print(graph.schema)

In [None]:
# Sanity check: how many relationships exist and which types are present.
# The aggregation yields a single row, which is unpacked directly.
relationship_summary = graph.query("""
    MATCH ()-[r]->() 
    RETURN count(r) AS relationship_count,
           collect(distinct type(r)) AS relationship_types
""")[0]
print(f"Relationships Found: {relationship_summary['relationship_count']}")
print(f"Relationship Types: {relationship_summary['relationship_types']}")

### Debug

In [None]:
# Get all Jobprofiles and their organizations
result = graph.query("MATCH (jp:Jobprofile)-[:BELONGS_TO_ORGANIZATION]->(org:Organization) RETURN jp.id, org.id")
print(result)
print('========')
# Get all Documents mentioning Licensing Clerk
print(graph.query("MATCH (d:Document {title: 'Licensing Clerk'})-[:MENTIONS]->(jp:Jobprofile) RETURN d.title, jp.id"))
print('====== ai not workign: ')
# The next query filters on `title`, whereas the query after it filters on `id`
print(graph.query("MATCH (jp:Jobprofile {title: 'Licensing Clerk'})-[:BELONGS_TO_ORGANIZATION]->(o:Organization) RETURN o.id")) # AI generated - not working - confused title with id
print('====== same prompt but claude: ')
print(graph.query("MATCH (j:Jobprofile)-[:BELONGS_TO_ORGANIZATION]->(o:Organization) WHERE j.id = 'Licensing Clerk' RETURN o.id"))


The chain below has the LLM generate a Cypher query, which is then run against the Neo4j database for exact matching (as opposed to using a vector index for similarity search):

In [None]:
# Build a question-answering chain over the graph using the LLM configured above.
# The commented keyword arguments below are options that were considered but are not active.
chain = GraphCypherQAChain.from_llm(
    llm=llm, graph=graph, verbose=True, allow_dangerous_requests=True,
    # exclude_types=['Document']
    # validate_cypher=True,  # New critical parameter
    # schema_constraints={
    #     "Jobprofile": {"identifier": "id"},  # Force 'id' usage
    #     "Document": {"identifier": "title"}
    # }
)
# The question itself tells the model to match the profile name against `id` rather than `title`.
chain.invoke({"query": "What organizations does the 'Licensing Clerk' profile belong to? Ensure title is treated as 'id' instead of 'title'"})
# Tip: use backticks for labels containing spaces, e.g. MATCH (jt:`Job Title`)

### Graph Visualization with Pyvis

In [None]:
# # %% [code]
# from pyvis.network import Network

# # Initialize network with configuration
# net = Network(
#     notebook=True, 
#     cdn_resources="in_line", 
#     height="750px"
# )

# # Add nodes with metadata
# nodes = graph.query("""
#     MATCH (n) 
#     RETURN n.id as id, 
#            n.name as label, 
#            n.type as group
# """)

# # Process each node and add to network
# for node in nodes:
#     net.add_node(
#         node["id"],
#         label=node["label"],
#         group=node["group"],
#         title=f"Type: {node['group']}"
#     )

# # Add relationships with labels
# relationships = graph.query("""
#     MATCH (s)-[r]->(t) 
#     RETURN s.id as source, 
#            t.id as target, 
#            type(r) as label
# """)

# # Process each relationship and add to network
# for rel in relationships:
#     net.add_edge(
#         rel["source"], 
#         rel["target"],
#         label=rel["label"],
#         arrowStrikethrough=False
#     )

# # Generate and save interactive visualization
# net.show("job_network.html")

# NEW

# %% [code]
from pyvis.network import Network

# Initialize the network. Options that were tried and left disabled:
#   height="750px", width="100%",
#   layout={"nodeSpacing": 200,   # Minimum horizontal spacing
#           "treeSpacing": 300}   # Spacing between disconnected components
#   (plus "hierarchical": {"enabled": True} and "levelSeparation": 150)
net = Network(
    notebook=True,
    cdn_resources="in_line",
)

# Alternative physics configuration (disabled, kept for reference):
# net.set_options("""
# {
#     "physics": {
#         "enabled": true,
#         "solver": "forceAtlas2Based",
#         "forceAtlas2Based": {
#             "gravitationalConstant": -150,
#             "centralGravity": 0.01,
#             "springLength": 250,
#             "springConstant": 0.005,
#             "damping": 0.4,
#             "avoidOverlap": 1.0
#         },
#         "maxVelocity": 75,
#         "minVelocity": 2,
#         "timestep": 0.5
#     }
# }
# """)

# Fetch every node together with its degree. The degree is computed in the
# query so the "Connections" line of the tooltip shows a real number
# (previously the query did not return it and the tooltip always said 0).
nodes = graph.query("""
    MATCH (n)
    OPTIONAL MATCH (n)-[r]-()
    WITH n, count(r) AS degree
    RETURN n.id AS id,
           n.name AS label,
           n.type AS group,
           degree
""")

added_node_ids = set()
skipped_nodes = 0
for node in nodes:
    node_id = node["id"]
    if node_id is None:
        # pyvis requires a str/int node id; nodes without an `id` property cannot be drawn
        skipped_nodes += 1
        continue
    net.add_node(
        node_id,
        label=node["label"],
        group=node["group"],
        title=f"""
            Type: {node['group']}
            Connections: {node.get('degree', 0)}
        """,
        value=node.get("value", 10),  # Default size if missing
        borderWidth=2,                # Clear node boundaries
        shape="dot",                  # Consistent node shape
        font={"size": 18}             # Improved label readability
    )
    added_node_ids.add(node_id)

if skipped_nodes:
    print(f"Skipped {skipped_nodes} node(s) without an 'id' property")

# Add relationships with enhanced visual properties
relationships = graph.query("""
    MATCH (s)-[r]->(t) 
    RETURN s.id as source, 
           t.id as target, 
           type(r) as label
""")

for rel in relationships:
    # pyvis refuses edges whose endpoints were not added as nodes
    if rel["source"] not in added_node_ids or rel["target"] not in added_node_ids:
        continue
    net.add_edge(
        rel["source"],
        rel["target"],
        label=rel["label"],
        value=rel.get("value", 1),    # Default edge weight
        smooth={"type": "dynamic"},   # Curved edge rendering
        arrowStrikethrough=False,
        # color={
        #     "color": "#95a5a6",       # Base edge color
        #     "highlight": "#3498db"    # Highlight color on hover
        # },
        # width=2,                      # Visual weight multiplier
        physics=True                   # Enable edge spring behavior
    )

# Optional layout helpers (disabled):
# net.toggle_physics(True)             # Enable for initial stabilization
# net.show_buttons(filter_=['physics']) # Allow parameter adjustments

net.set_options("""
{
    "physics": {
        "enabled": true,
        "solver": "repulsion",
        "repulsion": {
            "nodeDistance": 300,
            "springLength": 200
        }
    }
}
""")

# Generate and save the visualization
net.write_html(
    "job_network.html",
    local=False,
    notebook=False,
    # override=True
)


## Build vector index

Find all text fields in the neo4j db:

In [None]:
# Get available text properties across all nodes
from neo4j import GraphDatabase

# Use the same connection settings as the rest of the notebook (NEO4J_*
# environment variables); the literals are only fallbacks for when they are unset.
# NOTE: this driver stays open for the following cells and is never closed here.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USERNAME", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "your_password"),
    ),
)

def get_text_properties(driver, min_percentage=0.1):
    """Return the names of properties that hold string values on enough nodes.

    A property is reported when the number of nodes on which its value is a
    string is at least ``min_percentage`` times the total number of nodes.

    Parameters:
    driver: object exposing ``session()`` as a context manager (a Neo4j driver).
    min_percentage: required share of nodes, as a fraction (0.1 means 10 %).

    Returns:
    list of property names (empty when nothing qualifies).

    Requires the APOC function ``apoc.meta.cypher.type`` on the server.
    """
    # The type check is applied to the property VALUE (n[prop]). Checking the
    # key itself would always yield 'STRING', because key names are strings,
    # and would therefore let numeric, date and list properties through.
    query = """
        MATCH (n)
        WITH count(n) AS total
        MATCH (n)
        UNWIND keys(n) AS prop
        WITH n, prop, total
        WHERE apoc.meta.cypher.type(n[prop]) = 'STRING'
        WITH prop, total, count(*) AS occurrences
        WHERE occurrences >= total * $min_percentage
        RETURN collect(prop) AS props
    """
    with driver.session() as session:
        record = session.run(query, {"min_percentage": min_percentage}).single()
        return record["props"]

# Property names to embed; passed on as `text_node_properties` when the vector store is built
text_props = get_text_properties(driver)
print(text_props)

Vectorstore generation from existing graph

In [None]:
# Add to existing Neo4jGraph initialization
from langchain_community.vectorstores import Neo4jVector
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

# How the search statement is assembled internally:
# read_query = (
#   "CALL db.index.vector.queryNodes($index, $k, $embedding) "
#   "YIELD node, score "
# ) + retrieval_query
#
# The similarity search therefore already provides the `node` and `score`
# variables; retrieval_query must start from them and return `text`, `score`
# and `metadata`. It must NOT issue its own CALL db.index.vector.queryNodes:
# doing so runs the vector search again for every hit and multiplies the rows,
# so the top results come back as duplicates.

# from langchain_openai import OpenAIEmbeddings
# from_existing_graph pulls the relevant text from the database, computes the
# embeddings and stores them back on the nodes.
vector_store = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    index_name="document_embeddings",
    # Label-aware indexing - treat all nodes as generic entities while keeping
    # the original labels in the metadata (alternative: node_label="Jobprofile").
    # NOTE(review): graphs written with baseEntityLabel=True carry the
    # `__Entity__` label; confirm that `Entity` matches the nodes in this database.
    node_label="Entity",  # Generic label for all nodes
    # Dynamic property concatenation - the listed properties are combined into
    # one text per node (alternatives tried: ["text", "title", "id"],
    # "name", "description", "title").
    text_node_properties=text_props,
    embedding_node_property="embedding",
    # "We know the similarity search query will return the node and score
    # variables, so we can pass those into our retrieval query to pull
    # connected data of those similar nodes."
    # NOTE(review): the LIMIT 5 below caps the result count regardless of the
    # `k` passed to similarity_search - confirm that is intended.
    retrieval_query="""
    WITH node AS vectorNode, score AS similarityScore  // Alias both variables
    ORDER BY similarityScore DESC LIMIT 5

    WITH vectorNode,
         apoc.map.clean(vectorNode {.*}, ['embedding'], []) AS nodeProps,
         labels(vectorNode) AS nodeLabels,
         similarityScore

    OPTIONAL MATCH (vectorNode)-[r:PART_OF]-(related)

    WITH nodeProps, nodeLabels, similarityScore,
         collect(DISTINCT {
           rel: type(r),
           target: related {.*, labels: labels(related)}
         }) AS connections,
         [x IN collect(coalesce(related.description, related.name, '')) WHERE x <> ''] AS relatedContext

    RETURN apoc.text.join(
        [coalesce(nodeProps.text, nodeProps.title, nodeProps.id, '')] + relatedContext,
        ' | '
    ) AS text,

    similarityScore AS score,  // Rename back to expected 'score' field

    {
      source: apoc.map.merge(nodeProps, {labels: nodeLabels}),
      connections: connections
    } AS metadata
""",
)

In [None]:
# Look up the existing vector index; the two returned values are printed as-is.
# NOTE(review): the meaning of the second value depends on the library version - confirm it is the entity type.
dimension, entity_type = vector_store.retrieve_existing_index()
print('dimension: ', dimension, ', entity_type: ', entity_type)

# Fetch the embedding length of up to five `Job Title` nodes.
# The result is only stored in `sample_data`; it is not displayed by this cell.
sample_data = vector_store.query("""
MATCH (n:`Job Title`)
RETURN n.id as id,
       size(n.embedding) as embedding_size
LIMIT 5
""")

In [None]:
# Show the database's own record of the `document_embeddings` index
index_info = vector_store.query("""
SHOW INDEXES 
WHERE name = 'document_embeddings'
""")
print(index_info)

In [None]:
# Embed a dummy query and print the vector length, for comparison with the index dimension
sample_embedding = embeddings.embed_query("test")
print(len(sample_embedding))

In [None]:
# # Create optimized vector index
# # todo: is this needed?
# graph.query("""
# CREATE VECTOR INDEX document_embeddings IF NOT EXISTS
# FOR (n:Document) ON (n.embedding)
# OPTIONS {
#   indexConfig: {
#     `vector.dimensions`: 384,
#     `vector.similarity_function`: 'cosine'
#   }
# }
# """)

In [None]:
# Ask Neo4j for the execution plan (EXPLAIN) of a traversal query.
# NOTE: this query exposes neighbours as `graph_context`, which is a different
# result shape from the `connections` returned by the vector store's retrieval_query.
with driver.session() as session:
    traversal_plan = session.run("""
        EXPLAIN 
        MATCH (node:Entity)
        WITH node, 1.0 as score
        OPTIONAL MATCH (node)-[r]->(related)
        RETURN apoc.text.join([
            coalesce(node.text, node.name, node.title),
            coalesce(related.name, related.title)
        ], ' ') AS text,
        score,
        node {.*, labels: labels(node)} AS metadata,
        collect({relationship: type(r), node: related}) AS graph_context
    """).consume()
    
    # consume() returns the result summary; the plan is read from it
    print("Query execution plan:")
    print(traversal_plan.plan)

In [None]:
from pprint import pprint

# Run a similarity search for an organization name and dump the metadata
# attached to each of the ten hits.
organization_query = "Public Safety & Sol General (PSSG)"
results = vector_store.similarity_search(organization_query, k=10)

for hit in results:
    print('==== DOC ====')
    pprint(hit.metadata)

In [None]:
# Flatten every retrieved document into a plain dict so the matched node and
# its neighbours can be inspected without digging through nested metadata.
debug_results = []
for doc in results:
    metadata = doc.metadata

    # The retrieval query returns metadata shaped as
    # {source: <node properties + labels>, connections: [...]}. Fall back to
    # the flat layout (properties directly on the metadata dict, neighbours
    # under 'graph_context') so the cell works with either shape instead of
    # raising KeyError on 'id'.
    source = metadata.get('source')
    if not isinstance(source, dict):
        source = metadata
    raw_connections = metadata.get('connections') or metadata.get('graph_context') or []

    connections = []
    for rel in raw_connections:
        rel = rel or {}
        # NOTE(review): the 'relationship' / 'node' key names are carried over
        # from the earlier version of this cell; confirm they match the maps
        # collected by the retrieval query.
        # OPTIONAL MATCH can yield a null node, so treat a missing node as {}.
        target = rel.get('node') or {}
        connections.append({
            "relationship": rel.get('relationship'),
            "target_node": {
                "id": target.get('id'),
                "labels": target.get('labels', []),
                "properties": {k: v for k, v in target.items()
                               if k not in ['id', 'labels']}
            }
        })

    debug_results.append({
        "text": doc.page_content,
        # "score": doc.metadata['score'],
        "source_node": {
            "id": source.get('id'),
            "labels": source.get('labels') or [],
            "properties": {k: v for k, v in source.items()
                           if k not in ['score', 'id', 'labels']}
        },
        "connections": connections
    })

# Summarize, per retrieved node, which relationships lead to which neighbours.
print("\nTraversal statistics:")
for entry in debug_results:
    source_node = entry['source_node']
    print(f"Node {source_node['id']} ({'|'.join(source_node['labels'])})")
    print(f"Connected to {len(entry['connections'])} nodes via:")
    for conn in entry['connections']:
        print(f"  - {conn['relationship']} → "
              f"{conn['target_node']['labels']} "
              f"(props: {list(conn['target_node']['properties'].keys())})")

In [None]:
# View all relationships
with driver.session() as session:
    result = session.run("""
        MATCH (n:Entity)-[r]->() 
        RETURN type(r) AS rel_type, count(*) AS count
        UNION
        MATCH (n:Entity)<-[r]-()
        RETURN type(r) AS rel_type, count(*) AS count
    """).data()
    
    print("Relationship counts:")
    for record in result:
        print(f"{record['rel_type']}: {record['count']}")

In [None]:
# Count the nodes that do not carry the 'Entity' label.
missing_label_query = """
        MATCH (n)
        WHERE NOT 'Entity' IN labels(n)
        RETURN count(*) AS missing_label_count
    """

with driver.session() as session:
    label_check = session.run(missing_label_query).single()

    missing_label_count = label_check['missing_label_count']
    print(f"Nodes missing Entity label: {missing_label_count}")

In [None]:
# Pull every Entity node with all of its properties, then show only the
# first returned row.
all_entities_query = """
MATCH (n:Entity)
RETURN n {.*} as node
"""
results = vector_store.query(all_entities_query)
print(results[:1])

In [None]:
# Show only the second row of whatever `results` currently holds.
second_row = results[1:2]
print(second_row)

In [None]:
from pprint import pprint

# Search with a phrase that mixes a competency and a job title, and dump the
# two best-matching documents in full.
mixed_query = "Concern for order Perf and Risk Mgt Analyst"
results = vector_store.similarity_search(mixed_query, k=2)

for hit in results:
    print('==== DOC ====')
    pprint(hit)

In [None]:
# Run the traversal directly (no vector search): every Entity node gets a
# constant score of 1.0 and is joined with neighbours in either direction.
# NOTE(review): the second coalesce falls back to `node.id`, not a property of
# `related` — confirm that is intentional.
entity_traversal_query = """
MATCH (node:Entity)
    WITH node, 1.0 as score
    OPTIONAL MATCH (node)-[r]-(related)
    RETURN apoc.text.join([
        coalesce(node.text, node.name, node.title, node.id),
        coalesce(related.name, related.title, node.id)
    ], ' ') AS text,
    score,
    node {.*, labels: labels(node)} AS metadata,
    collect({relationship: type(r), node: related}) AS graph_context
"""
results = vector_store.query(entity_traversal_query)

In [None]:
# Show only the second row of whatever `results` currently holds.
second_row = results[1:2]
print(second_row)