# Comprehensive Implementation of Graph RAG for Job Profile Analysis Using Local Ollama Models

This notebook demonstrates a complete Graph RAG implementation for analyzing job profile documents using local Ollama models.

## 1. Environment Setup

In [1]:
# !pip install -qU langchain langchain_community langchain-experimental neo4j pyvis ollama python-dotenv
# !ollama pull llama3.1  # 8B parameter model recommended
# !ollama pull nomic-embed-text  # Embedding model

import os
import sys
# Make the parent directory importable (a later cell imports `notebooks.utils`).
# The membership check keeps re-runs of this cell from appending duplicate entries.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

## 2. Document Processing Pipeline

In [2]:

# --- Run configuration -------------------------------------------------------
# TESTING: True runs the pipeline on the small built-in toy data instead of the CSV.
TESTING = False
# RUN_IN_BATCH: True converts and stores one chunk at a time (saving after each);
# False converts all chunks in a single call before storing them.
RUN_IN_BATCH = True
# USE_OLLAMA: True uses the local Ollama model, False uses the Azure API.
USE_OLLAMA = True
# LIMIT_CHUNKS: process only the first N chunks; None processes all of them.
LIMIT_CHUNKS = None
# CHUNK_SIZE: chunk size handed to the text splitter in the non-testing path
# (that splitter measures length with the Mistral tokenizer).
CHUNK_SIZE = 1000
# Location of the exported job-profile CSV.
csvPath = "../data/job profiles/2025-02-07_profiles.csv"

Load data:

In [3]:
# tokenizer = MistralTokenizer.from_model("mistral-small", strict=True)
# text = "Your text here"
# tokens = tokenizer.encode_chat_completion(text)
# token_count = len(tokens)

In [4]:
from typing import List, Optional
from pydantic import BaseModel, Field
import pandas as pd
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.protocol.instruct.messages import UserMessage

class JobProfile(BaseModel):
    """Structured view of a single job profile.

    Field names follow the columns of the job-profile CSV shown later in this
    notebook. The three ``Optional`` fields carry an explicit ``None`` default:
    under Pydantic v2 an ``Optional[...]`` annotation without a default is still
    a *required* field, so the default is what actually makes them omittable.
    """

    title: str = Field(description="Official job title")
    classifications: List[str] = Field(description="Classification codes")
    organizations: List[str]
    behavioural_competencies: List[str]
    education: List[str] = Field(description="Education requirements")
    job_experience: List[str]
    knowledge_skills_abilities: List[str]
    security_screenings: List[str]
    accountabilities: List[str]
    # Optional attributes: may be omitted entirely, in which case they are None.
    role_type: Optional[str] = Field(default=None, description="Role category")
    scopes: Optional[List[str]] = Field(default=None, description="Areas of responsibility")
    professional_registration: Optional[List[str]] = None

if not TESTING:
    from notebooks.utils import get_job_profile_documents
    # Same package the toy-data cell imports from; `langchain.text_splitter` is the legacy path.
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    # NOTE(review): AutoTokenizer is not referenced in the visible notebook — confirm before removing.
    from transformers import AutoTokenizer

    documents = get_job_profile_documents(csvPath)
    tokenizer = MistralTokenizer.from_model("mistral-small", strict=True)

    def _count_tokens(text: str) -> int:
        """Return the number of Mistral tokens for `text` wrapped as a single user message.

        The text is encoded as a chat-completion request, so the count is that of
        the whole encoded request (presumably including any chat-template tokens).
        """
        request = ChatCompletionRequest(
            messages=[UserMessage(content=text)],
            model="mistral-small-latest",
        )
        return len(tokenizer.encode_chat_completion(request).tokens)

    # CHUNK_SIZE and chunk_overlap are measured with _count_tokens, i.e. in tokens.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=200,
        separators=["\n\n", "\n", "•", " ", ""],
        length_function=_count_tokens,
    )

    chunks = splitter.split_documents(documents)


In [5]:
# Optionally keep only the first LIMIT_CHUNKS chunks (None means "keep everything").
# NOTE(review): when TESTING is True, `chunks` is first assigned by the toy-data
# cell further down, so this cell has to run after it in that mode.
if LIMIT_CHUNKS is not None:
    chunks = chunks[:LIMIT_CHUNKS]

In [6]:
# Peek at the raw CSV schema. `csvPath` is the value set in the configuration
# cell at the top of the notebook, so the path is defined in exactly one place.
df = pd.read_csv(csvPath)
df.columns

Index(['id', 'version', 'title', 'number', 'overview', 'program_overview',
       'state', 'type', 'behavioural_competencies', 'accountabilities',
       'education', 'job_experience', 'professional_registration_requirements',
       'preferences', 'knowledge_skills_abilities', 'willingness_statements',
       'optional_requirements', 'security_screenings', 'all_reports_to',
       'context', 'is_archived', 'valid_from', 'valid_to', 'views', 'role',
       'role_type', 'created_at', 'updated_at', 'published_at',
       'classifications', 'organizations', 'scopes', 'job_families', 'streams',
       'reports_to'],
      dtype='object')

In [7]:
if TESTING:
    import textwrap

    from langchain_community.document_loaders import TextLoader
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # Custom job profile document format.
    # dedent() strips the indentation this literal carries from the source layout;
    # without it every line starts with four spaces and the separators below
    # (which expect the headings right after the blank line) can never match.
    job_profiles = textwrap.dedent("""
    [Job Profile: Data Scientist]
    Accountabilities:
    - Develop ML models for customer segmentation
    - Collaborate with engineering teams on deployment

    Knowledge:
    - Advanced statistics
    - Python programming

    Skills:
    - TensorFlow/PyTorch
    - SQL optimization

    [Job Profile: Cloud Architect]
    Accountabilities:
    - Design AWS infrastructure
    - Implement security protocols

    Knowledge:
    - Networking fundamentals
    - IaaS/PaaS/SaaS models

    Skills:
    - Terraform infrastructure as code
    - Cost optimization techniques
    """)

    # Write and read with the same explicit encoding so the round trip does not
    # depend on the platform default.
    with open("job_profiles.txt", "w", encoding="utf-8") as f:
        f.write(job_profiles)

    loader = TextLoader("job_profiles.txt", encoding="utf-8")
    docs = loader.load()

    # chunk_size / chunk_overlap use the splitter's default length measure here
    # (no custom length_function is passed).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n[Job Profile:", "\n\nAccountabilities:", "\n\nKnowledge:"]
    )
    chunks = text_splitter.split_documents(docs)

## 3. Graph Construction with LLM Extraction

To view the models available in Ollama:

In [None]:
if USE_OLLAMA:
    import ollama
    from datetime import datetime

    # Ask the local Ollama server which models it has available.
    models = ollama.Client().list()['models']

    print("Available Models:")
    print("-" * 80)
    for entry in models:
        # Gather the headline values first, then print them together.
        name = entry['model']
        size_in_gb = entry['size'] / 1_000_000_000  # size is divided by 1e9 for display
        modified_at = entry['modified_at'].strftime("%Y-%m-%d %H:%M:%S %Z")

        print(f"Model: {name}")
        print(f"Modified: {modified_at}")
        print(f"Size: {size_in_gb:.2f} GB")

        # Extra details are printed only when the entry carries them.
        if 'details' in entry:
            info = entry['details']
            print("Details:")
            print(f"  Format: {info.format}")
            print(f"  Family: {info.family}")
            for attribute, caption in (
                ("parameter_size", "Parameter Size"),
                ("quantization_level", "Quantization"),
            ):
                if hasattr(info, attribute):
                    print(f"  {caption}: {getattr(info, attribute)}")

        print("-" * 80)


Set up the LLM and the `graph_transformer`, which sends each chunk to the LLM and turns the response into entities and relationships:

In [9]:
# Updated code
from langchain_ollama import OllamaLLM  # New import path
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_azure_ai.chat_models import AzureAIChatCompletionsModel

# Choose the LLM backend according to USE_OLLAMA: the Azure-hosted chat model
# when it is False, otherwise the locally served Ollama model.
if not USE_OLLAMA:
    llm = AzureAIChatCompletionsModel(
        endpoint=os.getenv('AZURE_ENDPOINT'),
        credential=os.getenv('AZURE_API_KEY'),
        model_name="Mistral-small",
        api_version="2024-05-01-preview",
        model_kwargs={"max_tokens": 4000},
        temperature=0.5,
        top_p=0.4,
    )
else:
    llm = OllamaLLM(
        model="hf.co/bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF:latest",
        temperature=0,
    )

# Plain (non-debug) transformer built on whichever LLM was selected.
graph_transformer = LLMGraphTransformer(llm=llm)

In [10]:
from langchain.globals import set_debug

# Enable verbose logging for all LangChain components from this point on.
# NOTE(review): with many chunks this is likely to produce a lot of output —
# consider switching it off for full runs.
set_debug(True)

# Disabled alternative: configure the standard-library logger instead.
# import logging
# logging.basicConfig(level=logging.INFO) # DEBUG

In [11]:
## Check LLM connectivity
# from langchain.callbacks.base import BaseCallbackHandler

# class CleanOutputHandler(BaseCallbackHandler):
#     def on_llm_start(self, serialized, prompts, **kwargs):
#         # print("\n=== LLM Start ===")
#         print(f"====== LLM INPUT ====== \n\n: {prompts[0]}")
#         # print(f"Serialized data: {serialized}")
#         # print(f"Additional kwargs: {kwargs}")
        
#     def on_llm_end(self, response, **kwargs):
#         # print("\n=== LLM End ===")
#         if hasattr(response, 'generations'):
#             for i, generation_list in enumerate(response.generations):
#                 for j, generation in enumerate(generation_list):
#                     print(f"====== LLM OUTPUT ====== \n\n {i}.{j}: {generation.text}")
#         # print(f"Additional kwargs: {kwargs}")

# test_response = llm.invoke("Say 'hello' if you can read this.", config={"callbacks": [CleanOutputHandler()]})
# print("Connection successful. Response:", test_response)

# Non-debug configuration
# graph_transformer = LLMGraphTransformer(llm=llm)

# # Transform documents into graph nodes/relationships
# graph_documents = graph_transformer.convert_to_graph_documents(chunks)


In [12]:
from langchain_core.documents import Document

class DebugLLMGraphTransformer(LLMGraphTransformer):
    """LLMGraphTransformer variant that prints each document and its conversion result."""

    def process_response(self, document: Document, config=None):
        """Run the parent conversion for `document`, echoing the input text and the output.

        Args:
            document: The chunk to convert; its `page_content` is printed first.
            config: Passed through unchanged to the parent implementation.

        Returns:
            Whatever the parent `process_response` returns for this document.
        """
        print(f"Processing document: {document.page_content}")
        graph_document = super().process_response(document, config)
        print(f"Transformed result: {graph_document}")
        return graph_document

# Initialize the debug transformer.
#
# Only the real-data run against the Azure model constrains the extraction
# schema. Toy-data runs and local Ollama runs use the unconstrained transformer
# (the restrictions must be removed for local Ollama).
if not TESTING and not USE_OLLAMA:
    # LLMGraphTransformer takes the schema as flat lists: node labels go in
    # `allowed_nodes`, relationship types in `allowed_relationships`, and
    # `node_properties` is a list of property *names* (not a label -> names map).
    # NOTE(review): "Document", "__Entity__" and "MENTIONS" appear to be added by
    # add_graph_documents(include_source=True, baseEntityLabel=True) rather than
    # extracted by the LLM — confirm whether they should stay in these lists.
    debug_transformer = DebugLLMGraphTransformer(
        llm=llm,
        allowed_nodes=[
            "Jobprofile",
            "Classification",
            "Organization",
            "Behaviouralcompetency",  # Matches exact label spelling
            "Document",
            "__Entity__",
            "Education",
            "Experience",
            "SecurityScreening",
            "Accountability",
        ],
        allowed_relationships=[
            "HAS_CLASSIFICATION",
            "BELONGS_TO_ORGANIZATION",
            "REQUIRES_COMPETENCY",
            "MENTIONS",
            "HAS_EDUCATION_REQUIREMENT",
            "REQUIRES_EXPERIENCE",
            "REQUIRES_SCREENING",
            "HAS_ACCOUNTABILITY",
        ],
        # "id" is deliberately absent: it is the node identifier itself.
        # NOTE(review): recent langchain_experimental versions are understood to
        # reject "id" as a property name — confirm against the installed version.
        node_properties=["title", "requirement", "description"],
    )
else:
    debug_transformer = DebugLLMGraphTransformer(llm=llm)


## 4. Neo4j Graph Database Integration

In [13]:
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
import os

# Connection settings for the local Neo4j instance.
# setdefault() only fills in a value when the variable is not already set, so
# real settings supplied through the environment are not overwritten by these
# placeholders.
os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")
# Placeholder only — provide the real password through the environment.
os.environ.setdefault("NEO4J_PASSWORD", "your_password")

# No connection arguments are passed; presumably Neo4jGraph picks up the
# NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD environment variables — confirm
# against the langchain_neo4j documentation.
graph = Neo4jGraph()

Verify the Neo4j connection:

In [14]:
# Smoke test: run a trivial procedure call and print whatever the database returns.
print(graph.query("CALL db.info()"))

[{'id': '69116869CB35145331EE2540ACCB41CE6769FA0EF563941B7EF46E8944A02587', 'name': 'neo4j', 'creationDate': '2025-02-23T22:56:33.286Z'}]


In [None]:
# Run note from an earlier execution — it had got as far as:
#   Inserted 1 graph elements from batch 188
#   Processing document batch 189/1130
#
# Clear existing data. This wipes every node and relationship, so re-running
# this cell always rebuilds the graph from scratch.
graph.query("MATCH (n) DETACH DELETE n")

if not RUN_IN_BATCH:
    # Convert every chunk in one call, then store the whole result at once.
    graph_documents = debug_transformer.convert_to_graph_documents(chunks)
    print(graph_documents)

    graph.add_graph_documents(
        graph_documents,
        baseEntityLabel=True,    # Adds __Entity__ label for better indexing
        include_source=True      # Maintains document source relationships
    )
else:
    # Convert and store one chunk at a time, so everything processed so far is
    # already in the database if a later chunk fails.
    for i, chunk in enumerate(chunks):
        print(f"Processing document batch {i+1}/{len(chunks)}")

        # convert_to_graph_documents expects a list, hence the single-item wrapper.
        graph_doc = debug_transformer.convert_to_graph_documents([chunk])

        # Add to database immediately
        graph.add_graph_documents(
            graph_doc,
            baseEntityLabel=True,
            include_source=True
        )

        # Report what was actually extracted: len(graph_doc) is just the number
        # of graph documents in the batch, so count nodes and relationships instead.
        element_count = sum(
            len(doc.nodes) + len(doc.relationships) for doc in graph_doc
        )
        print(f"Inserted {element_count} graph elements from batch {i+1}")

In [None]:
# # Create nodes and relationships
# graph.query(
#     """
# MERGE (m:Movie {name:"Top Gun", runtime: 120})
# WITH m
# UNWIND ["Tom Cruise", "Val Kilmer", "Anthony Edwards", "Meg Ryan"] AS actor
# MERGE (a:Actor {name:actor})
# MERGE (a)-[:ACTED_IN]->(m)
# """
# )
# graph.refresh_schema()
# print(graph.schema)
# chain = GraphCypherQAChain.from_llm(
#     llm=llm, graph=graph, verbose=True, allow_dangerous_requests=True
# )
# chain.invoke({"query": "Who played in Top Gun?"})

In [None]:
# Ask the database for its schema visualization data; as the last expression in
# the cell, the query result is displayed as the cell output.
graph.query("CALL db.schema.visualization()")

In [None]:
# Refresh the schema held by the `graph` object, then print it.
graph.refresh_schema()
print(graph.schema)

In [None]:
# Sanity check: how many relationships were stored, and of which types?
result = graph.query("""
    MATCH ()-[r]->() 
    RETURN count(r) AS relationship_count,
           collect(distinct type(r)) AS relationship_types
""")
summary = result[0]  # the query yields a single aggregate row
print(f"Relationships Found: {summary['relationship_count']}")
print(f"Relationship Types: {summary['relationship_types']}")

In [None]:
# 1) Every Jobprofile together with the organizations it belongs to.
result = graph.query(
    "MATCH (jp:Jobprofile)-[:BELONGS_TO_ORGANIZATION]->(org:Organization) "
    "RETURN jp.id, org.id"
)
print(result)
print('========')

# 2) Documents titled 'Licensing Clerk' and the Jobprofiles they mention.
print(graph.query(
    "MATCH (d:Document {title: 'Licensing Clerk'})-[:MENTIONS]->(jp:Jobprofile) "
    "RETURN d.title, jp.id"
))
print('====== ai not workign: ')

# 3) AI-generated query that does not work: it filters Jobprofile on `title`,
#    confusing it with `id`.
print(graph.query(
    "MATCH (jp:Jobprofile {title: 'Licensing Clerk'})-[:BELONGS_TO_ORGANIZATION]->(o:Organization) "
    "RETURN o.id"
))
print('====== same prompt but claude: ')

# 4) Query produced from the same prompt by Claude; it filters on `id`.
print(graph.query(
    "MATCH (j:Jobprofile)-[:BELONGS_TO_ORGANIZATION]->(o:Organization) "
    "WHERE j.id = 'Licensing Clerk' RETURN o.id"
))


In [None]:
# Natural-language question answering over the graph: the chain is built from
# the LLM and the graph, then invoked with a question.
# Options tried earlier and left disabled:
#   exclude_types=['Document']
#   validate_cypher=True
#   schema_constraints={"Jobprofile": {"identifier": "id"}, "Document": {"identifier": "title"}}
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
    allow_dangerous_requests=True,
)

# The question itself tells the model to match on `id` rather than `title`.
# Prompting tip: use backticks for labels containing spaces, e.g. MATCH (jt:`Job Title`)
chain.invoke(
    {"query": "What organizations does the 'Licensing Clerk' profile belong to? Ensure title is treated as 'id' instead of 'title'"}
)

## 5. Graph Visualization with Pyvis

In [16]:
# # %% [code]
# from pyvis.network import Network

# # Initialize network with configuration
# net = Network(
#     notebook=True, 
#     cdn_resources="in_line", 
#     height="750px"
# )

# # Add nodes with metadata
# nodes = graph.query("""
#     MATCH (n) 
#     RETURN n.id as id, 
#            n.name as label, 
#            n.type as group
# """)

# # Process each node and add to network
# for node in nodes:
#     net.add_node(
#         node["id"],
#         label=node["label"],
#         group=node["group"],
#         title=f"Type: {node['group']}"
#     )

# # Add relationships with labels
# relationships = graph.query("""
#     MATCH (s)-[r]->(t) 
#     RETURN s.id as source, 
#            t.id as target, 
#            type(r) as label
# """)

# # Process each relationship and add to network
# for rel in relationships:
#     net.add_edge(
#         rel["source"], 
#         rel["target"],
#         label=rel["label"],
#         arrowStrikethrough=False
#     )

# # Generate and save interactive visualization
# net.show("job_network.html")

# NEW

# %% [code]
from pyvis.network import Network

# Interactive network rendered with inlined resources.
net = Network(
    notebook=True,
    cdn_resources="in_line",
)

# --- Nodes -------------------------------------------------------------------
# The other queries in this notebook address nodes by `id` and by label, so:
#   * label falls back to `id` when a node has no `name` property,
#   * group falls back to the node's first label other than the shared
#     `__Entity__` label when it has no `type` property,
#   * degree is computed here so the tooltip shows a real connection count,
#   * nodes without an `id` are skipped (they cannot be used as a node key).
nodes = graph.query("""
    MATCH (n)
    WHERE n.id IS NOT NULL
    OPTIONAL MATCH (n)-[r]-()
    RETURN n.id AS id,
           coalesce(n.name, n.id) AS label,
           coalesce(n.type, [l IN labels(n) WHERE l <> '__Entity__'][0], 'Unknown') AS group,
           count(r) AS degree
""")

for node in nodes:
    net.add_node(
        node["id"],
        label=node["label"],
        group=node["group"],
        title=f"Type: {node['group']}\nConnections: {node['degree']}",
        value=node.get("value", 10),  # constant size unless the query supplies a value
        borderWidth=2,                # clear node boundaries
        shape="dot",                  # consistent node shape
        font={"size": 18},            # improved label readability
    )

# --- Edges -------------------------------------------------------------------
# Same `id IS NOT NULL` filter as above, so every edge endpoint is a node that
# the node query also returned.
relationships = graph.query("""
    MATCH (s)-[r]->(t)
    WHERE s.id IS NOT NULL AND t.id IS NOT NULL
    RETURN s.id AS source,
           t.id AS target,
           type(r) AS label
""")

for rel in relationships:
    net.add_edge(
        rel["source"],
        rel["target"],
        label=rel["label"],
        value=rel.get("value", 1),    # constant weight unless the query supplies a value
        smooth={"type": "dynamic"},   # curved edge rendering
        arrowStrikethrough=False,
        physics=True,                 # enable edge spring behavior
    )

# --- Layout ------------------------------------------------------------------
# Repulsion solver with generous spacing.
net.set_options("""
{
    "physics": {
        "enabled": true,
        "solver": "repulsion",
        "repulsion": {
            "nodeDistance": 300,
            "springLength": 200
        }
    }
}
""")

# Write the visualization to a standalone HTML file.
net.write_html(
    "job_network.html",
    local=False,
    notebook=False,
)


Now add a vector embedding index:

In [None]:
# Add a vector store on top of the existing graph.
# Neo4jVector comes from langchain_neo4j, the same package that provides
# Neo4jGraph and GraphCypherQAChain earlier in this notebook.
from langchain_neo4j import Neo4jVector
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

# from langchain_openai import OpenAIEmbeddings
# This method pulls relevant text information from the database, and calculates
# and stores the text embeddings back to the database.
vector_store = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    index_name="document_embeddings",
    node_label="Jobprofile",                      # nodes to embed
    text_node_properties=["text","title","id"],   # properties used as the text to embed
    embedding_node_property="embedding",          # property the vectors are stored in
)

In [None]:
# Embed a throwaway string and print the vector length; this is the number to
# compare with `vector.dimensions` in the index-creation cell.
sample_embedding = embeddings.embed_query("test")
print(len(sample_embedding))

In [None]:
# Create a cosine-similarity vector index (384 dimensions) on Document.embedding.
# todo: is this needed?
# NOTE(review): the vector store cell uses the same index name
# ("document_embeddings") with node_label="Jobprofile", while this statement
# targets Document nodes. Because of IF NOT EXISTS this is presumably a no-op
# once an index with that name exists — confirm, and rename one of them if
# both indexes are wanted.
graph.query("""
CREATE VECTOR INDEX document_embeddings IF NOT EXISTS
FOR (n:Document) ON (n.embedding)
OPTIONS {
  indexConfig: {
    `vector.dimensions`: 384,
    `vector.similarity_function`: 'cosine'
  }
}
""")

In [None]:
# Retrieve the 3 nearest entries for a sample query and print just their text.
results = vector_store.similarity_search("Agriculture", k=3)
print([doc.page_content for doc in results])