In [1]:
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
import pandas as pd
from pathlib import Path

# Show the working directory: the relative env-file path below is resolved against it.
print(os.getcwd())

# Load variables from the env file that sits one directory above the notebook.
env_path = Path('..') / '.env.local'
load_dotenv(dotenv_path=env_path)

# Retrieve credentials
uri = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")

# Fail fast with a readable message instead of handing None to the driver,
# which would otherwise surface as an opaque error further down.
missing_vars = [
    var_name
    for var_name, var_value in (
        ("NEO4J_URI", uri),
        ("NEO4J_USER", username),
        ("NEO4J_PASSWORD", password),
    )
    if not var_value
]
if missing_vars:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(missing_vars)}. "
        f"Expected them in the environment or in {env_path.resolve()}"
    )

print(uri)

# Shared driver instance used by every cell below.
driver = GraphDatabase.driver(uri, auth=(username, password))

C:\Users\Tobias Fechner\Documents\2_Work\prisma\evaluate\analysis\notebooks
neo4j+s://7a2be29f.databases.neo4j.io


Don't run the cell below unless you want to recreate the index (which isn't necessary unless you have new data).

def drop_vector_index(driver, index_name):
    """Drop the index called ``index_name`` if it exists.

    Args:
        driver: Object exposing ``session()`` as a context manager whose
            session has a ``run(query)`` method (here: the Neo4j driver).
        index_name: Name of the index to drop. May contain hyphens or
            backticks; it is quoted and escaped before being sent.

    The confirmation message is printed whenever the statement succeeds,
    including the case where ``IF EXISTS`` made it a no-op.
    """
    # The name is spliced into the statement inside backticks (needed for
    # hyphens), so any backtick in the name itself must be doubled to keep
    # the identifier intact.
    escaped_name = index_name.replace("`", "``")
    cypher = f"DROP INDEX `{escaped_name}` IF EXISTS"
    with driver.session() as session:
        # consume() waits for the server to finish the statement, so any
        # failure is raised before the success message is printed.
        session.run(cypher).consume()
    print(f"🗑️ Dropped vector index: {index_name}")


# Remove the existing vector index so that it can be recreated afterwards.
drop_vector_index(driver=driver, index_name="voice-vector-index")


In [2]:
# Name of the Neo4j vector index; read by create_vector_index and by the VectorRetriever setup.
INDEX_NAME = "voice-vector-index"

The same goes for the next two cells: skip them unless you want to recreate the index.

# Index parameters (interpolated into the Cypher call in create_vector_index)
NODE_LABEL = "VoiceChunk"  # label of the nodes covered by the index
PROPERTY_NAME = "embedding"  # node property the index is built on
DIMENSIONS = 3072  # For OpenAI "text-embedding-3-large"
SIMILARITY_FUNCTION = "cosine"  # or 'euclidean'. NOTE(review): 'dot' was listed here but does not appear among Neo4j's documented similarity functions — confirm before using

# Create the index
def create_vector_index(
    driver,
    index_name=None,
    node_label=None,
    property_name=None,
    dimensions=None,
    similarity_function=None,
):
    """Create a vector index via ``db.index.vector.createNodeIndex``.

    Args:
        driver: Object exposing ``session()`` as a context manager whose
            session has a ``run(query, parameters)`` method (here: the
            Neo4j driver).
        index_name: Index name; defaults to the module-level ``INDEX_NAME``.
        node_label: Node label to index; defaults to ``NODE_LABEL``.
        property_name: Node property to index; defaults to ``PROPERTY_NAME``.
        dimensions: Vector dimension count; defaults to ``DIMENSIONS``.
        similarity_function: Similarity function name; defaults to
            ``SIMILARITY_FUNCTION``.

    Failures are reported with a printed message rather than raised, so the
    notebook keeps running (e.g. when the index already exists).

    NOTE(review): newer Neo4j releases document ``CREATE VECTOR INDEX`` as the
    replacement for this procedure — confirm against the target server version.
    """
    # Resolve defaults at call time so the module constants are only needed
    # when the caller does not pass a value explicitly.
    index_name = INDEX_NAME if index_name is None else index_name
    node_label = NODE_LABEL if node_label is None else node_label
    property_name = PROPERTY_NAME if property_name is None else property_name
    dimensions = DIMENSIONS if dimensions is None else dimensions
    similarity_function = (
        SIMILARITY_FUNCTION if similarity_function is None else similarity_function
    )

    # Values travel as query parameters instead of being pasted into quoted
    # literals, so a quote character in any of them cannot break the statement.
    cypher = """
    CALL db.index.vector.createNodeIndex(
        $index_name,
        $node_label,
        $property_name,
        $dimensions,
        $similarity_function
    )
    """
    params = {
        "index_name": index_name,
        "node_label": node_label,
        "property_name": property_name,
        "dimensions": dimensions,
        "similarity_function": similarity_function,
    }
    with driver.session() as session:
        try:
            # consume() waits for the server's final response, so success is
            # only reported once the call has actually completed.
            session.run(cypher, params).consume()
            print(f"✅ Vector index '{index_name}' created successfully.")
        except Exception as e:
            print(f"❌ Failed to create vector index: {e}")

# Build the vector index using the parameters defined above.
create_vector_index(driver=driver)

In [3]:
from neo4j import GraphDatabase
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.generation import GraphRAG
from neo4j_graphrag.embeddings import OpenAIEmbeddings

In [4]:
# Embedding model used to vectorise the query text.
embedder = OpenAIEmbeddings(model="text-embedding-3-large")

# Plain vector retriever over the index named by INDEX_NAME.
retriever = VectorRetriever(
    driver=driver,
    index_name=INDEX_NAME,
    embedder=embedder,
)

# Answer-generating LLM.
# Note: the OPENAI_API_KEY must be in the env vars
llm_params = {"temperature": 0}
llm = OpenAILLM(model_name="gpt-4o", model_params=llm_params)

# Combine retriever and LLM into the RAG pipeline.
rag = GraphRAG(retriever=retriever, llm=llm)

In [5]:
# Ask the RAG pipeline for stakeholders, drawing on the 25 closest chunks.
query_text = (
    "List as many tangible stakeholders that were referred to. "
    "Exclude Prisma, ARC, Wada and Dream Village. "
    "Examples of stakeholders are: companies, communities, institutions, governments etc."
)
search_config = {"top_k": 25}
response = rag.search(query_text=query_text, retriever_config=search_config)
print(response.answer)

1. University community (including students and licensed counselors)
2. Neighborhoods as potential communities
3. Investee systems and investor body as a community
4. University contacts and referral networks
5. Schools and clubs


### Filter by participant

In [7]:
from neo4j_graphrag.retrievers import VectorCypherRetriever

# Cypher to keep only voice chunks whose entry was sent by a participant
# (p.role = 'participant'). `node` and `score` are referenced as given by the
# retriever's vector index lookup.
# NOTE(review): the role filter presumably runs after the top_k nearest chunks
# are selected, so fewer than top_k rows may reach the LLM — confirm.
retrieval_query = """
MATCH (node)<-[:HAS_CHUNK]-(v:Voice)<-[:HAS_VOICE]-(e:Entry)-[:SENT_BY]->(p:Participant)
WHERE p.role = 'participant'
RETURN node.chunk_text AS content, score
"""

# Set up the retriever
retriever = VectorCypherRetriever(
    driver=driver,
    index_name=INDEX_NAME,                 # shared constant, so it cannot drift from the created index
    retrieval_query=retrieval_query,
    embedder=embedder
)

# Re-initialize the RAG pipeline
rag = GraphRAG(retriever=retriever, llm=llm)

# Example query
response = rag.search(query_text="List all organisational entities, stakeholders and fields of work that were referred to. Examples of stakeholders are: companies, communities, institutions, governments etc. Give an example with real text.", retriever_config={"top_k": 45})

# Print results
print(response.answer)

Based on the provided context, the organizational entities, stakeholders, and fields of work referred to include:

1. **Investees**: Mentioned in the context of focusing on investee systems and reaching out to them. Example: "we need to reach out to the investee."

2. **Communities**: Discussed as potential stakeholders, with different types of communities being considered, such as investor bodies and neighborhoods. Example: "the investor body could be one community. Neighborhoods could be another community."

3. **Dream Village**: Referred to in the context of expected activities and contributions. Example: "we also spoke about Dream Village, what we should expect from Dream Village."

4. **Recycling Companies**: Mentioned in the context of meeting with operation managers. Example: "we were able to meet the recycling companies. We met the operation managers for two recycling companies."

5. **Transport Companies**: Briefly mentioned in the context of not needing buses. Example: "there

### Filter by participant AND include chunk parent node for whole context

In [None]:
from neo4j_graphrag.types import RetrieverResultItem

def result_formatter(record, max_context_chars=1500):
    """Turn one retrieval record into a ``RetrieverResultItem``.

    Args:
        record: Mapping-like row exposing ``get``; the keys ``chunk_text``,
            ``full_transcription`` and ``score`` are read.
        max_context_chars: Maximum number of characters of the full
            transcription to include (default 1500), to limit token usage.

    Returns:
        A ``RetrieverResultItem`` whose content is the chunk followed by the
        (possibly truncated) transcription, with the score in its metadata.
    """
    chunk = record.get("chunk_text")
    # A missing or null transcription becomes an empty string so the slice
    # below cannot raise TypeError.
    full_text = record.get("full_transcription") or ""
    score = record.get("score")

    # Truncate to avoid token overflow; only mark with an ellipsis when text
    # was actually cut off.
    context = full_text[:max_context_chars]
    if len(full_text) > max_context_chars:
        context += "..."

    # Compose content with chunk + hint of full text
    content = f"Chunk: {chunk}\n\nContext: {context}"

    return RetrieverResultItem(
        content=content,
        metadata={"score": score}
    )

In [None]:
# Cypher to keep only voice chunks whose entry was sent by a participant
# (p.role = 'participant') and to return the parent Voice node's full
# transcription alongside each chunk. `node` and `score` are referenced as
# given by the retriever's vector index lookup.
retrieval_query = """
MATCH (node:VoiceChunk)<-[:HAS_CHUNK]-(v:Voice)
MATCH (v)<-[:HAS_VOICE]-(e:Entry)-[:SENT_BY]->(p:Participant)
WHERE p.role = 'participant'
RETURN
  node.chunk_text AS chunk_text,
  v.transcription AS full_transcription,
  score
"""

# Set up the retriever
retriever = VectorCypherRetriever(
    driver=driver,
    index_name=INDEX_NAME,                 # shared constant, so it cannot drift from the created index
    retrieval_query=retrieval_query,
    embedder=embedder,
    result_formatter=result_formatter
)

# Re-initialize the RAG pipeline
rag = GraphRAG(retriever=retriever, llm=llm)

# Example query
response = rag.search(query_text="What were the main facilitated exercises (workshops, scheduled agenda items, activities, sessions) that the participants found interesting or valuable? Value can be understood in terms of having created a shift in their perspective. Reflective data is favoured. Use only the context provided. Give examples of the reflections for each, using the exact text from the data. Present in the format Session Name: Reflection. Try to retrieve 5 distinct sessions.", retriever_config={"top_k": 25})

# Print results
print(response.answer)