# Vector Embeddings Setup for DPP PoC

This notebook handles:
- Text chunking from exhibits
- Vector embedding generation
- Neo4j vector index creation
- Chunk-to-exhibit relationships

# Vector Embeddings Setup for DPP PoC

This notebook handles:
- Text chunking from exhibits
- Vector embedding generation
- Neo4j vector index creation
- Chunk-to-exhibit relationships

In [30]:

# Environment diagnostics: print the interpreter version, the installed
# sentence-transformers version, and the home / working directories.
import sys
print(f"Python version: {sys.version}")
# Imported in this cell to report the installed sentence-transformers version.
import sentence_transformers
print(f"sentence-transformers version: {sentence_transformers.__version__}")


# Show the home directory and the current working directory.
from pathlib import Path
print(f"Path.home(): {Path.home()}")
print(f"Current working directory: {Path.cwd()}")

Python version: 3.12.12 (main, Oct 21 2025, 02:11:22) [GCC 14.2.0]
sentence-transformers version: 5.1.2
Path.home(): /root
Current working directory: /app/notebooks


In [30]:

# Environment diagnostics: print the interpreter version, the installed
# sentence-transformers version, and the home / working directories.
import sys
print(f"Python version: {sys.version}")
# Imported in this cell to report the installed sentence-transformers version.
import sentence_transformers
print(f"sentence-transformers version: {sentence_transformers.__version__}")


# Show the home directory and the current working directory.
from pathlib import Path
print(f"Path.home(): {Path.home()}")
print(f"Current working directory: {Path.cwd()}")

Python version: 3.12.12 (main, Oct 21 2025, 02:11:22) [GCC 14.2.0]
sentence-transformers version: 5.1.2
Path.home(): /root
Current working directory: /app/notebooks


In [30]:

# Environment diagnostics: print the interpreter version, the installed
# sentence-transformers version, and the home / working directories.
import sys
print(f"Python version: {sys.version}")
# Imported in this cell to report the installed sentence-transformers version.
import sentence_transformers
print(f"sentence-transformers version: {sentence_transformers.__version__}")


# Show the home directory and the current working directory.
from pathlib import Path
print(f"Path.home(): {Path.home()}")
print(f"Current working directory: {Path.cwd()}")

Python version: 3.12.12 (main, Oct 21 2025, 02:11:22) [GCC 14.2.0]
sentence-transformers version: 5.1.2
Path.home(): /root
Current working directory: /app/notebooks


In [31]:
import os
import re
import uuid
from pathlib import Path

import nltk
import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase, Query
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer

nltk.download('punkt', quiet=True)

# Load environment variables. On AzureML the .env file is expected in the home
# directory; if it is not there, fall back to load_dotenv()'s default lookup.
home_env = Path.home() / '.env'
if home_env.exists():
    load_dotenv(home_env)
else:
    load_dotenv()  # fallback to local .env

# Neo4j connection settings are read from the environment, never hard-coded.
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USER = os.getenv('NEO4J_USER')
NEO4J_PASS = os.getenv('NEO4J_PASS')

# Fail fast with a clear message instead of passing None to the Neo4j driver
# several cells later, where the resulting error is much harder to diagnose.
_missing_env = [
    env_name
    for env_name, env_value in (
        ('NEO4J_URI', NEO4J_URI),
        ('NEO4J_USER', NEO4J_USER),
        ('NEO4J_PASS', NEO4J_PASS),
    )
    if env_value is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in a .env file or in the process environment."
    )

EXHIBIT_DIR = Path("data/exhibits")  # folder with E*.txt
NODES_CSV   = "data/lantana_charge_bundle/POLE_nodes.csv"      # already uploaded

EMBED_MODEL = "all-MiniLM-L6-v2"   # <-- change here if you switch models
EMBED_DIM   = 384                  # <-- must match your Neo4j vector index

In [32]:
def _pack_pieces(pieces, max_chars):
    """Greedily join pieces with single spaces into chunks of at most max_chars.

    Each piece is stripped and empty pieces are skipped. The running length
    counts one separator per joined piece, so the yielded string itself (not
    just the sum of its parts) respects ``max_chars``. A single piece that is
    longer than ``max_chars`` is yielded on its own, unsplit.
    """
    buf = []
    buf_len = 0  # always equal to len(' '.join(buf))
    for piece in pieces:
        piece = piece.strip()
        if not piece:
            continue
        joined_len = buf_len + 1 + len(piece) if buf else len(piece)
        if buf and joined_len > max_chars:
            yield ' '.join(buf)
            buf, buf_len = [piece], len(piece)
        else:
            buf.append(piece)
            buf_len = joined_len
    if buf:
        yield ' '.join(buf)


def chunk_text(text, max_chars=400):
    """Split text into chunks, handling legal document structure.

    Strategy, in order:
      1. Split on newlines that precede a section marker ("A.", "1.",
         "Summary:", "Remarks:").
      2. If no such markers exist, split on blank-line paragraph breaks.
      3. If there is at most one paragraph, pack individual lines.

    A section or paragraph that fits within ``max_chars`` is yielded whole
    (stripped). A longer one is split after full stops and the sentences are
    packed greedily so that each joined chunk is at most ``max_chars`` long.

    Args:
        text: The full document text.
        max_chars: Maximum length of a yielded chunk. A single sentence or
            line longer than this is still yielded as one oversized chunk.

    Yields:
        str: Non-empty chunks in document order.
    """
    # First try splitting by logical sections (A., B., C., Summary:, etc.)
    sections = re.split(r'\n(?=[A-Z]\.|Summary:|Remarks:|\d+\.)', text)

    if len(sections) > 1:
        blocks = sections
    else:
        # No logical sections found, fall back to paragraph splitting.
        paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
        if len(paras) <= 1:
            # No paragraph breaks either: pack individual lines.
            yield from _pack_pieces(text.split('\n'), max_chars)
            return
        blocks = paras

    for block in blocks:
        block = block.strip()
        if not block:
            continue
        if len(block) <= max_chars:
            yield block
        else:
            # Block too long: split after full stops and pack the sentences.
            yield from _pack_pieces(re.split(r'(?<=\.)\s+', block), max_chars)

In [33]:
# Load the embedding model by name (rather than from a local path) for
# AzureML compatibility.
print("Loading embedding model...")
model = SentenceTransformer(EMBED_MODEL)
print("✅ Downloaded model from HuggingFace")

# Fail fast if the configured dimension does not match the loaded model:
# EMBED_DIM is used to create the Neo4j vector index, so a mismatch would only
# surface later, when embeddings are written or queried.
_model_dim = model.get_sentence_embedding_dimension()
if _model_dim is not None and _model_dim != EMBED_DIM:
    raise ValueError(
        f"EMBED_DIM={EMBED_DIM} does not match the {_model_dim}-dimensional "
        f"output of model '{EMBED_MODEL}'"
    )

def embed(text):
    """Return the embedding of ``text`` as a plain Python list.

    Encodes with the module-level ``model`` (progress bar disabled) and
    converts the encoded result with ``.tolist()``.
    """
    encoded = model.encode(text, show_progress_bar=False)
    return encoded.tolist()

# Read the node table and collect its distinct, non-null "label" values as a
# sorted list of strings; these are the names used later for mention linking.
print("Loading entity labels...")
nodes = pd.read_csv(NODES_CSV)
label_strings = nodes["label"].dropna().astype(str)
labels = sorted(label_strings.unique().tolist())
print(f"✅ Loaded {len(labels)} entity labels for mention linking")

Loading embedding model...
✅ Downloaded model from HuggingFace
Loading entity labels...
✅ Loaded 32 entity labels for mention linking


In [34]:
# CLEANUP: Remove existing chunks to rebuild with better chunking.
# Destructive: deletes every :Chunk node (with its relationships) and drops the
# chunk_vec vector index in the "dpppoc" database.
print("Cleaning up existing chunks...")
# NOTE(review): this prints the server address into the saved notebook output.
print(f"neo: {NEO4J_URI}")

# Context managers guarantee that the session and the driver are closed even
# if one of the queries raises.
with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS)) as driver, driver.session(database="dpppoc") as s:
    # Remove all chunks and their relationships
    result = s.run("MATCH (c:Chunk) DETACH DELETE c RETURN count(*) as deleted")
    deleted = result.single()["deleted"]
    print(f"✅ Deleted {deleted} existing chunks")

    # Also remove the vector index to recreate it fresh. Best-effort: a
    # failure is reported but does not abort the cleanup.
    try:
        # consume() forces the statement to finish here, so any server-side
        # error is raised inside this try block rather than at session close.
        s.run("DROP INDEX chunk_vec IF EXISTS").consume()
        print("✅ Dropped existing vector index")
    except Exception as e:
        print(f"Note: {e}")

print("Ready to rebuild chunks with improved chunking strategy.")

Cleaning up existing chunks...
neo: bolt://20.58.149.213:7687
✅ Deleted 20 existing chunks
✅ Dropped existing vector index
Ready to rebuild chunks with improved chunking strategy.


In [35]:
print("Creating vector embeddings and chunks...")


def _locate_chunk(text, chunk, cursor):
    """Return the ``(start, end)`` character span of ``chunk`` inside ``text``.

    The search starts at ``cursor`` and lets every whitespace run in the chunk
    match any whitespace run in the text, so a chunk whose pieces were stripped
    and re-joined with single spaces can still be located in the original.
    If no match is found, ``(cursor, cursor + len(chunk))`` is returned as an
    estimate.
    """
    tokens = chunk.split()
    if tokens:
        pattern = re.compile(r"\s+".join(re.escape(tok) for tok in tokens))
        match = pattern.search(text, cursor)
        if match:
            return match.start(), match.end()
    return cursor, cursor + len(chunk)


# Check if exhibits exist first
exhibit_files = sorted(EXHIBIT_DIR.glob("E*.txt"))
print(f"Found {len(exhibit_files)} exhibit files")

if not exhibit_files:
    print("⚠️  No exhibit files found. Check EXHIBIT_DIR path.")
else:
    # Context managers guarantee that the session and the driver are closed
    # even if embedding or a query fails part-way through.
    with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS)) as driver, driver.session(database="dpppoc") as s:
        # Ensure constraints/index exist (safe to run repeatedly)
        print("Setting up database constraints and indexes...")
        s.run("CREATE CONSTRAINT exhibit_id IF NOT EXISTS FOR (e:Exhibit) REQUIRE e.id IS UNIQUE")
        s.run("CREATE CONSTRAINT chunk_id   IF NOT EXISTS FOR (c:Chunk)   REQUIRE c.id IS UNIQUE")
        s.run("""
        CREATE VECTOR INDEX chunk_vec IF NOT EXISTS FOR (c:Chunk) ON (c.embedding)
        OPTIONS { indexConfig: { `vector.dimensions`: $dim, `vector.similarity_function`: "cosine" } }
        """, dim=EMBED_DIM)

        total_chunks = 0
        for i, p in enumerate(exhibit_files, 1):
            print(f"Processing {p.name} ({i}/{len(exhibit_files)})...")

            text = p.read_text(encoding="utf-8", errors="ignore")
            # Exhibit id is the leading "E<digits>" of the file stem, or the
            # whole stem when the name does not follow that pattern.
            m = re.match(r"^(E\d+)", p.stem)
            ex_id = m.group(1) if m else p.stem

            # Upsert Exhibit
            s.run("""
            MERGE (e:Exhibit {id:$id})
            ON CREATE SET e.filename=$file
            ON MATCH  SET e.filename=$file
            """, id=ex_id, file=p.name)

            # Process chunks for this exhibit
            chunks = list(chunk_text(text))
            print(f"  Creating {len(chunks)} chunks...")

            # NOTE: every run creates chunks with fresh random ids, so run the
            # cleanup cell first to avoid duplicating chunks.
            cursor = 0  # position in `text` just past the previous chunk
            for chunk_num, para in enumerate(chunks, 1):
                if chunk_num % 5 == 0:  # Progress indicator
                    print(f"    Chunk {chunk_num}/{len(chunks)}")

                cid = str(uuid.uuid4())
                vec = embed(para)
                # Real character offsets of this chunk in the exhibit text,
                # instead of assuming a fixed gap between consecutive chunks.
                start, end = _locate_chunk(text, para, cursor)

                s.run("""
                MATCH (e:Exhibit {id:$ex})
                CREATE (c:Chunk {
                  id:$id, text:$text, exhibit_id:$ex,
                  source_file:$file, offset_start:$start, offset_end:$end,
                  embedding:$emb
                })
                MERGE (c)-[:FROM_EXHIBIT]->(e)
                """, ex=ex_id, id=cid, text=para, file=p.name, start=start, end=end, emb=vec)
                cursor = end

            total_chunks += len(chunks)
            print(f"  ✅ Completed {p.name}")

    print(f"✅ Created {total_chunks} chunks from {len(exhibit_files)} exhibits")

Creating vector embeddings and chunks...
Found 9 exhibit files
Setting up database constraints and indexes...
Processing E0_Charge_Brief.txt (1/9)...
  Creating 19 chunks...
    Chunk 5/19
    Chunk 10/19
    Chunk 15/19
  ✅ Completed E0_Charge_Brief.txt
Processing E1_Forensic_Scene_Photographs.txt (2/9)...
  Creating 2 chunks...
  ✅ Completed E1_Forensic_Scene_Photographs.txt
Processing E2_Autopsy_Report.txt (3/9)...
  Creating 3 chunks...
  ✅ Completed E2_Autopsy_Report.txt
Processing E3_Mobile_Phone_Tower_Log.txt (4/9)...
  Creating 2 chunks...
  ✅ Completed E3_Mobile_Phone_Tower_Log.txt
Processing E4_Vehicle_Examination_Report.txt (5/9)...
  Creating 2 chunks...
  ✅ Completed E4_Vehicle_Examination_Report.txt
Processing E5_Interview_Patrick_Phelan.txt (6/9)...
  Creating 3 chunks...
  ✅ Completed E5_Interview_Patrick_Phelan.txt
Processing E6_Therapy_Notes_and_Fibre_Analysis.txt (7/9)...
  Creating 2 chunks...
  ✅ Completed E6_Therapy_Notes_and_Fibre_Analysis.txt
Processing E7_Witne

In [6]:
# Create mention relationships between chunks and entities.
# A chunk MENTIONS an entity when the chunk text contains the entity label as a
# case-insensitive substring (no word-boundary check, so a very short label can
# also match inside longer words).
print("Creating mention relationships...")

# Transaction timeout for each batch, in seconds. It has to be attached to a
# neo4j.Query object: keyword arguments given to session.run() are sent as
# Cypher query parameters, so run(..., timeout=...) would not set a timeout.
MENTION_TIMEOUT_S = 30

mention_query = Query("""
        WITH $labels AS labels
        UNWIND labels AS name
        MATCH (c:Chunk)
        WHERE toLower(c.text) CONTAINS toLower(name)
        OPTIONAL MATCH (p:Person   {label:name})
        OPTIONAL MATCH (o:Object   {label:name})
        OPTIONAL MATCH (l:Location {label:name})
        OPTIONAL MATCH (e:Event    {label:name})
        FOREACH (_ IN CASE WHEN p IS NULL THEN [] ELSE [1] END | MERGE (c)-[:MENTIONS]->(p))
        FOREACH (_ IN CASE WHEN o IS NULL THEN [] ELSE [1] END | MERGE (c)-[:MENTIONS]->(o))
        FOREACH (_ IN CASE WHEN l IS NULL THEN [] ELSE [1] END | MERGE (c)-[:MENTIONS]->(l))
        FOREACH (_ IN CASE WHEN e IS NULL THEN [] ELSE [1] END | MERGE (c)-[:MENTIONS]->(e))
        """, timeout=MENTION_TIMEOUT_S)

# Context managers guarantee that the session and the driver are closed even
# if a batch fails.
with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS)) as driver, driver.session(database="dpppoc") as s:
    # Process in smaller batches to avoid timeout
    batch_size = 20
    for i in range(0, len(labels), batch_size):
        batch_labels = labels[i:i+batch_size]
        print(f"Processing entities {i+1}-{min(i+batch_size, len(labels))} of {len(labels)}...")

        # consume() makes the batch finish (and any error surface) before the
        # next batch is announced.
        s.run(mention_query, labels=batch_labels).consume()

    print("✅ Mention relationships created")

Creating mention relationships...
Processing entities 1-20 of 32...
Processing entities 21-32 of 32...
✅ Mention relationships created


In [36]:
# Test vector search functionality: embed a sample question and query the
# chunk_vec index for the five most similar chunks.
print("Testing vector search...")

# Reuse the existing model instead of reloading
question = "Who was the last person to see the victim alive?"
print(f"Query: {question}")

qvec = embed(question)  # Use the embed function we already defined

# Set only when the index actually returned results, so the final success
# message is not printed after a warning.
search_ok = False

# Context managers guarantee that the session and the driver are closed even
# if the query raises.
with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS)) as driver, driver.session(database="dpppoc") as s:
    # First check if we have any chunks
    chunk_count = s.run("MATCH (c:Chunk) RETURN count(c) AS count").single()["count"]
    print(f"Found {chunk_count} chunks in database")

    if chunk_count > 0:
        result = s.run("""
            CALL db.index.vector.queryNodes('chunk_vec', 5, $embedding)
            YIELD node AS c, score
            RETURN c.exhibit_id AS exhibit, c.source_file AS file, c.text AS snippet, score
            ORDER BY score DESC
            LIMIT 5
        """, embedding=qvec)

        print("\nVector search test results:")
        results = list(result)
        if results:
            for row in results:
                print(f"{row['exhibit']} | score={row['score']:.3f}")
                print(f"{row['snippet'][:200]}...\n---\n")
            print(f"✅ Vector search working! Found {len(results)} results")
            search_ok = True
        else:
            print("⚠️  No vector search results - check if vector index is populated")
    else:
        print("⚠️  No chunks found - run the embedding creation cells first")

if search_ok:
    print("✅ Vector embeddings setup complete!")

Testing vector search...
Query: Who was the last person to see the victim alive?
Found 39 chunks in database

Vector search test results:
E0 | score=0.748
Somers died as a result of blunt force trauma to the head consistent with an assault. Based on the totality of evidence obtained, Patrick Phelan, male, 39 years, is charged with Murder (Section 18, Cr...
---

E0 | score=0.746
1. Executive Summary
This brief of evidence concerns the unlawful killing of Dr. Valerie Somers, psychologist, aged 42 years, whose body was recovered from bushland near Lantana Park, Sydney, on 2 Dec...
---

E2 | score=0.722
Exhibit E2 – Autopsy Report
Victim: Dr. Valerie Somers
Examiner: Dr. Elaine Wong, NSW Forensic Pathology
Date: 3 December 2001...
---

E0 | score=0.719
2. Persons Involved
Victim: Dr. Valerie Somers – Clinical psychologist; resident of Lane Cove; estranged from husband John Knox. Accused: Patrick Phelan – 39; unemployed; neighbour of John Knox; prior...
---

E5 | score=0.711
Summary:
- Init