#### Confirm Working

In [1]:
# Confirm Jupyter and kernel works
import sys
print(sys.executable)
print("Kernel working. Proceed")

/home/sepeh/.cache/pypoetry/virtualenvs/graphrag-developer-challenge3-4suqu0LP-py3.12/bin/python
Kernel working. Proceed


#### Connect to Neo4j

In [2]:
# Connect to Neo4j
import os
import sys

# Add parent directory to Python path so we can import KnowledgeGraph
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
print(f"Current directory: {os.getcwd()}")
print(f"Parent directory: {parent_dir}")
print(f"Parent dir exists: {os.path.exists(parent_dir)}")
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
print(f"KnowledgeGraph exists: {os.path.exists(os.path.join(parent_dir, 'KnowledgeGraph'))}")

from KnowledgeGraph.knowledgegraph import ingest_Chunks, create_nodes, create_relationship, create_vector_index, embed_text
from KnowledgeGraph.chunking_strategy import ChunkingConfig, chunk_document_text
from KnowledgeGraph.config import load_neo4j_graph

import json

# Error handling for Neo4j connection
try:
    graph, openAI_api, openAI_endpoint, openAI_model = load_neo4j_graph()
    print("Successfully connected to Neo4j!")
except ValueError as e:
    print(f"Warning: {e}")
    print("Please set up Neo4j (local or AuraDB) and update .env file")
    raise

Current directory: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/Main Functions
Parent directory: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3
Parent dir exists: True
KnowledgeGraph exists: True
Successfully connected to Neo4j!


#### Optional Reset

In [3]:
# -------------------------------------------------------------
# Cleanup (optional reset between runs)
# -------------------------------------------------------------
labels_to_wipe = ['Chunk', 'Document', 'Publication', 'Gazette', 'Issue', 'Issues', 'Page', 'Section']
print(f"Deleting old {', '.join(labels_to_wipe)} nodes...")
try:
    graph.query("""
    MATCH (n)
    WHERE n:Chunk OR n:Document OR n:Publication OR n:Gazette OR n:Issue OR n:Issues OR n:Page OR n:Section
    DETACH DELETE n
    """)
    print("Old nodes deleted successfully.\n")
except Exception as e:
    print(f"Cleanup skipped: {e}\n")

Deleting old Chunk, Document, Publication, Gazette, Issue, Issues, Page, Section nodes...


Old nodes deleted successfully.



### <r> Clear Embeddings </r> ###

In [4]:
# -------------------------------------------------------------
# Recreate vector index at dims from .env and clear old vectors
# -------------------------------------------------------------
from textwrap import dedent

def run(q, params=None):
    return graph.query(q, params or {})

# 1) Drop old index (idempotent)
try:
    run("DROP INDEX `Chunk` IF EXISTS")
    print("Dropped index `Chunk` (if it existed).")
except Exception as e:
    print(f"Drop index skipped: {e}")

# 2) Clear old vectors (they may be wrong dims)
run("MATCH (c:Chunk) REMOVE c.textEmbeddingOpenAI")
print("Cleared old embeddings from Chunk nodes.")

# 3) Recreate index using dims from .env (OPENAI_EMBED_DIM)
from KnowledgeGraph.knowledgegraph import create_vector_index
create_vector_index(graph=graph, index_name='Chunk')  # reads OPENAI_EMBED_DIM
print("Recreated vector index `Chunk` with dims from OPENAI_EMBED_DIM.")

# 4) Show index options to confirm dims
rows = run(dedent("""
SHOW INDEXES
YIELD name, type, entityType, labelsOrTypes, properties, options
WHERE name = 'Chunk'
RETURN name, options
"""))
print("Index info:", rows[0] if rows else "Index not found")


Dropped index `Chunk` (if it existed).
Cleared old embeddings from Chunk nodes.
Recreated vector index `Chunk` with dims from OPENAI_EMBED_DIM.
Index info: {'name': 'Chunk', 'options': {'indexProvider': 'vector-2.0', 'indexConfig': {'vector.hnsw.m': 16, 'vector.hnsw.ef_construction': 100, 'vector.dimensions': 3072, 'vector.similarity_function': 'COSINE', 'vector.quantization.enabled': True}}}


#### Loop through each file and ingest main nodes and chunk data

In [5]:
import json, re, uuid, shutil, time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

PROJECT_ROOT = Path(os.getcwd())
SOURCE_DIR = (PROJECT_ROOT.parent / "KnowledgeGraph" / "source_data").resolve()
GAZETTE_NAME = "Kuwait Today / Al-Kuwait Al-Youm"
CHUNK_CONFIG = ChunkingConfig()
max_workers = int(os.getenv("MAX_WORKERS", str(min(16, (os.cpu_count() or 4) * 2))))

if not SOURCE_DIR.exists():
    raise FileNotFoundError(f"Expected source markdown directory at {SOURCE_DIR}")

tqdm.write(f"Project root: {PROJECT_ROOT}", file=sys.stderr)
tqdm.write(f"Source folder: {SOURCE_DIR}", file=sys.stderr)

METADATA_RE = re.compile(r"<document_metadata>\s*(\{.*?\})\s*</document_metadata>", re.DOTALL)
DATE_FROM_FILENAME_RE = re.compile(r"(\d{4})[-_](\d{2})[-_](\d{2})")
SUPPLEMENT_NUMBER_RE = re.compile(r"Supplement\s*(?:No\.?|Number)?\s*(\d+)", re.IGNORECASE)
ISSUE_RE = re.compile(r"Issue\s*(?:No\.?|Number)?\s*(\d{3,4})", re.IGNORECASE)
BLOCK_RE = re.compile(r"<(document_metadata|page_metadata)>.*?</\1>", re.DOTALL)
PAGE_START_RE = re.compile(r"<page_start[^>]*>(\d+)</page_start>")
PAGE_END_RE = re.compile(r"<page_end[^>]*>(\d+)</page_end>")
DOC_NUMBER_PATTERNS = [
    re.compile(r"No\.?\s*\(?\s*([A-Za-z0-9\s\-/–—]+)\s*\)?\s*of\s*(\d{4})", re.IGNORECASE),
    re.compile(r"No\.?\s*[:\-]?\s*([A-Za-z0-9\s\-/–—]+)", re.IGNORECASE),
]
TOKEN_RE = re.compile(r"\S+\s*")

NUMBER_WORDS = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4", "five": "5",
    "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10", "eleven": "11",
    "twelve": "12", "thirteen": "13", "fourteen": "14", "fifteen": "15", "sixteen": "16",
    "seventeen": "17", "eighteen": "18", "nineteen": "19", "twenty": "20",
    "first": "1", "second": "2", "third": "3", "fourth": "4", "fifth": "5",
    "sixth": "6", "seventh": "7", "eighth": "8", "ninth": "9", "tenth": "10"
}
ROMAN_MAP = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
DOC_KEYWORDS = [
    "decree-law", "decree", "decision", "resolution", "tender", "notice", "law",
    "memorandum", "circular", "announcement", "order", "statement", "invitation",
    "award", "regulation", "contract", "agreement", "ministerial decision",
    "council", "committee", "correction", "report"
]


def _parse_int(value):
    try:
        text = str(value).strip()
        if not text:
            return None
        return int(text)
    except Exception:
        return None


def _roman_to_int(token: str) -> int | None:
    total = 0
    prev = 0
    for ch in token.upper():
        if ch not in ROMAN_MAP:
            return None
        value = ROMAN_MAP[ch]
        if value > prev:
            total += value - 2 * prev
        else:
            total += value
        prev = value
    return total


def parse_publication_header(text: str, filename: str) -> dict:
    slug = Path(filename).stem
    metadata = {}
    match = METADATA_RE.search(text)
    if match:
        try:
            metadata = json.loads(match.group(1))
        except json.JSONDecodeError:
            metadata = {}

    header_slice = text[:3000]
    header_plain = re.sub(r"<[^>]+>", " ", header_slice)

    publication_date = metadata.get("document_date") or metadata.get("publication_date")
    if not publication_date:
        date_match = DATE_FROM_FILENAME_RE.search(slug)
        if date_match:
            publication_date = "-".join(date_match.groups())

    issue_number = _parse_int(metadata.get("issue_number"))
    if issue_number is None:
        issue_match = ISSUE_RE.search(header_plain)
        if issue_match:
            issue_number = _parse_int(issue_match.group(1))

    volume_number = _parse_int(metadata.get("volume_number"))
    if volume_number is None and "Seventy-First" in header_plain:
        volume_number = 71

    title = metadata.get("document_title")

    supplement_index = 0
    sup_match = SUPPLEMENT_NUMBER_RE.search(title or "") or SUPPLEMENT_NUMBER_RE.search(header_plain)
    if sup_match:
        candidate = _parse_int(sup_match.group(1)) or 0
        if issue_number and candidate == issue_number:
            supplement_index = 1
        elif candidate > 50:
            supplement_index = 1
        else:
            supplement_index = candidate
    elif re.search(r"Supplement", title or "", re.IGNORECASE) or re.search(r"Supplement", header_plain, re.IGNORECASE):
        supplement_index = 1

    publication_date = publication_date or "Unknown"
    issue_number = issue_number or 0
    volume_number = volume_number or 0

    publication_key = f"{volume_number}-{issue_number}-{supplement_index}"

    return {
        "slug": slug,
        "publication_date": publication_date,
        "volume_number": volume_number,
        "issue_number": issue_number,
        "supplement_index": supplement_index,
        "is_supplement": supplement_index > 0,
        "title": title,
        "publication_key": publication_key,
        "metadata": metadata,
    }


def _normalize_doc_number(base: str, year: str | None) -> str:
    for char in "\u2013\u2014\u2012\u2212":
        base = base.replace(char, "-")
    base = re.sub(r"\s+", "", base.strip())
    if year and year not in base:
        base = f"{base}/{year}"
    return base


def _extract_doc_number(text: str) -> str | None:
    for pattern in DOC_NUMBER_PATTERNS:
        match = pattern.search(text)
        if match:
            base = match.group(1)
            year = match.group(2) if len(match.groups()) >= 2 else None
            return _normalize_doc_number(base, year)
    return None


def _determine_doc_type(header: str) -> str:
    lower = header.lower()
    mapping = [
        (r"ministerial decision", "Decision"),
        (r"decree-law", "Decree-Law"),
        (r"emiri decree", "Decree"),
        (r"amiri decree", "Decree"),
        (r"decree", "Decree"),
        (r"resolution", "Resolution"),
        (r"decision", "Decision"),
        (r"memorandum", "Memorandum"),
        (r"circular", "Circular"),
        (r"tender", "Tender"),
        (r"notice", "Notice"),
        (r"invitation", "Invitation"),
        (r"announcement", "Announcement"),
        (r"law", "Law"),
        (r"order", "Order"),
        (r"statement", "Statement"),
        (r"regulation", "Regulation"),
        (r"contract", "Contract"),
        (r"agreement", "Agreement"),
        (r"award", "Award"),
        (r"correction", "Correction"),
        (r"report", "Report")
    ]
    for pattern, doc_type in mapping:
        if re.search(pattern, lower):
            return doc_type
    return "Document"


def _detect_title(lines: list[str]) -> str | None:
    for line in lines:
        stripped = line.strip().lstrip("#").strip()
        if not stripped:
            continue
        lowered = stripped.lower()
        if lowered.startswith("no."):
            continue
        if lowered.startswith("article") or lowered.startswith("مادة"):
            continue
        return stripped
    return None


def _detect_doc_start(line: str) -> bool:
    stripped = line.lstrip()
    hashes = len(stripped) - len(stripped.lstrip("#"))
    if hashes != 2:
        return False
    header = stripped.lstrip("#").strip().lower()
    return any(keyword in header for keyword in DOC_KEYWORDS)


def iter_documents(text: str):
    cleaned = BLOCK_RE.sub("", text)
    lines = cleaned.splitlines()
    current_section = None
    pending_pages: list[int] = []
    current_doc = None

    def flush(doc_state):
        if not doc_state:
            return None
        doc_lines = doc_state["lines"]
        body = "\n".join(doc_lines).strip()
        if not body:
            return None
        header = doc_state["header"]
        doc_type = _determine_doc_type(header)
        doc_number = _extract_doc_number("\n".join(doc_lines[:5]))
        title = _detect_title(doc_lines[1:6])
        page_numbers = sorted(set(doc_state.get("page_numbers") or []))
        return {
            "doc_type": doc_type,
            "doc_number": doc_number,
            "issuer": None,
            "title": title,
            "section_heading": doc_state.get("section_heading"),
            "page_start": page_numbers[0] if page_numbers else None,
            "page_end": page_numbers[-1] if page_numbers else None,
            "text": body,
        }

    for raw_line in lines:
        line = raw_line.rstrip()
        if not line:
            if current_doc:
                current_doc["lines"].append("")
            continue
        page_start = PAGE_START_RE.search(line)
        page_end = PAGE_END_RE.search(line)
        if page_start:
            pending_pages.append(int(page_start.group(1)))
            continue
        if page_end:
            pending_pages.append(int(page_end.group(1)))
            continue
        stripped = line.lstrip()
        hashes = len(stripped) - len(stripped.lstrip("#"))
        if hashes == 1:
            current_section = stripped.lstrip("#").strip()
            continue
        if _detect_doc_start(line):
            doc_payload = flush(current_doc)
            if doc_payload:
                yield doc_payload
            header = line.lstrip("#").strip()
            current_doc = {
                "header": header,
                "lines": [header],
                "section_heading": current_section,
                "page_numbers": list(pending_pages) if pending_pages else [],
            }
            pending_pages.clear()
            continue
        if current_doc is None:
            continue
        if pending_pages:
            current_doc["page_numbers"].extend(pending_pages)
            pending_pages.clear()
        current_doc["lines"].append(line)

    final_doc = flush(current_doc)
    if final_doc:
        yield final_doc



def ensure_constraints(graph_client):
    queries = [
        """
        CREATE CONSTRAINT publication_key_unique IF NOT EXISTS
        FOR (p:Publication) REQUIRE p.publication_key IS UNIQUE
        """,
        """
        CREATE CONSTRAINT document_key_unique IF NOT EXISTS
        FOR (d:Document) REQUIRE d.document_key IS UNIQUE
        """,
        """
        CREATE CONSTRAINT chunk_id_unique IF NOT EXISTS
        FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE
        """,
    ]
    for query in queries:
        graph_client.query(query)

ensure_constraints(graph)

graph.query(
    """
    MERGE (g:Gazette {name: $name})
    ON CREATE SET g.name = $name
    """,
    {"name": GAZETTE_NAME},
)


def _sanitize_doc_type(doc_type: str) -> str:
    safe = re.sub(r"\s+", "-", doc_type.strip())
    return safe or "Document"

md_files = sorted(SOURCE_DIR.rglob("*.md"))
if not md_files:
    tqdm.write("No markdown files found for ingestion.", file=sys.stderr)

prepared_publications = []
total_chunks = 0
for path in md_files:
    text = path.read_text(encoding="utf-8")
    publication = parse_publication_header(text, str(path))
    publication["source_basename"] = path.name
    documents = []
    doc_seq = 0
    for doc in iter_documents(text):
        doc_seq += 1
        doc_type = doc["doc_type"] or "Document"
        key_doc_type = _sanitize_doc_type(doc_type)
        doc_number = doc.get("doc_number")
        key_suffix = doc_number or f"{doc_seq:03d}"
        document_key = f"{publication['publication_key']}:{key_doc_type}:{key_suffix}"
        doc["document_key"] = document_key
        doc["doc_type"] = doc_type
        doc["doc_sequence"] = doc_seq
        documents.append(doc)
    publication_context = {
        "publication_key": publication["publication_key"],
        "issue_number": publication["issue_number"],
        "supplement_index": publication["supplement_index"],
        "publication_date": publication["publication_date"],
        "source_basename": publication["source_basename"],
    }

    for doc in documents:
        doc_meta = {
            "document_key": doc["document_key"],
            "doc_type": doc["doc_type"],
            "doc_number": doc.get("doc_number"),
            "section_heading": doc.get("section_heading"),
            "page_start": doc.get("page_start"),
            "page_end": doc.get("page_end"),
        }
        doc_chunks = chunk_document_text(doc["text"], publication_context, doc_meta, cfg=CHUNK_CONFIG)
        doc["chunks"] = doc_chunks
        total_chunks += len(doc_chunks)
        if doc.get("doc_sequence") == 1 and doc_chunks:
            preview_lines = doc_chunks[0]["text"].splitlines()[:4]
            tqdm.write("Chunk preview: " + " | ".join(preview_lines), file=sys.stderr)
    prepared_publications.append({
        "path": path,
        "publication": publication,
        "documents": documents,
    })
    tqdm.write(
        f"Prepared {path.name}: {len(documents)} documents, {sum(len(d['chunks']) for d in documents)} chunks",
        file=sys.stderr,
    )

bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
try:
    term_width = shutil.get_terminal_size().columns
except Exception:
    term_width = 100


def _ingest_publication_entry(entry, bar):
    publication = entry["publication"]
    publication_props = {
        "slug": publication["slug"],
        "title": publication.get("title"),
        "publication_date": publication["publication_date"],
        "volume_number": publication["volume_number"],
        "issue_number": publication["issue_number"],
        "supplement_index": publication["supplement_index"],
        "is_supplement": publication["is_supplement"],
        "source_path": str(entry["path"]),
    }

    graph.query(
        """
        MERGE (g:Gazette {name: $gazette_name})
        WITH g
        MERGE (p:Publication {publication_key: $publication_key})
        ON CREATE SET p += $props
        ON MATCH SET p += $props
        MERGE (g)-[:HAS_PUBLICATION]->(p)
        """,
        {
            "gazette_name": GAZETTE_NAME,
            "publication_key": publication["publication_key"],
            "props": publication_props,
        },
    )

    def _ingest_one_doc(doc):
        doc_props = {
            "doc_type": doc["doc_type"],
            "doc_number": doc.get("doc_number"),
            "issuer": doc.get("issuer"),
            "title": doc.get("title"),
            "section_heading": doc.get("section_heading"),
            "page_start": doc.get("page_start"),
            "page_end": doc.get("page_end"),
            "doc_sequence": doc.get("doc_sequence"),
            "publication_key": publication["publication_key"],
        }
        graph.query(
            """
            MATCH (p:Publication {publication_key: $publication_key})
            MERGE (d:Document {document_key: $document_key})
            ON CREATE SET d += $props
            ON MATCH SET d += $props
            MERGE (p)-[:CONTAINS]->(d)
            """,
            {
                "publication_key": publication["publication_key"],
                "document_key": doc["document_key"],
                "props": doc_props,
            },
        )

        chunks = doc.get("chunks") or []
        if chunks:
            graph.query(
                """
                UNWIND $chunks AS chunk
                MERGE (c:Chunk {chunk_id: chunk.chunk_id})
                ON CREATE SET c += chunk
                ON MATCH SET c += chunk
                WITH c, chunk
                MATCH (d:Document {document_key: chunk.document_key})
                MERGE (d)-[:HAS_CHUNK]->(c)
                """,
                {"chunks": chunks},
            )
        return len(chunks)

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_ingest_one_doc, d) for d in entry["documents"]]
        for fut in futures:
            try:
                count = fut.result()
                if count:
                    bar.update(count)
            except Exception as e:
                tqdm.write(f"Doc ingest error: {e}", file=sys.stderr)

start = time.perf_counter()
with tqdm(
    total=max(total_chunks, 1),
    desc=f"Ingesting chunks (workers={max_workers})",
    ncols=term_width,
    bar_format=bar_format,
    file=sys.stderr,
    disable=(total_chunks == 0),
) as bar:
    for entry in prepared_publications:
        _ingest_publication_entry(entry, bar)

dt = time.perf_counter() - start
if total_chunks == 0:
    tqdm.write("No chunks were generated for ingestion.", file=sys.stderr)
else:
    rate = total_chunks / max(dt, 1e-6)
    tqdm.write(f"Ingestion complete. Ingested {total_chunks} chunks in {dt:.1f}s ({rate:,.0f} chunks/s).", file=sys.stderr)



Project root: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/Main Functions
Source folder: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/KnowledgeGraph/source_data
Chunk preview: [PUB=71-1736-0] [ISSUE=1736] [SUPP=0] [DATE=2025-04-27] [SRC=2025-04-27_en.md] [DOC=Decree 71/2025] [SECTION=Council of Ministers <orig>مجلس الوزراء</orig>] | Decree No. 71 of 2025
Prepared 2025-04-27_en.md: 115 documents, 1136 chunks
Chunk preview: [PUB=71-1736-2] [ISSUE=1736] [SUPP=2] [DATE=2025-05-01] [SRC=2025-05-01_en.md] [DOC=Decree-Law 63/2025] [SECTION=Council of Ministers] | Decree-Law No. 63 of 2025
Prepared 2025-05-01_en.md: 1 documents, 9 chunks
Chunk preview: [PUB=71-1737-0] [ISSUE=1737] [SUPP=0] [DATE=2025-05-04] [SRC=2025-05-04_en.md] [DOC=Resolution 75/2025] [SECTION=Capital Markets Authority / Hay'at Aswaq Al-Mal <orig>هيئة أسواق المال</orig>] | Resolution No. (75) of 2025 | Regarding the Delisting of Shares of Jeiad Holding Company <orig>شركة

#### Summary & Validation

In [6]:
# -------------------------------------------------------------
# Summary & validation checks
# -------------------------------------------------------------
from textwrap import dedent

def run_query(query, params=None):
    return graph.query(query, params or {})

expected_publications = len(md_files) if 'md_files' in globals() else None

print('\n--- Core Counts ---')
gazette_count = run_query('MATCH (g:Gazette) RETURN count(g) AS c')[0]['c']
publication_count = run_query('MATCH (p:Publication) RETURN count(p) AS c')[0]['c']
document_count = run_query('MATCH (d:Document) RETURN count(d) AS c')[0]['c']
chunk_count = run_query('MATCH (c:Chunk) RETURN count(c) AS c')[0]['c']
print(f'Gazettes: {gazette_count}')
if expected_publications is not None:
    print(f'Publications: {publication_count} (expected {expected_publications})')
else:
    print(f'Publications: {publication_count}')
print(f'Documents: {document_count}')
print(f'Chunks: {chunk_count}')

print('\n--- Supplement Spot Checks ---')
for issue in (1736, 1738):
    rows = run_query(
        'MATCH (p:Publication {issue_number: $issue}) RETURN p.publication_key AS key, p.is_supplement AS is_supplement, p.supplement_index AS supplement_index ORDER BY p.supplement_index',
        {'issue': issue},
    )
    if not rows:
        print(f'Issue {issue}: no rows found')
        continue
    for row in rows:
        print(f"Issue {issue} -> {row['key']} (supplement={row['is_supplement']}, index={row['supplement_index']})")

print('\n--- Document Key Samples ---')
for row in run_query(
    'MATCH (p:Publication)-[:CONTAINS]->(d:Document) RETURN p.publication_key AS pk, d.document_key AS dk ORDER BY d.document_key LIMIT 3'
):
    print(f"{row['pk']} :: {row['dk']}")

print('\n--- Chunk Header Preview ---')
chunk_rows = run_query('MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk) RETURN c.text AS text LIMIT 3')
if not chunk_rows:
    print('No chunks found.')
else:
    for idx, row in enumerate(chunk_rows, start=1):
        header_lines = row['text'].splitlines()[:2]
        preview = ' | '.join(header_lines)
        print(f'Chunk {idx}: {preview[:200]}')


--- Core Counts ---
Gazettes: 1
Publications: 10 (expected 10)
Documents: 567
Chunks: 8488

--- Supplement Spot Checks ---
Issue 1736 -> 71-1736-0 (supplement=False, index=0)
Issue 1736 -> 71-1736-2 (supplement=True, index=2)
Issue 1738 -> 71-1738-0 (supplement=False, index=0)
Issue 1738 -> 71-1738-1 (supplement=True, index=1)

--- Document Key Samples ---
71-1733-0 :: 71-1733-0:Announcement:uncement
71-1733-0 :: 71-1733-0:Announcement:uncementArequesthasbeensubmittedtotheDepartmentofPartnerships
71-1733-0 :: 71-1733-0:Announcement:uncementMessrs

--- Chunk Header Preview ---
Chunk 1: [PUB=71-1736-0] [ISSUE=1736] [SUPP=0] [DATE=2025-04-27] [SRC=2025-04-27_en.md] [DOC=Decision 2/2025] [SECTION=Public Authority for Manpower <orig>الهيئة العامة للقوى العاملة</orig>] [ARTICLE=Summary o
Chunk 2: [PUB=71-1736-0] [ISSUE=1736] [SUPP=0] [DATE=2025-04-27] [SRC=2025-04-27_en.md] [DOC=Decision 2/2025] [SECTION=Public Authority for Manpower <orig>الهيئة العامة للقوى العاملة</orig>] [ARTICLE=Articl

In [7]:
# -------------------------------------------------------------
# Supplements + latest pointers + authority weights
# -------------------------------------------------------------
from textwrap import dedent

def run(q, params=None):
    return graph.query(q, params or {})

# Link supplements to their base (same volume + issue; supplement_index>0)
run(dedent('''
MATCH (sup:Publication)
WHERE sup.supplement_index > 0
MATCH (base:Publication {volume_number: sup.volume_number, issue_number: sup.issue_number, supplement_index: 0})
MERGE (sup)-[:UPDATED_BY_PUBLICATION]->(base)
'''))

# Remove legacy SUPPLEMENT_OF edges (if any remain)
run('MATCH ()-[r:SUPPLEMENT_OF]->() DELETE r')

# Set grouping key per publication
run(dedent('''
MATCH (p:Publication)
SET p.issue_group_key = toString(p.volume_number) + '-' + toString(p.issue_number)
'''))

# Compute latest_updated_by_publication_key for base issues
run(dedent('''
MATCH (base:Publication {supplement_index: 0})
OPTIONAL MATCH (base)<-[:UPDATED_BY_PUBLICATION]-(sup:Publication)
WITH base, sup
ORDER BY sup.publication_date DESC, sup.supplement_index DESC, sup.publication_key DESC
WITH base, [s IN collect(sup) WHERE s IS NOT NULL] AS supplements
SET base.latest_updated_by_publication_key =
  CASE
    WHEN size(supplements) = 0 THEN null
    ELSE supplements[0].publication_key
  END
'''))

# Clear pointer on supplements themselves
run(dedent('''
MATCH (p:Publication)
WHERE p.supplement_index > 0
SET p.latest_updated_by_publication_key = null
'''))

# Set authority weights by doc_type (simple heuristic)
run(dedent('''
MATCH (d:Document)
SET d.authority_weight =
  CASE d.doc_type
    WHEN 'Decree-Law' THEN 1.30
    WHEN 'Decree' THEN 1.20
    WHEN 'Law' THEN 1.20
    WHEN 'Decision' THEN 1.10
    WHEN 'Resolution' THEN 1.05
    WHEN 'Memorandum' THEN 1.00
    WHEN 'Notice' THEN 0.95
    WHEN 'Tender' THEN 0.90
    ELSE 1.00
  END
'''))

# Quick summary
supp_links = run('MATCH (:Publication)-[:UPDATED_BY_PUBLICATION]->(:Publication) RETURN count(*) AS n')[0]['n']
bases_with_latest = run('MATCH (p:Publication) WHERE p.supplement_index = 0 AND p.latest_updated_by_publication_key IS NOT NULL RETURN count(p) AS n')[0]['n']
print(f"Supplement links: {supp_links}, bases with latest pointer: {bases_with_latest}")





Supplement links: 2, bases with latest pointer: 2


In [8]:
# --- Normalize chunk text and extract procurement facts ---
from KnowledgeGraph.normalize import normalize_chunk

rows = graph.query(
    """
    MATCH (c:Chunk)
    RETURN c.chunk_id AS chunk_id, c.text AS text
    """
)

updates = []
for row in rows:
    normalized = normalize_chunk(row["text"] or "")
    facts = normalized.get("facts") or {}
    updates.append(
        {
            "chunk_id": row["chunk_id"],
            "text_norm": normalized.get("text_norm"),
            "table_kv": normalized.get("table_kv"),
            "closing_date": facts.get("closing_date"),
            "price_kd": facts.get("price_kd"),
            "guarantee_kd": facts.get("guarantee_kd"),
        }
    )

if updates:
    graph.query(
        """
        UNWIND $rows AS row
        MATCH (c:Chunk {chunk_id: row.chunk_id})
        SET c.text_norm = row.text_norm,
            c.table_kv = row.table_kv,
            c.closing_date = row.closing_date,
            c.price_kd = row.price_kd,
            c.guarantee_kd = row.guarantee_kd
        """,
        {"rows": updates},
    )

summary = graph.query(
    """
    MATCH (c:Chunk)
    RETURN count(c) AS total,
           count(c.closing_date) AS closing_dates,
           count(c.price_kd) AS price_values,
           count(c.guarantee_kd) AS guarantee_values
    """
)[0]
print(
    "Normalized {total} chunks (closing_date={closing_dates}, price_kd={price_values}, guarantee_kd={guarantee_values}).".format(
        **summary
    )
)




Normalized 8488 chunks (closing_date=632, price_kd=675, guarantee_kd=186).


In [9]:
# --- Stamp chunk source (publication_key) and ensure page bounds ---
from textwrap import dedent

run(
    dedent(
        """
        MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
        MATCH (d)-[:PUBLISHED_IN]->(p:Publication)
        SET c.source = p.publication_key
        """
    )
)

page_stats = run(
    dedent(
        """
        MATCH (c:Chunk)
        RETURN count(c) AS total,
               count(c.page_start) AS with_start,
               count(c.page_end) AS with_end
        """
    )
)[0]
print(
    "Chunk source stamped; page coverage -> start: {with_start}/{total}, end: {with_end}/{total}".format(
        **page_stats
    )
)




Chunk source stamped; page coverage -> start: 7462/8488, end: 7462/8488


#### Ingest all data and create index

In [10]:
# --- Anchor extraction + indexes (persistent) ---
import re
from typing import Dict, List, Set

_ARABIC_DIGITS = str.maketrans("٠١٢٣٤٥٦٧٨٩", "0123456789")

def _norm_digits(s: str) -> str:
    return (s or "").translate(_ARABIC_DIGITS)

def _normalize_anchor_value(anchor: str) -> str:
    t = _norm_digits(anchor or "")
    t = re.sub(r"[\u200f\u200e\u00a0\u202f]", " ", t)
    t = t.translate(str.maketrans({"‐": "-", "‑": "-", "‒": "-", "–": "-", "—": "-", "−": "-"}))
    t = t.strip()
    if not t:
        return ""
    t = t.upper()
    t = re.sub(r"(?<=\b[A-Z])\.(?=[A-Z])", "", t)
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"\s*([:/-])\s*", r"\1", t)
    t = re.sub(r"\s+", " ", t)
    return t

_ANCHOR_KIND_RULES: List[tuple] = [
    (re.compile(r"\bRFP\b"), "tender"),
    (re.compile(r"\bTENDER\b"), "tender"),
    (re.compile(r"\bHRA\b"), "tender"),
    (re.compile(r"\bPRACTICE\b"), "tender"),
    (re.compile(r"\bDECREE\b"), "decision"),
    (re.compile(r"\bDECISION\b"), "decision"),
    (re.compile(r"\bLAW\b"), "decision"),
    (re.compile(r"\bA/M/\d+\b"), "case"),
    (re.compile(r"\b5D[A-Z0-9]+\b"), "contract"),
    (re.compile(r"\bCERTIFICAT"), "case"),
    (re.compile(r"\bMEETING\b"), "decision"),
]

_TENDER_DOC_TYPES = {"tender", "notice", "announcement"}

_RX_ANCHORS = [
    re.compile(r"\bRFP[-\s]*\d+\b", re.IGNORECASE),
    re.compile(r"\b\d{4,7}[-\s]*RFP\b", re.IGNORECASE),
    re.compile(r"\bH\.?\s*R\.?\s*A\.?\s*\d+\s*/\s*\d+\s*/\s*\d+\b", re.IGNORECASE),
    re.compile(r"\bTENDER\s+NO\.?\s*(?:[:\-])?\s*\d+\s*/\s*\d+\s*/\s*\d+\b", re.IGNORECASE),
    re.compile(r"\bPRACTICE\s+NO\.?\s*[A-Z0-9]+(?:\s*[-/]\s*[A-Z0-9]+)+\b", re.IGNORECASE),
    re.compile(r"\bA/M/\d+\b", re.IGNORECASE),
    re.compile(r"\b5D[A-Z0-9]+\b", re.IGNORECASE),
    re.compile(r"\b\d{4}/\d{5}\b", re.IGNORECASE),
    re.compile(r"\bMEETING (?:MINUTES )?NO\.?\s*\d{4}/\d+\b", re.IGNORECASE),
    re.compile(r"\b(?:DECREE-?LAW|LAW)\s*(?:NO\.?\s*)?\d+/?\d{4}\b", re.IGNORECASE),
]

def extract_anchors_from_text(text: str) -> list[str]:
    t = _norm_digits(text or "")
    out: list[str] = []
    for rx in _RX_ANCHORS:
        out.extend(rx.findall(t))
    seen = set(); uniq: list[str] = []
    for a in out:
        k = a.upper()
        if k not in seen:
            seen.add(k); uniq.append(a)
    return uniq

def _classify_anchor(value: str) -> str:
    for rx, kind in _ANCHOR_KIND_RULES:
        if rx.search(value):
            return kind
    if re.search(r"\d{4}/\d{5}", value):
        return "case"
    if value.startswith("EPA/") or value.startswith("CAPT/"):
        return "tender"
    if value.startswith("HRA") or value.startswith("TENDER"):
        return "tender"
    if re.match(r"^[A-Z]*\d+/\d+/\d+$", value):
        return "tender"
    return "other"

def _build_anchor_records(texts: List[str]) -> List[Dict[str, str]]:
    records: List[Dict[str, str]] = []
    seen: Set[str] = set()
    for text in texts:
        for raw in extract_anchors_from_text(text):
            value = _normalize_anchor_value(raw)
            if not value or value in seen:
                continue
            seen.add(value)
            records.append(
                {
                    "kind": _classify_anchor(value),
                    "value": value,
                    "raw": raw.strip(),
                }
            )
    return records


def _filter_anchor_records(records: List[Dict[str, str]], doc_type: str | None) -> List[Dict[str, str]]:
    if not records:
        return records
    doc_type_norm = (doc_type or "").strip().lower()
    if doc_type_norm in _TENDER_DOC_TYPES:
        return records
    return [rec for rec in records if rec.get("kind") != "tender"]


run = (lambda q, p=None: graph.query(q, p or {}))
run("CREATE CONSTRAINT anchor_unique IF NOT EXISTS FOR (a:Anchor) REQUIRE (a.kind, a.value) IS UNIQUE")
run("CREATE CONSTRAINT doc_key_unique IF NOT EXISTS FOR (d:Document) REQUIRE d.document_key IS UNIQUE")
run("CREATE INDEX doc_status IF NOT EXISTS FOR (d:Document) ON (d.status)")

# Compute anchors per Document from its chunks and persist
rows = graph.query(
    """
    MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
    RETURN d.document_key AS dk,
           d.doc_type AS doc_type,
           collect(c.text) AS texts,
           collect(c.table_kv) AS tables
    """
)
for row in rows:
    texts: List[str] = [t for t in (row.get("texts") or []) if t]
    tables: List[str] = [t for t in (row.get("tables") or []) if t]
    doc_type = row.get("doc_type")
    anchor_records = _filter_anchor_records(
        _build_anchor_records(texts + tables),
        doc_type,
    )
    anchor_values = [rec["value"] for rec in anchor_records]
    graph.query(
        """
        MATCH (d:Document {document_key: $dk})
        SET d.anchors = $anchors
        """,
        {"dk": row["dk"], "anchors": anchor_values},
    )
    if anchor_records:
        graph.query(
            """
            UNWIND $anchor_rows AS anchor
            MERGE (a:Anchor {kind: anchor.kind, value: anchor.value})
            ON CREATE SET a.display_value = anchor.raw
            ON MATCH SET a.display_value = coalesce(a.display_value, anchor.raw)
            """,
            {"anchor_rows": anchor_records},
        )

# Helpful fulltext/property indexes (idempotent)
run("CREATE FULLTEXT INDEX chunk_text_idx IF NOT EXISTS FOR (c:Chunk) ON EACH [c.text]")
run("CREATE FULLTEXT INDEX doc_text_idx   IF NOT EXISTS FOR (d:Document) ON EACH [d.title, d.section_heading, d.doc_number, d.document_key]")
run("CREATE FULLTEXT INDEX doc_anchor_ft IF NOT EXISTS FOR (d:Document) ON EACH [d.title, d.section_heading, d.doc_number, d.document_key, d.anchors]")
run("CREATE FULLTEXT INDEX doc_anchors_idx IF NOT EXISTS FOR (d:Document) ON EACH [d.anchors]")
run("CREATE INDEX doc_pub_key_idx IF NOT EXISTS FOR (d:Document) ON (d.publication_key)")





[]

In [11]:
# --- Rebuild HAS_ANCHOR relationships from d.anchors ---
run(
    """
    MATCH (d:Document)
    OPTIONAL MATCH (d)-[r:HAS_ANCHOR]->(:Anchor)
    DELETE r
    WITH d
    UNWIND coalesce(d.anchors, []) AS value
    MATCH (a:Anchor {value: value})
    MERGE (d)-[:HAS_ANCHOR]->(a)
    """
)



[]

In [12]:
# --- Create PUBLISHED_IN edges + Publication->LATEST and page-number backfills ---
from textwrap import dedent

run = (lambda q, p=None: graph.query(q, p or {}))

# 1) Create PUBLISHED_IN edges by publication_key
run(dedent('''
MATCH (d:Document)
WHERE d.publication_key IS NOT NULL AND NOT (d)-[:PUBLISHED_IN]->(:Publication)
MATCH (p:Publication {publication_key: d.publication_key})
MERGE (d)-[:PUBLISHED_IN]->(p)
'''))

# 2) One LATEST per Publication to newest Document (by doc_sequence desc)
run(dedent('''
MATCH (p:Publication)-[:CONTAINS]->(d:Document)
WITH p, d ORDER BY d.doc_sequence DESC, d.document_key DESC
WITH p, collect(d)[0] AS latest
MERGE (p)-[:LATEST]->(latest)
WITH p
MATCH (p)-[r:LATEST]->(:Document)
WITH p, collect(r) AS rels
FOREACH (r IN rels[1..] | DELETE r)
'''))

# 3) Backfill Document page_start/page_end from chunks where missing
run(dedent('''
MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
WITH d, min(c.page_start) AS minStart, max(c.page_end) AS maxEnd
SET d.page_start = coalesce(d.page_start, minStart),
    d.page_end   = coalesce(d.page_end,   maxEnd)
'''))

# 4) Propagate doc page numbers to chunks where missing
run(dedent('''
MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
SET c.page_start = coalesce(c.page_start, d.page_start),
    c.page_end   = coalesce(c.page_end,   d.page_end)
'''))

# 5) Quick summary
pubs_without_docs = run('MATCH (p:Publication) WHERE NOT (:Document)-[:PUBLISHED_IN]->(p) RETURN count(p) AS n')[0]['n']
docs_missing_edge = run('MATCH (d:Document) WHERE NOT (d)-[:PUBLISHED_IN]->(:Publication) RETURN count(d) AS n')[0]['n']
chunk_counts = run('MATCH (c:Chunk) RETURN count(c) AS total, count(c.page_start) AS withStart, count(c.page_end) AS withEnd')[0]
print(f"PUBLISHED_IN: pubs without docs={pubs_without_docs}, docs missing edge={docs_missing_edge}")
print(f"Chunk pages: total={chunk_counts['total']}, withStart={chunk_counts['withStart']}, withEnd={chunk_counts['withEnd']}")




PUBLISHED_IN: pubs without docs=0, docs missing edge=0
Chunk pages: total=8488, withStart=7866, withEnd=7866


In [13]:
# --- Propagate anchors across document updates and supplements ---
# (Handled in the combined anchor refresh cell below.)
pass


In [14]:
# --- Anchor LATEST rebuild (after PUBLISHED_IN edges) ---
# (Handled in the combined anchor refresh cell below.)
pass



In [15]:
# --- Document update detection (corrections / cancellations / etc.) ---
_UPDATE_PATTERNS = [
    (re.compile(r"\bcorrection(s)?\b", re.IGNORECASE), ("correction", "corrected")),
    (re.compile(r"\bamend(ed|ment|ments)\b", re.IGNORECASE), ("correction", "corrected")),
    (re.compile(r"استدراك"), ("correction", "corrected")),
    (re.compile(r"تصحيح"), ("correction", "corrected")),
    (re.compile(r"\bcancell?ation\b", re.IGNORECASE), ("cancellation", "canceled")),
    (re.compile(r"\bcancel(l)?ed\b", re.IGNORECASE), ("cancellation", "canceled")),
    (re.compile(r"إلغاء"), ("cancellation", "canceled")),
    (re.compile(r"يلغى"), ("cancellation", "canceled")),
    (re.compile(r"\bwithdraw(al|n)?\b", re.IGNORECASE), ("revocation", "revoked")),
    (re.compile(r"سحب"), ("revocation", "revoked")),
    (re.compile(r"\brevocation\b", re.IGNORECASE), ("revocation", "revoked")),
    (re.compile(r"\bextend(ed|s|ing)?\b", re.IGNORECASE), ("extension", "extended")),
    (re.compile(r"\bextension\b", re.IGNORECASE), ("extension", "extended")),
    (re.compile(r"تمديد"), ("extension", "extended")),
    (re.compile(r"\bpostpone(d|ment)?\b", re.IGNORECASE), ("postponement", "postponed")),
    (re.compile(r"تأجيل"), ("postponement", "postponed")),
    (re.compile(r"\baward(ed|ing)?\b", re.IGNORECASE), ("award", "awarded")),
    (re.compile(r"ترسية"), ("award", "awarded")),
]

def _detect_update_status(text: str):
    normalized = _norm_digits(text or "")
    for pattern, payload in _UPDATE_PATTERNS:
        if pattern.search(normalized):
            return payload
    return None

run("MATCH ()-[r:UPDATES]->() DELETE r")
run("MATCH (d:Document) SET d.status = 'announced'")

doc_rows = graph.query(
    """
    MATCH (d:Document)-[:PUBLISHED_IN]->(p:Publication)
    RETURN d.document_key AS key,
           coalesce(d.title, '') AS title,
           coalesce(d.section_heading, '') AS section,
           coalesce(d.doc_type, '') AS doc_type,
           coalesce(d.anchors, []) AS anchors,
           p.publication_date AS publication_date
    """
)

for row in doc_rows:
    context_parts = [row["title"], row["section"], row["doc_type"]]
    context_text = " ".join(part for part in context_parts if part)
    detected = _detect_update_status(context_text)
    if not detected:
        continue
    kind, status = detected
    graph.query(
        """
        MATCH (d:Document {document_key: $doc_key})
        SET d.status = $status
        """,
        {"doc_key": row["key"], "status": status},
    )
    anchors = [a for a in (row["anchors"] or []) if a]
    if not anchors:
        continue
    graph.query(
        """
        MATCH (d_new:Document {document_key: $doc_key})-[:PUBLISHED_IN]->(p_new:Publication)
        UNWIND $anchors AS anchor_value
        MATCH (a:Anchor {value: anchor_value})<-[:HAS_ANCHOR]-(d_old:Document)
        MATCH (d_old)-[:PUBLISHED_IN]->(p_old:Publication)
        WHERE d_old <> d_new AND p_old.publication_date <= p_new.publication_date
        WITH DISTINCT d_new, d_old
        MERGE (d_new)-[:UPDATES {kind: $kind}]->(d_old)
        """,
        {"doc_key": row["key"], "anchors": anchors, "kind": kind},
    )

status_summary = {
    row["status"] or "unknown": row["count"]
    for row in graph.query("MATCH (d:Document) RETURN d.status AS status, count(*) AS count")
}
print("Document status summary:", status_summary)



Document status summary: {'announced': 500, 'canceled': 8, 'corrected': 21, 'extended': 17, 'awarded': 18, 'revoked': 3}


In [16]:
# --- Optional: verify anchor freshness for a specific identifier ---
anchor_value = "RFP-2113556"
check_rows = run(
    dedent(
        """
        WITH $anchor_value AS val
        MATCH (a:Anchor {value: val})<-[:HAS_ANCHOR]-(d:Document)-[:PUBLISHED_IN]->(p:Publication)
        OPTIONAL MATCH (a)-[:LATEST]->(cur:Document)
        RETURN p.publication_date AS pub_date,
               d.document_key AS doc,
               d = cur AS is_latest
        ORDER BY pub_date DESC, doc DESC
        """
    ),
    {"anchor_value": anchor_value},
)
if not check_rows:
    print(f"Anchor {anchor_value} not found (no documents)")
else:
    for row in check_rows:
        flag = "<-- LATEST" if row["is_latest"] else ""
        print(f"{row['pub_date']} :: {row['doc']} {flag}")


2025-05-04 :: 71-1737-0:Announcement:uncementTheInternalProcurementCommittee 
2025-04-27 :: 71-1736-0:Notice:uncement/NoticeRegardingtheAdditionofCompaniestoTenderNo 


In [17]:
# --- Anchor freshness summary ---
superseded_counts = {
    row["flag"]: row["count"]
    for row in run('MATCH (d:Document) RETURN coalesce(d.superseded,false) AS flag, count(*) AS count')
}
latest_links = run('MATCH (:Anchor)-[:LATEST]->(:Document) RETURN count(*) AS n')[0]['n']
print("Superseded counts:", superseded_counts, "| anchor latest links:", latest_links)



Superseded counts: {False: 567} | anchor latest links: 0


In [18]:
# --- Propagate anchors and rebuild LATEST ---
from textwrap import dedent

_ALLOWED_TENDER_DOC_TYPES = [t.lower() for t in _TENDER_DOC_TYPES]

def _run(query: str, params: dict | None = None):
    return graph.query(query, params or {})

before_stats = _run(
    """
    MATCH (a:Anchor)<-[:HAS_ANCHOR]-(d:Document)
    RETURN count(DISTINCT a) AS anchors_with_docs,
           count(DISTINCT d) AS documents_with_anchors
    """
)[0]
print(
    "Anchors before propagation: {anchors_with_docs} linked to {documents_with_anchors} documents.".format(
        **before_stats
    )
)

_run(
    dedent(
        """
        MATCH (new:Document)-[:UPDATES]->(old:Document)-[:HAS_ANCHOR]->(a:Anchor)
        WHERE (a.kind <> 'tender' OR toLower(coalesce(new.doc_type, '')) IN $allowed)
          AND a.value IN coalesce(new.anchors, [])
        MERGE (new)-[:HAS_ANCHOR]->(a)
        """
    ),
    {"allowed": _ALLOWED_TENDER_DOC_TYPES},
)

_run(
    dedent(
        """
        MATCH (latest:Publication)<-[:PUBLISHED_IN]-(latest_doc:Document)
        MATCH (latest)-[:UPDATED_BY_PUBLICATION]->(base:Publication)
        MATCH (base)<-[:PUBLISHED_IN]-(base_doc:Document)-[:HAS_ANCHOR]->(a:Anchor)
        WHERE (a.kind <> 'tender' OR toLower(coalesce(latest_doc.doc_type, '')) IN $allowed)
          AND a.value IN coalesce(latest_doc.anchors, [])
        MERGE (latest_doc)-[:HAS_ANCHOR]->(a)
        """
    ),
    {"allowed": _ALLOWED_TENDER_DOC_TYPES},
)

_run(
    dedent(
        """
        MATCH (d:Document)-[r:HAS_ANCHOR]->(a:Anchor)
        WHERE a.kind = 'tender' AND NOT toLower(coalesce(d.doc_type, '')) IN $allowed
        DELETE r
        """
    ),
    {"allowed": _ALLOWED_TENDER_DOC_TYPES},
)

_run(
    dedent(
        """
        MATCH (d:Document)-[:PUBLISHED_IN]->(p:Publication)
        SET d.superseded = false,
            d.effective_date = coalesce(d.effective_date, p.publication_date)
        """
    )
)

_run("MATCH (:Anchor)-[r:LATEST]->(:Document) DELETE r")

_run(
    dedent(
        """
        MATCH (a:Anchor)<-[:HAS_ANCHOR]-(d:Document)-[:PUBLISHED_IN]->(p:Publication)
        WITH a, d, p
        ORDER BY p.publication_date DESC,
                 coalesce(p.supplement_index, 0) DESC,
                 d.document_key DESC
        WITH a, collect(d) AS docs
        WITH a, docs WHERE size(docs) > 0
        UNWIND range(0, size(docs) - 1) AS idx
        WITH a, docs, idx, docs[idx] AS doc
        SET doc.superseded = (idx > 0)
        WITH DISTINCT a, docs[0] AS newest
        MERGE (a)-[:LATEST]->(newest)
        """
    )
)

_run(
    dedent(
        """
        MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
        SET c.superseded = coalesce(d.superseded, false)
        """
    )
)

after_stats = _run(
    """
    MATCH (a:Anchor)<-[:HAS_ANCHOR]-(d:Document)
    RETURN count(DISTINCT a) AS anchors_with_docs,
           count(DISTINCT d) AS documents_with_anchors
    """
)[0]
print(
    "Anchors after propagation: {anchors_with_docs} linked to {documents_with_anchors} documents.".format(
        **after_stats
    )
)

latest_count = _run("MATCH (:Anchor)-[:LATEST]->(:Document) RETURN count(*) AS n")[0]["n"]
print(f"Anchor LATEST edges rebuilt: {latest_count}")

sample_rows = _run(
    dedent(
        """
        WITH 'RFP-2113556' AS val
        MATCH (a:Anchor {value: val})<-[:HAS_ANCHOR]-(d:Document)-[:PUBLISHED_IN]->(p:Publication)
        OPTIONAL MATCH (a)-[:LATEST]->(cur:Document)
        RETURN p.publication_date AS pub_date,
               d.document_key AS doc,
               d = cur AS is_latest
        ORDER BY pub_date DESC, doc DESC
        """
    )
)
if not sample_rows:
    print("Anchor RFP-2113556 not found (no documents)")
else:
    for row in sample_rows:
        flag = "<-- LATEST" if row["is_latest"] else ""
        print(f"{row['pub_date']} :: {row['doc']} {flag}")



Anchors before propagation: 404 linked to 115 documents.
Anchors after propagation: 404 linked to 115 documents.
Anchor LATEST edges rebuilt: 404
2025-05-04 :: 71-1737-0:Announcement:uncementTheInternalProcurementCommittee <-- LATEST
2025-04-27 :: 71-1736-0:Notice:uncement/NoticeRegardingtheAdditionofCompaniestoTenderNo 


### Embed Chunks

In [19]:
# Create Vector Index
create_vector_index(graph=graph, index_name='Chunk')

# Embed Chunks
embed_text(graph=graph, OPENAI_API_KEY=openAI_api, OPENAI_ENDPOINT=openAI_endpoint, node_name='Chunk', model_name=openAI_model, max_workers=100)

Starting embedding update...
Found 8488 nodes without embeddings.


Embedding nodes: 100%|██████████████████████████████████████████| 8488/8488 [01:28<00:00, 96.04it/s]

Finished embedding update.



