#### Confirm Working

In [1]:
# Confirm Jupyter and kernel works
import sys
print(sys.executable)
print("Kernel working. Proceed")

/home/sepeh/.cache/pypoetry/virtualenvs/graphrag-developer-challenge3-4suqu0LP-py3.12/bin/python
Kernel working. Proceed


#### Connect to Neo4j

In [2]:
# Connect to Neo4j
import os

# Add parent directory to Python path so we can import KnowledgeGraph
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
print(f"Current directory: {os.getcwd()}")
print(f"Parent directory: {parent_dir}")
print(f"Parent dir exists: {os.path.exists(parent_dir)}")
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
print(f"KnowledgeGraph exists: {os.path.exists(os.path.join(parent_dir, 'KnowledgeGraph'))}")

from KnowledgeGraph.knowledgegraph import ingest_Chunks, create_nodes, create_relationship, create_vector_index, embed_text
from KnowledgeGraph.chunking import split_data_from_file
from KnowledgeGraph.config import load_neo4j_graph

import json

# Error handling for Neo4j connection
try:
    graph, openAI_api, openAI_endpoint, openAI_model = load_neo4j_graph()
    print("Successfully connected to Neo4j!")
except ValueError as e:
    print(f"Warning: {e}")
    print("Please set up Neo4j (local or AuraDB) and update .env file")
    raise

Current directory: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/Main Functions
Parent directory: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3
Parent dir exists: True
KnowledgeGraph exists: True


Successfully connected to Neo4j!


#### Optional Reset

In [3]:
# -------------------------------------------------------------
# Cleanup (optional reset between runs)
# -------------------------------------------------------------
labels_to_wipe = ['Chunk', 'Document', 'Publication', 'Gazette', 'Issue', 'Issues', 'Page', 'Section']
print(f"Deleting old {', '.join(labels_to_wipe)} nodes...")
try:
    graph.query("""
    MATCH (n)
    WHERE n:Chunk OR n:Document OR n:Publication OR n:Gazette OR n:Issue OR n:Issues OR n:Page OR n:Section
    DETACH DELETE n
    """)
    print("Old nodes deleted successfully.\n")
except Exception as e:
    print(f"Cleanup skipped: {e}\n")

Deleting old Chunk, Document, Publication, Gazette, Issue, Issues, Page, Section nodes...
Old nodes deleted successfully.



### <r> Clear Embeddings </r> ###

In [4]:
# -------------------------------------------------------------
# Recreate vector index at dims from .env and clear old vectors
# -------------------------------------------------------------
from textwrap import dedent

def run(q, params=None):
    return graph.query(q, params or {})

# 1) Drop old index (idempotent)
try:
    run("DROP INDEX `Chunk` IF EXISTS")
    print("Dropped index `Chunk` (if it existed).")
except Exception as e:
    print(f"Drop index skipped: {e}")

# 2) Clear old vectors (they may be wrong dims)
run("MATCH (c:Chunk) REMOVE c.textEmbeddingOpenAI")
print("Cleared old embeddings from Chunk nodes.")

# 3) Recreate index using dims from .env (OPENAI_EMBED_DIM)
from KnowledgeGraph.knowledgegraph import create_vector_index
create_vector_index(graph=graph, index_name='Chunk')  # reads OPENAI_EMBED_DIM
print("Recreated vector index `Chunk` with dims from OPENAI_EMBED_DIM.")

# 4) Show index options to confirm dims
rows = run(dedent("""
SHOW INDEXES
YIELD name, type, entityType, labelsOrTypes, properties, options
WHERE name = 'Chunk'
RETURN name, options
"""))
print("Index info:", rows[0] if rows else "Index not found")


Dropped index `Chunk` (if it existed).
Cleared old embeddings from Chunk nodes.
Recreated vector index `Chunk` with dims from OPENAI_EMBED_DIM.
Index info: {'name': 'Chunk', 'options': {'indexProvider': 'vector-2.0', 'indexConfig': {'vector.hnsw.m': 16, 'vector.hnsw.ef_construction': 100, 'vector.dimensions': 3072, 'vector.similarity_function': 'COSINE', 'vector.quantization.enabled': True}}}


#### Loop through each file and ingest main nodes and chunk data

In [5]:
import json, re, uuid, shutil, time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

PROJECT_ROOT = Path(os.getcwd())
SOURCE_DIR = (PROJECT_ROOT.parent / "KnowledgeGraph" / "source_data").resolve()
GAZETTE_NAME = "Kuwait Today / Al-Kuwait Al-Youm"
max_workers = int(os.getenv("MAX_WORKERS", str(min(16, (os.cpu_count() or 4) * 2))))

if not SOURCE_DIR.exists():
    raise FileNotFoundError(f"Expected source markdown directory at {SOURCE_DIR}")

tqdm.write(f"Project root: {PROJECT_ROOT}", file=sys.stderr)
tqdm.write(f"Source folder: {SOURCE_DIR}", file=sys.stderr)

METADATA_RE = re.compile(r"<document_metadata>\s*(\{.*?\})\s*</document_metadata>", re.DOTALL)
DATE_FROM_FILENAME_RE = re.compile(r"(\d{4})[-_](\d{2})[-_](\d{2})")
SUPPLEMENT_NUMBER_RE = re.compile(r"Supplement\s*(?:No\.?|Number)?\s*(\d+)", re.IGNORECASE)
ISSUE_RE = re.compile(r"Issue\s*(?:No\.?|Number)?\s*(\d{3,4})", re.IGNORECASE)
BLOCK_RE = re.compile(r"<(document_metadata|page_metadata)>.*?</\1>", re.DOTALL)
PAGE_START_RE = re.compile(r"<page_start[^>]*>(\d+)</page_start>")
PAGE_END_RE = re.compile(r"<page_end[^>]*>(\d+)</page_end>")
DOC_NUMBER_PATTERNS = [
    re.compile(r"No\.?\s*\(?\s*([A-Za-z0-9\s\-/–—]+)\s*\)?\s*of\s*(\d{4})", re.IGNORECASE),
    re.compile(r"No\.?\s*[:\-]?\s*([A-Za-z0-9\s\-/–—]+)", re.IGNORECASE),
]
TOKEN_RE = re.compile(r"\S+\s*")

NUMBER_WORDS = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4", "five": "5",
    "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10", "eleven": "11",
    "twelve": "12", "thirteen": "13", "fourteen": "14", "fifteen": "15", "sixteen": "16",
    "seventeen": "17", "eighteen": "18", "nineteen": "19", "twenty": "20",
    "first": "1", "second": "2", "third": "3", "fourth": "4", "fifth": "5",
    "sixth": "6", "seventh": "7", "eighth": "8", "ninth": "9", "tenth": "10"
}
ROMAN_MAP = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
DOC_KEYWORDS = [
    "decree-law", "decree", "decision", "resolution", "tender", "notice", "law",
    "memorandum", "circular", "announcement", "order", "statement", "invitation",
    "award", "regulation", "contract", "agreement", "ministerial decision",
    "council", "committee", "correction", "report"
]


def _parse_int(value):
    try:
        text = str(value).strip()
        if not text:
            return None
        return int(text)
    except Exception:
        return None


def _roman_to_int(token: str) -> int | None:
    total = 0
    prev = 0
    for ch in token.upper():
        if ch not in ROMAN_MAP:
            return None
        value = ROMAN_MAP[ch]
        if value > prev:
            total += value - 2 * prev
        else:
            total += value
        prev = value
    return total


def parse_publication_header(text: str, filename: str) -> dict:
    slug = Path(filename).stem
    metadata = {}
    match = METADATA_RE.search(text)
    if match:
        try:
            metadata = json.loads(match.group(1))
        except json.JSONDecodeError:
            metadata = {}

    header_slice = text[:3000]
    header_plain = re.sub(r"<[^>]+>", " ", header_slice)

    publication_date = metadata.get("document_date") or metadata.get("publication_date")
    if not publication_date:
        date_match = DATE_FROM_FILENAME_RE.search(slug)
        if date_match:
            publication_date = "-".join(date_match.groups())

    issue_number = _parse_int(metadata.get("issue_number"))
    if issue_number is None:
        issue_match = ISSUE_RE.search(header_plain)
        if issue_match:
            issue_number = _parse_int(issue_match.group(1))

    volume_number = _parse_int(metadata.get("volume_number"))
    if volume_number is None and "Seventy-First" in header_plain:
        volume_number = 71

    title = metadata.get("document_title")

    supplement_index = 0
    sup_match = SUPPLEMENT_NUMBER_RE.search(title or "") or SUPPLEMENT_NUMBER_RE.search(header_plain)
    if sup_match:
        candidate = _parse_int(sup_match.group(1)) or 0
        if issue_number and candidate == issue_number:
            supplement_index = 1
        elif candidate > 50:
            supplement_index = 1
        else:
            supplement_index = candidate
    elif re.search(r"Supplement", title or "", re.IGNORECASE) or re.search(r"Supplement", header_plain, re.IGNORECASE):
        supplement_index = 1

    publication_date = publication_date or "Unknown"
    issue_number = issue_number or 0
    volume_number = volume_number or 0

    publication_key = f"{volume_number}-{issue_number}-{supplement_index}"

    return {
        "slug": slug,
        "publication_date": publication_date,
        "volume_number": volume_number,
        "issue_number": issue_number,
        "supplement_index": supplement_index,
        "is_supplement": supplement_index > 0,
        "title": title,
        "publication_key": publication_key,
        "metadata": metadata,
    }


def _normalize_doc_number(base: str, year: str | None) -> str:
    for char in "\u2013\u2014\u2012\u2212":
        base = base.replace(char, "-")
    base = re.sub(r"\s+", "", base.strip())
    if year and year not in base:
        base = f"{base}/{year}"
    return base


def _extract_doc_number(text: str) -> str | None:
    for pattern in DOC_NUMBER_PATTERNS:
        match = pattern.search(text)
        if match:
            base = match.group(1)
            year = match.group(2) if len(match.groups()) >= 2 else None
            return _normalize_doc_number(base, year)
    return None


def _determine_doc_type(header: str) -> str:
    lower = header.lower()
    mapping = [
        (r"ministerial decision", "Decision"),
        (r"decree-law", "Decree-Law"),
        (r"emiri decree", "Decree"),
        (r"amiri decree", "Decree"),
        (r"decree", "Decree"),
        (r"resolution", "Resolution"),
        (r"decision", "Decision"),
        (r"memorandum", "Memorandum"),
        (r"circular", "Circular"),
        (r"tender", "Tender"),
        (r"notice", "Notice"),
        (r"invitation", "Invitation"),
        (r"announcement", "Announcement"),
        (r"law", "Law"),
        (r"order", "Order"),
        (r"statement", "Statement"),
        (r"regulation", "Regulation"),
        (r"contract", "Contract"),
        (r"agreement", "Agreement"),
        (r"award", "Award"),
        (r"correction", "Correction"),
        (r"report", "Report")
    ]
    for pattern, doc_type in mapping:
        if re.search(pattern, lower):
            return doc_type
    return "Document"


def _detect_title(lines: list[str]) -> str | None:
    for line in lines:
        stripped = line.strip().lstrip("#").strip()
        if not stripped:
            continue
        lowered = stripped.lower()
        if lowered.startswith("no."):
            continue
        if lowered.startswith("article") or lowered.startswith("مادة"):
            continue
        return stripped
    return None


def _detect_doc_start(line: str) -> bool:
    stripped = line.lstrip()
    hashes = len(stripped) - len(stripped.lstrip("#"))
    if hashes != 2:
        return False
    header = stripped.lstrip("#").strip().lower()
    return any(keyword in header for keyword in DOC_KEYWORDS)


def iter_documents(text: str):
    cleaned = BLOCK_RE.sub("", text)
    lines = cleaned.splitlines()
    current_section = None
    pending_pages: list[int] = []
    current_doc = None

    def flush(doc_state):
        if not doc_state:
            return None
        doc_lines = doc_state["lines"]
        body = "\n".join(doc_lines).strip()
        if not body:
            return None
        header = doc_state["header"]
        doc_type = _determine_doc_type(header)
        doc_number = _extract_doc_number("\n".join(doc_lines[:5]))
        title = _detect_title(doc_lines[1:6])
        page_numbers = sorted(set(doc_state.get("page_numbers") or []))
        return {
            "doc_type": doc_type,
            "doc_number": doc_number,
            "issuer": None,
            "title": title,
            "section_heading": doc_state.get("section_heading"),
            "page_start": page_numbers[0] if page_numbers else None,
            "page_end": page_numbers[-1] if page_numbers else None,
            "text": body,
        }

    for raw_line in lines:
        line = raw_line.rstrip()
        if not line:
            if current_doc:
                current_doc["lines"].append("")
            continue
        page_start = PAGE_START_RE.search(line)
        page_end = PAGE_END_RE.search(line)
        if page_start:
            pending_pages.append(int(page_start.group(1)))
            continue
        if page_end:
            pending_pages.append(int(page_end.group(1)))
            continue
        stripped = line.lstrip()
        hashes = len(stripped) - len(stripped.lstrip("#"))
        if hashes == 1:
            current_section = stripped.lstrip("#").strip()
            continue
        if _detect_doc_start(line):
            doc_payload = flush(current_doc)
            if doc_payload:
                yield doc_payload
            header = line.lstrip("#").strip()
            current_doc = {
                "header": header,
                "lines": [header],
                "section_heading": current_section,
                "page_numbers": list(pending_pages) if pending_pages else [],
            }
            pending_pages.clear()
            continue
        if current_doc is None:
            continue
        if pending_pages:
            current_doc["page_numbers"].extend(pending_pages)
            pending_pages.clear()
        current_doc["lines"].append(line)

    final_doc = flush(current_doc)
    if final_doc:
        yield final_doc


def _normalize_article_number(token: str | None) -> str | None:
    if not token:
        return None
    cleaned = token.strip("()[]{}:.- ").lower()
    if not cleaned:
        return None
    if cleaned in NUMBER_WORDS:
        return NUMBER_WORDS[cleaned]
    roman = _roman_to_int(cleaned)
    if roman is not None:
        return str(roman)
    digits = re.findall(r"\d+", cleaned)
    if digits:
        return digits[0]
    return cleaned


def _is_article_heading(line: str) -> tuple[bool, str | None]:
    stripped = line.strip().lstrip("*-#").strip()
    lower = stripped.lower()
    if not (lower.startswith("article") or lower.startswith("مادة")):
        return False, None
    parts = stripped.split(None, 1)
    if len(parts) < 2:
        return False, None
    rest = parts[1].strip()
    number_match = re.match(r"^\(?([A-Za-z0-9IVXLCDM\-\u0660-\u0669]+)\)?", rest, re.IGNORECASE)
    if not number_match:
        return False, None
    number_token = number_match.group(1)
    after_number = rest[number_match.end():].strip()
    if after_number and not after_number.startswith((":", "-", "–", "—")):
        return False, None
    return True, _normalize_article_number(number_token)


def _token_chunks(text: str, max_tokens: int = 900, overlap: int = 90) -> list[str]:
    tokens = TOKEN_RE.findall(text)
    if not tokens:
        return []
    if len(tokens) <= max_tokens:
        return ["".join(tokens).strip()]
    chunks: list[str] = []
    start = 0
    while start < len(tokens):
        end = min(len(tokens), start + max_tokens)
        chunk = "".join(tokens[start:end]).strip()
        if chunk:
            chunks.append(chunk)
        if end == len(tokens):
            break
        start = max(0, end - overlap)
    return chunks


def article_aware_chunks(publication: dict, document: dict):
    lines = document["text"].splitlines()
    sections: list[tuple[str | None, str]] = []
    current_lines: list[str] = []
    current_article: str | None = None
    for line in lines:
        is_article, article_number = _is_article_heading(line)
        if is_article:
            if current_lines:
                section_text = "\n".join(current_lines).strip()
                if section_text:
                    sections.append((current_article, section_text))
            current_lines = [line]
            current_article = article_number
        else:
            current_lines.append(line)
    if current_lines:
        section_text = "\n".join(current_lines).strip()
        if section_text:
            sections.append((current_article, section_text))
    if not sections:
        sections = [(None, document["text"].strip())]

    header_prefix = f"[PUB={publication['publication_key']}] [ISSUE={publication['issue_number']}] [SUPP={publication['supplement_index']}] [DATE={publication['publication_date']}]"
    for article_number, section_text in sections:
        if not section_text:
            continue
        bodies = _token_chunks(section_text)
        if not bodies:
            bodies = [section_text]
        for body in bodies:
            body = body.strip()
            if not body:
                continue
            doc_number = document.get("doc_number") or ""
            section_heading = document.get("section_heading") or ""
            article_token = article_number or ""
            header_line = f"[DOC={document['doc_type']} {doc_number}] [SECTION={section_heading}] [ARTICLE={article_token}]"
            yield {
                "chunk_id": str(uuid.uuid4()),
                "publication_key": publication["publication_key"],
                "document_key": document["document_key"],
                "article_number": article_number,
                "text": f"{header_prefix}\n{header_line}\n{body}",
            }


def ensure_constraints(graph_client):
    queries = [
        """
        CREATE CONSTRAINT publication_key_unique IF NOT EXISTS
        FOR (p:Publication) REQUIRE p.publication_key IS UNIQUE
        """,
        """
        CREATE CONSTRAINT document_key_unique IF NOT EXISTS
        FOR (d:Document) REQUIRE d.document_key IS UNIQUE
        """,
        """
        CREATE CONSTRAINT chunk_id_unique IF NOT EXISTS
        FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE
        """,
    ]
    for query in queries:
        graph_client.query(query)

ensure_constraints(graph)

graph.query(
    """
    MERGE (g:Gazette {name: $name})
    ON CREATE SET g.name = $name
    """,
    {"name": GAZETTE_NAME},
)


def _sanitize_doc_type(doc_type: str) -> str:
    safe = re.sub(r"\s+", "-", doc_type.strip())
    return safe or "Document"

md_files = sorted(SOURCE_DIR.rglob("*.md"))
if not md_files:
    tqdm.write("No markdown files found for ingestion.", file=sys.stderr)

prepared_publications = []
total_chunks = 0
for path in md_files:
    text = path.read_text(encoding="utf-8")
    publication = parse_publication_header(text, str(path))
    documents = []
    doc_seq = 0
    for doc in iter_documents(text):
        doc_seq += 1
        doc_type = doc["doc_type"] or "Document"
        key_doc_type = _sanitize_doc_type(doc_type)
        doc_number = doc.get("doc_number")
        key_suffix = doc_number or f"{doc_seq:03d}"
        document_key = f"{publication['publication_key']}:{key_doc_type}:{key_suffix}"
        doc["document_key"] = document_key
        doc["doc_type"] = doc_type
        doc["doc_sequence"] = doc_seq
        documents.append(doc)
    for doc in documents:
        chunks = list(article_aware_chunks(publication, doc))
        doc["chunks"] = chunks
        total_chunks += len(chunks)
    prepared_publications.append({
        "path": path,
        "publication": publication,
        "documents": documents,
    })
    tqdm.write(
        f"Prepared {path.name}: {len(documents)} documents, {sum(len(d['chunks']) for d in documents)} chunks",
        file=sys.stderr,
    )

bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
try:
    term_width = shutil.get_terminal_size().columns
except Exception:
    term_width = 100


def _ingest_publication_entry(entry, bar):
    publication = entry["publication"]
    publication_props = {
        "slug": publication["slug"],
        "title": publication.get("title"),
        "publication_date": publication["publication_date"],
        "volume_number": publication["volume_number"],
        "issue_number": publication["issue_number"],
        "supplement_index": publication["supplement_index"],
        "is_supplement": publication["is_supplement"],
        "source_path": str(entry["path"]),
    }

    graph.query(
        """
        MERGE (g:Gazette {name: $gazette_name})
        WITH g
        MERGE (p:Publication {publication_key: $publication_key})
        ON CREATE SET p += $props
        ON MATCH SET p += $props
        MERGE (g)-[:HAS_PUBLICATION]->(p)
        """,
        {
            "gazette_name": GAZETTE_NAME,
            "publication_key": publication["publication_key"],
            "props": publication_props,
        },
    )

    def _ingest_one_doc(doc):
        doc_props = {
            "doc_type": doc["doc_type"],
            "doc_number": doc.get("doc_number"),
            "issuer": doc.get("issuer"),
            "title": doc.get("title"),
            "section_heading": doc.get("section_heading"),
            "page_start": doc.get("page_start"),
            "page_end": doc.get("page_end"),
            "doc_sequence": doc.get("doc_sequence"),
            "publication_key": publication["publication_key"],
        }
        graph.query(
            """
            MATCH (p:Publication {publication_key: $publication_key})
            MERGE (d:Document {document_key: $document_key})
            ON CREATE SET d += $props
            ON MATCH SET d += $props
            MERGE (p)-[:CONTAINS]->(d)
            """,
            {
                "publication_key": publication["publication_key"],
                "document_key": doc["document_key"],
                "props": doc_props,
            },
        )

        chunks = doc.get("chunks") or []
        if chunks:
            graph.query(
                """
                UNWIND $chunks AS chunk
                MERGE (c:Chunk {chunk_id: chunk.chunk_id})
                ON CREATE SET c += chunk
                ON MATCH SET c += chunk
                WITH c, chunk
                MATCH (d:Document {document_key: chunk.document_key})
                MERGE (d)-[:HAS_CHUNK]->(c)
                """,
                {"chunks": chunks},
            )
        return len(chunks)

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_ingest_one_doc, d) for d in entry["documents"]]
        for fut in futures:
            try:
                count = fut.result()
                if count:
                    bar.update(count)
            except Exception as e:
                tqdm.write(f"Doc ingest error: {e}", file=sys.stderr)

start = time.perf_counter()
with tqdm(
    total=max(total_chunks, 1),
    desc=f"Ingesting chunks (workers={max_workers})",
    ncols=term_width,
    bar_format=bar_format,
    file=sys.stderr,
    disable=(total_chunks == 0),
) as bar:
    for entry in prepared_publications:
        _ingest_publication_entry(entry, bar)

dt = time.perf_counter() - start
if total_chunks == 0:
    tqdm.write("No chunks were generated for ingestion.", file=sys.stderr)
else:
    rate = total_chunks / max(dt, 1e-6)
    tqdm.write(f"Ingestion complete. Ingested {total_chunks} chunks in {dt:.1f}s ({rate:,.0f} chunks/s).", file=sys.stderr)



Project root: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/Main Functions
Source folder: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/KnowledgeGraph/source_data
Prepared 2025-04-27_en.md: 115 documents, 328 chunks
Prepared 2025-05-01_en.md: 1 documents, 2 chunks
Prepared 2025-05-04_en.md: 144 documents, 431 chunks
Prepared 2025-05-11_en.md: 182 documents, 664 chunks
Prepared 2025-05-15_en.md: 2 documents, 6 chunks
Prepared 2025-05-20_en.md: 1 documents, 3 chunks
Prepared 2025-05-25_en.md: 113 documents, 335 chunks
Prepared 2025_04_06_en.md: 88 documents, 188 chunks
Prepared 2025_04_13_en.md: 135 documents, 494 chunks
Prepared 2025_04_20_en.md: 126 documents, 322 chunks
Ingesting chunks (workers=16): 100%|███████████████████| 2773/2773 [00:09<00:00]
Ingestion complete. Ingested 2773 chunks in 9.6s (289 chunks/s).


#### Summary & Validation

In [6]:
# -------------------------------------------------------------
# Summary & validation checks
# -------------------------------------------------------------
from textwrap import dedent

def run_query(query, params=None):
    return graph.query(query, params or {})

expected_publications = len(md_files) if 'md_files' in globals() else None

print('\n--- Core Counts ---')
gazette_count = run_query('MATCH (g:Gazette) RETURN count(g) AS c')[0]['c']
publication_count = run_query('MATCH (p:Publication) RETURN count(p) AS c')[0]['c']
document_count = run_query('MATCH (d:Document) RETURN count(d) AS c')[0]['c']
chunk_count = run_query('MATCH (c:Chunk) RETURN count(c) AS c')[0]['c']
print(f'Gazettes: {gazette_count}')
if expected_publications is not None:
    print(f'Publications: {publication_count} (expected {expected_publications})')
else:
    print(f'Publications: {publication_count}')
print(f'Documents: {document_count}')
print(f'Chunks: {chunk_count}')

print('\n--- Supplement Spot Checks ---')
for issue in (1736, 1738):
    rows = run_query(
        'MATCH (p:Publication {issue_number: $issue}) RETURN p.publication_key AS key, p.is_supplement AS is_supplement, p.supplement_index AS supplement_index ORDER BY p.supplement_index',
        {'issue': issue},
    )
    if not rows:
        print(f'Issue {issue}: no rows found')
        continue
    for row in rows:
        print(f"Issue {issue} -> {row['key']} (supplement={row['is_supplement']}, index={row['supplement_index']})")

print('\n--- Document Key Samples ---')
for row in run_query(
    'MATCH (p:Publication)-[:CONTAINS]->(d:Document) RETURN p.publication_key AS pk, d.document_key AS dk ORDER BY d.document_key LIMIT 3'
):
    print(f"{row['pk']} :: {row['dk']}")

print('\n--- Chunk Header Preview ---')
chunk_rows = run_query('MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk) RETURN c.text AS text LIMIT 3')
if not chunk_rows:
    print('No chunks found.')
else:
    for idx, row in enumerate(chunk_rows, start=1):
        header_lines = row['text'].splitlines()[:2]
        preview = ' | '.join(header_lines)
        print(f'Chunk {idx}: {preview[:200]}')


--- Core Counts ---
Gazettes: 1
Publications: 10 (expected 10)
Documents: 567
Chunks: 2773

--- Supplement Spot Checks ---
Issue 1736 -> 71-1736-0 (supplement=False, index=0)
Issue 1736 -> 71-1736-2 (supplement=True, index=2)
Issue 1738 -> 71-1738-0 (supplement=False, index=0)
Issue 1738 -> 71-1738-1 (supplement=True, index=1)

--- Document Key Samples ---
71-1733-0 :: 71-1733-0:Announcement:uncement
71-1733-0 :: 71-1733-0:Announcement:uncementArequesthasbeensubmittedtotheDepartmentofPartnerships
71-1733-0 :: 71-1733-0:Announcement:uncementMessrs

--- Chunk Header Preview ---
Chunk 1: [PUB=71-1736-0] [ISSUE=1736] [SUPP=0] [DATE=2025-04-27] | [DOC=Decree 71/2025] [SECTION=Council of Ministers <orig>مجلس الوزراء</orig>] [ARTICLE=1]
Chunk 2: [PUB=71-1736-0] [ISSUE=1736] [SUPP=0] [DATE=2025-04-27] | [DOC=Decree 71/2025] [SECTION=Council of Ministers <orig>مجلس الوزراء</orig>] [ARTICLE=1]
Chunk 3: [PUB=71-1736-0] [ISSUE=1736] [SUPP=0] [DATE=2025-04-27] | [DOC=Decree 71/2025] [SECTION=Counc

In [7]:
# -------------------------------------------------------------
# Supplements + latest flags + authority weights
# -------------------------------------------------------------
from textwrap import dedent

def run(q, params=None):
    return graph.query(q, params or {})

# Link supplements to their base (same volume + issue; supplement_index>0)
run(dedent('''
MATCH (base:Publication {supplement_index: 0})
MATCH (sup:Publication)
WHERE sup.volume_number = base.volume_number
  AND sup.issue_number = base.issue_number
  AND sup.supplement_index > 0
MERGE (sup)-[:SUPPLEMENT_OF]->(base)
'''))

# Mark latest per (volume, issue) and set a grouping key
run(dedent('''
MATCH (p:Publication)
WITH p.volume_number AS v, p.issue_number AS i, max(p.supplement_index) AS maxSupp
MATCH (q:Publication {volume_number: v, issue_number: i})
SET q.is_latest_for_issue = (q.supplement_index = maxSupp),
    q.issue_group_key = toString(v) + '-' + toString(i)
'''))

# Tag chunks from the latest publication for each issue
run(dedent('''
MATCH (p:Publication {is_latest_for_issue: true})-[:CONTAINS]->(:Document)-[:HAS_CHUNK]->(c:Chunk)
SET c.latest_for_issue = true
'''))

# Mirror supplement flag onto chunks
run(dedent('''
MATCH (p:Publication)-[:CONTAINS]->(:Document)-[:HAS_CHUNK]->(c:Chunk)
SET c.is_supplement = p.is_supplement
'''))

# Set authority weights by doc_type (simple heuristic)
run(dedent('''
MATCH (d:Document)
SET d.authority_weight =
  CASE d.doc_type
    WHEN 'Decree-Law' THEN 1.30
    WHEN 'Decree' THEN 1.20
    WHEN 'Law' THEN 1.20
    WHEN 'Decision' THEN 1.10
    WHEN 'Resolution' THEN 1.05
    WHEN 'Memorandum' THEN 1.00
    WHEN 'Notice' THEN 0.95
    WHEN 'Tender' THEN 0.90
    ELSE 1.00
  END
'''))

# Quick summary
sup_links = run('MATCH (:Publication)-[:SUPPLEMENT_OF]->(:Publication) RETURN count(*) AS n')[0]['n']
latest_pubs = run('MATCH (p:Publication {is_latest_for_issue:true}) RETURN count(p) AS n')[0]['n']
latest_chunks = run('MATCH (c:Chunk {latest_for_issue:true}) RETURN count(c) AS n')[0]['n']
print(f"Supplement links: {sup_links}, latest publications: {latest_pubs}, tagged chunks: {latest_chunks}")



Supplement links: 2, latest publications: 8, tagged chunks: 1781


#### First ingest all data and then create index

In [8]:
# --- Anchor extraction + indexes (minimal, persistent) ---
import re

_ARABIC_DIGITS = str.maketrans("٠١٢٣٤٥٦٧٨٩", "0123456789")

def _norm_digits(s: str) -> str:
    return (s or "").translate(_ARABIC_DIGITS)

_RX_ANCHORS = [
    re.compile(r"\bRFP[\s/-]?\d+\b", re.IGNORECASE),
    re.compile(r"\b\d{6,7}[\s/-]?RFP\b", re.IGNORECASE),
    re.compile(r"\bA/M/\d+\b", re.IGNORECASE),
    re.compile(r"\b5D[A-Z0-9]+\b", re.IGNORECASE),
    re.compile(r"\b\d{4}/\d{5}\b", re.IGNORECASE),
    re.compile(r"\bMEETING (?:MINUTES )?NO\.?\s*\d{4}/\d+\b", re.IGNORECASE),
    re.compile(r"\b(?:DECREE-?LAW|LAW)\s*(?:NO\.?\s*)?\d+/?\d{4}\b", re.IGNORECASE),
]

def extract_anchors_from_text(text: str) -> list[str]:
    t = _norm_digits(text or "")
    out: list[str] = []
    for rx in _RX_ANCHORS:
        out.extend(rx.findall(t))
    seen = set(); uniq: list[str] = []
    for a in out:
        k = a.upper()
        if k not in seen:
            seen.add(k); uniq.append(a)
    return uniq

# Compute anchors per Document from its chunks and persist
rows = graph.query(
    """
    MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
    RETURN d.document_key AS dk, collect(c.text) AS texts
    """
)
for row in rows:
    anchors = set()
    for t in row["texts"]:
        anchors.update(extract_anchors_from_text(t))
    graph.query(
        """
        MATCH (d:Document {document_key: $dk})
        SET d.anchors = $anchors
        """,
        {"dk": row["dk"], "anchors": sorted(list(anchors))},
    )

# Helpful fulltext/property indexes (idempotent)
run = (lambda q, p=None: graph.query(q, p or {}))
run("CREATE FULLTEXT INDEX chunk_text_idx IF NOT EXISTS FOR (c:Chunk) ON EACH [c.text]")
run("CREATE FULLTEXT INDEX doc_text_idx   IF NOT EXISTS FOR (d:Document) ON EACH [d.title, d.section_heading, d.doc_number, d.document_key]")
run("CREATE FULLTEXT INDEX doc_anchors_idx IF NOT EXISTS FOR (d:Document) ON EACH [d.anchors]")
run("CREATE INDEX doc_pub_key_idx IF NOT EXISTS FOR (d:Document) ON (d.publication_key)")



[]

### Embed Chunks

In [9]:
# Create Vector Index
create_vector_index(graph=graph, index_name='Chunk')

# Embed Chunks
embed_text(graph=graph, OPENAI_API_KEY=openAI_api, OPENAI_ENDPOINT=openAI_endpoint, node_name='Chunk', model_name=openAI_model, max_workers=40)

Starting embedding update...
Found 2773 nodes without embeddings.


Embedding nodes: 100%|██████████████████████████████████████████| 2773/2773 [00:29<00:00, 93.32it/s]

Finished embedding update.



