In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.docling import DoclingReader
from llama_index.node_parser.docling import DoclingNodeParser

reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
node_parser = DoclingNodeParser()   

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
documents = SimpleDirectoryReader(
    input_dir="docs",
    file_extractor={".pdf": reader},
).load_data(show_progress=True)

Loading files:   0%|          | 0/1 [00:00<?, ?it/s]2025-09-17 00:05:18,269 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-17 00:05:18,308 - INFO - Going to convert document batch...
2025-09-17 00:05:18,309 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347
2025-09-17 00:05:18,314 - INFO - Loading plugin 'docling_defaults'
2025-09-17 00:05:18,315 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-09-17 00:05:18,319 - INFO - Loading plugin 'docling_defaults'
2025-09-17 00:05:18,321 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-09-17 00:05:18,476 - INFO - Accelerator device: 'mps'
2025-09-17 00:05:20,600 - INFO - Accelerator device: 'mps'
2025-09-17 00:05:21,705 - INFO - Accelerator device: 'mps'
2025-09-17 00:05:22,194 - INFO - Processing document Abu Dhabi Procurement Standards.PDF
2025-09-17 00:05:59,008 - INFO - Finished converting document Abu Dhabi Pr

In [3]:
node_parser.get_nodes_from_documents(documents)[0]

TextNode(id_='4a0ab297-4929-4a00-aa74-52f877e4660b', embedding=None, metadata={'file_path': '/Users/sayedameer/Projects/interviews/northbay/genai-poc/indexer/docs/Abu Dhabi Procurement Standards.PDF', 'file_name': 'Abu Dhabi Procurement Standards.PDF', 'file_type': 'application/pdf', 'file_size': 1450487, 'creation_date': '2025-09-09', 'last_modified_date': '2025-09-09', 'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/tables/0', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'table', 'prov': [{'page_no': 2, 'bbox': {'l': 44.45536422729492, 't': 710.9623718261719, 'r': 517.1547241210938, 'b': 637.5954895019531, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 0]}]}], 'headings': ['Abu Dhabi Procurement Standards'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 13835862547732020269, 'filename': 'Abu Dhabi Procurement Standards.PDF'}}, excluded_embed_metadata_keys=['schema_name', 'version

In [3]:
from pgvector.sqlalchemy import Vector
from sqlalchemy import insert, create_engine, String, text, Integer
from sqlalchemy.orm import declarative_base, mapped_column

In [4]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.postgres import PGVectorStore
import textwrap

In [5]:
import psycopg2

connection_string = "postgresql://postgres:password@localhost:5432"
db_name = "vector_db"
conn = psycopg2.connect(connection_string)
conn.autocommit = True

with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")

In [6]:
from llama_index.embeddings.ollama import OllamaEmbedding
EMBED_MODEL = OllamaEmbedding(model_name="nomic-embed-text")

In [7]:
from sqlalchemy import make_url

url = make_url(connection_string)
vector_store = PGVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="proc_docs",
    embed_dim=768,
    hybrid_search=True,
    text_search_config="english",
    hnsw_kwargs={
        "hnsw_m": 16,
        "hnsw_ef_construction": 64,
        "hnsw_ef_search": 40,
        "hnsw_dist_method": "vector_cosine_ops",
    },
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

hybrid_index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    transformations=[node_parser],
    embed_model=EMBED_MODEL,
    show_progress=True
)

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00,  6.34it/s]
Generating embeddings:   0%|          | 0/191 [00:00<?, ?it/s]2025-09-17 00:08:02,527 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:08:02,551 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:08:02,574 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:08:02,603 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:08:02,632 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:08:02,663 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:08:02,693 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:08:02,723 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:08:02,751 - INFO - HTTP Request: POST h

In [9]:
from llama_index.llms.ollama import Ollama
GEN_MODEL = Ollama(model="qwen3:4b-instruct-2507-q8_0", temperature=0.7, keep_alive=True, context_window=2048)


In [None]:
# query_engine = index.as_query_engine(llm=GEN_MODEL)


In [11]:
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

vector_retriever = hybrid_index.as_retriever(
    vector_store_query_mode="default",
    similarity_top_k=5,
)
text_retriever = hybrid_index.as_retriever(
    vector_store_query_mode="sparse",
    similarity_top_k=5,  # interchangeable with sparse_top_k in this context
)
retriever = QueryFusionRetriever(
    [vector_retriever, text_retriever],
    llm=GEN_MODEL,
    similarity_top_k=5,
    num_queries=1,  # set this to 1 to disable query generation
    mode="relative_score",
    use_async=False,
)

response_synthesizer = CompactAndRefine( llm=GEN_MODEL)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [17]:
QUERY= """
Describe the relationship between "Intellectual Property created during the tenure and framework of the contract" (Foreground Intellectual Property) and the "relevant Intellectual Property supplied by the contracting parties at the beginning of the engagement" (Background Intellectual Property), as defined in the standards.


"""
result = query_engine.query(QUERY)
print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
display([(n.text, n.metadata) for n in result.source_nodes])

2025-09-17 00:26:21,175 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2025-09-17 00:26:30,862 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


Q: 
Describe the relationship between "Intellectual Property created during the tenure and framework of the contract" (Foreground Intellectual Property) and the "relevant Intellectual Property supplied by the contracting parties at the beginning of the engagement" (Background Intellectual Property), as defined in the standards.



A: Foreground Intellectual Property refers to intellectual property created during the course of the contract, while Background Intellectual Property refers to intellectual property that is provided by the contracting parties at the start of the engagement. The standards emphasize that procurement must collaborate with end-users to identify both types of intellectual property. Background Intellectual Property is relevant at the outset and may require justification for transfer, licensing, retention, or sharing, supported by a business case. Foreground Intellectual Property, being generated during the contract, is protected through contractual provisions that 

[('Bid Bonds, 1 = A guarantee required for the bidding process to mitigate bidding risks. A bid bond ensures that the awarded Supplier will fulfil the commitments of the accepted bid.. Single Award, 1 = An award made to one Supplier.. Split Award, 1 = An award made to every selected Supplier for each part of a divided solicitation.. Primary and Secondary Award, 1 = A type of award where in addition to the selected primary source award is also made to a secondary or backup source of supply.. Multiple Awards, 1 = Awards madetomultiple sources of supply for the samematerials, services and projects.. Price List Agreement, 1 = A type of framework agreement for establishing the terms governing contracts to be awarded during a given period, in particular with regard to the price of goods, services and projects.. Performance Bonds, 1 = A guarantee issued to provide security for satisfactory completion of a contract. A performance bond ensures payment of a sum of money in case of a failure of a