In [1]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
import os
from pathlib import Path
from typing import List
from settings import AIConfig
from dotenv import load_dotenv
import pandas as pd

# Load variables from a local .env file into the process environment; later
# cells read GOOGLE_API_KEY / GEMINI_API_KEY from os.environ.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [None]:
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions

In [None]:
source = "./data/M2.pdf"  # file path or URL

# source = "https://arxiv.org/pdf/2408.09869"

# Options for describing pictures through a local OpenAI-compatible server
# (presumably Ollama, judging by the model name — confirm). Docling POSTs to
# `url` exactly as given, so it must be the full chat-completions endpoint and
# not just the `/v1` base URL.
# NOTE: this object only takes effect once the picture-description settings
# in `pipeline_options` below are uncommented.
picture_desc_api_option = PictureDescriptionApiOptions(
    url="http://localhost:11434/v1/chat/completions",
    prompt="Describe the content of this image in a single paragraph.",
    params=dict(model="ollama:ministral-3:8b", temperature=0.2),
    timeout=60,
)

# PDF pipeline configuration: OCR is switched off (do_ocr=False) and the
# accelerator is pinned to the MPS device.
pipeline_options = PdfPipelineOptions(
    accelerator_options=AcceleratorOptions(device=AcceleratorDevice.MPS),
    # To enable picture descriptions, uncomment the next four settings:
    # do_picture_description=True,
    # picture_description_options=picture_desc_api_option,
    # generate_picture_images=True,
    # enable_remote_services=True,
    do_ocr=False,
    # images_scale=2,
    # To OCR with the Tesseract CLI instead, set do_ocr=True and uncomment:
    # ocr_options=TesseractCliOcrOptions(lang=["eng"])
)

# Converter that applies the options above to PDF inputs.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)



In [None]:
import torch

# Report, one value per line, whether a CUDA device is usable and which CUDA
# version this torch build reports.
print(torch.cuda.is_available(), torch.version.cuda, sep="\n")

In [None]:
# Convert the source document, render it as Markdown and persist it as UTF-8.
doc = converter.convert(source)
markdown = doc.document.export_to_markdown()
output_path = "./data/M3.md"
Path(output_path).write_text(markdown, encoding="utf-8")
print(f"Saved markdown to {output_path}")

In [10]:

# Expose the Google key under the GEMINI_API_KEY name as well.
# os.environ only accepts str values, so assigning the result of .get() for a
# missing GOOGLE_API_KEY would raise TypeError; copy only when a value exists.
_google_api_key = os.environ.get("GOOGLE_API_KEY")
if _google_api_key:
    os.environ["GEMINI_API_KEY"] = _google_api_key

# Report presence only. Never print the secret itself: cell outputs are saved
# together with the notebook and would leak the key.
for _var_name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
    print(f"{_var_name}: {'set' if os.environ.get(_var_name) else 'missing'}")

[REDACTED: API key removed from saved cell output — rotate this key]
[REDACTED: API key removed from saved cell output — rotate this key]


In [None]:
import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
import google.generativeai as genai

from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
import time
from docling.datamodel.base_models import InputFormat
import numpy as np
import os

# Ingestion input and PostgreSQL connection settings.
# Every DB_* value can be overridden by an environment variable of the same
# name (for example from the .env file); the fallbacks are the original
# local-development values, so behaviour is unchanged when nothing is set.
MD_FILE_PATH = "./data/M3.md"
DB_NAME = os.environ.get("DB_NAME", "vectordb")
DB_USER = os.environ.get("DB_USER", "postgres")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "password")  # do not commit real passwords
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "5432")
# Interpolated directly into SQL below, so keep it a trusted constant.
TABLE_NAME = "engineering_notes"


# Authenticate the Gemini client, preferring GEMINI_API_KEY over GOOGLE_API_KEY.
genai.configure(
    api_key=os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
)

# One autocommit connection and cursor, shared by the rest of the notebook
# through the module-level names `conn` and `cur`.
conn = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
)
conn.autocommit = True
cur = conn.cursor()

# Make sure the pgvector extension exists, then register the vector type
# adapter on this connection.
cur.execute("""
CREATE EXTENSION IF NOT EXISTS vector;
""")
register_vector(conn)


# (Re)create the chunk table from scratch.
# WARNING: DROP TABLE discards every row ingested by an earlier run of this cell.
# The `vector` extension is already created earlier in this cell, so it is not
# repeated here.
cur.execute(f"""
    DROP TABLE IF EXISTS {TABLE_NAME};

    CREATE TABLE {TABLE_NAME} (
        id SERIAL PRIMARY KEY,
        text TEXT NOT NULL,
        embedding vector(768),  -- Gemini text-embedding-004 produces 768-dimensional vectors
        filename TEXT,
        chunk_index INTEGER,
        chunk_type TEXT
    );

    -- The index is created while the table is still empty. IVFFlat derives its
    -- lists from the rows present at build time, so building it here gives poor
    -- recall; HNSW has no training step and can be built before any data exists.
    CREATE INDEX ON {TABLE_NAME} USING hnsw (embedding vector_cosine_ops);
""")

print(f"Table '{TABLE_NAME}' created successfully.")

def get_embedding(text: str, task_type: str = "retrieval_document") -> list:
    """Embed a piece of text with Gemini's ``text-embedding-004`` model.

    Args:
        text: The text to embed.
        task_type: Embedding task hint forwarded to ``genai.embed_content``.
            Defaults to ``"retrieval_document"`` (the original behaviour, meant
            for text stored in the index); pass ``"retrieval_query"`` when
            embedding a search query.

    Returns:
        The value stored under the ``'embedding'`` key of the API response.

    Raises:
        Whatever ``genai.embed_content`` raises (e.g. on auth or quota errors);
        nothing is caught here.
    """
    result = genai.embed_content(
        model="models/text-embedding-004",
        content=text,
        task_type=task_type,
    )
    return result['embedding']

def _classify_chunk(text: str) -> str:
    """Label chunk text as "code", "table" or "text" from simple Markdown markers."""
    if "```" in text:
        return "code"
    if "|" in text and "-|-" in text:
        return "table"
    return "text"


def process_and_store_md(file_path: str) -> None:
    """Chunk a document, embed each chunk and insert the rows into ``TABLE_NAME``.

    The file is converted with a default ``DocumentConverter``, split with
    ``HybridChunker``, and every non-blank chunk is embedded via
    ``get_embedding``. Rows are written through the module-level cursor ``cur``
    in batches of 48.

    Args:
        file_path: Path of the document to ingest. Its base name is stored in
            the ``filename`` column.

    Returns:
        None. Progress is reported with ``print``.
    """
    print(f"Processing: {file_path}")
    converter = DocumentConverter()
    result = converter.convert(file_path)
    doc = result.document
    # NOTE(review): the tokenizer warning in the saved output (548 > 512)
    # suggests max_tokens=800 exceeds the chunker's default tokenizer limit, and
    # `overlap_tokens` may not be an option HybridChunker honours — verify both
    # against the installed docling version.
    chunker = HybridChunker(max_tokens=800, overlap_tokens=100, merge_peers=True)

    # Derive the stored name from the argument instead of a hard-coded "M3.md",
    # so ingesting another file records the correct source.
    filename = os.path.basename(file_path)

    data_to_ingest = []
    for chunk_index, chunk in enumerate(chunker.chunk(doc)):
        # Skip blank chunks rather than sending empty content to the embedding
        # API. The index still comes from enumerate, so numbering stays aligned
        # with the chunker's output.
        if not chunk.text.strip():
            continue
        data_to_ingest.append(
            (
                chunk.text,
                get_embedding(chunk.text),
                filename,
                chunk_index,
                _classify_chunk(chunk.text),
            )
        )

    if not data_to_ingest:
        print("No chunks generated.")
        return

    batch_size = 48
    for start in range(0, len(data_to_ingest), batch_size):
        execute_values(
            cur,
            f"INSERT INTO {TABLE_NAME} (text, embedding, filename, chunk_index, chunk_type) VALUES %s",
            data_to_ingest[start:start + batch_size],
        )
        # NOTE(review): this pause follows each DB insert batch, not the
        # embedding API calls above — confirm which one was meant to be throttled.
        time.sleep(0.5)
    print(f"Successfully added {len(data_to_ingest)} chunks to PostgreSQL.")


Table 'engineering_notes' created successfully.


In [36]:
process_and_store_md(MD_FILE_PATH)

2025-12-08 15:14:51,985 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-12-08 15:14:51,993 - INFO - Going to convert document batch...
2025-12-08 15:14:51,993 - INFO - Going to convert document batch...
2025-12-08 15:14:51,994 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-12-08 15:14:51,994 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-12-08 15:14:51,996 - INFO - Processing document M3.md
2025-12-08 15:14:51,996 - INFO - Processing document M3.md


Processing: ./data/M3.md


2025-12-08 15:14:52,670 - INFO - Finished converting document M3.md in 0.69 sec.
Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors


Successfully added 85 chunks to PostgreSQL.


In [45]:
# Nearest-neighbour lookup: the rows are ordered by `embedding <=> query`, and
# `1 - (embedding <=> query)` is reported as the similarity column.
# NOTE(review): get_embedding(query) embeds the query with
# task_type="retrieval_document"; consider a query-specific task type.
query = "Explain OSI networking"
query_embedding = get_embedding(query)

# The statement has two %s placeholders, both bound to the same query vector.
search_params = (query_embedding,) * 2
cur.execute(f"""
    SELECT text, filename, chunk_index, chunk_type,
           1 - (embedding <=> %s::vector) as similarity
    FROM {TABLE_NAME}
    ORDER BY embedding <=> %s::vector
    LIMIT 5
""", search_params)

results = cur.fetchall()
for row in results:
    print(row)

('The data link layer uses the services of the physical layer to send and receive bits over communication channels.\n1. Providing a well-defined service interface to the network layer.\nIt has a number of functions, including:\n1. Dealing with transmission errors.\n2. Regulating the flow of data so that slow receivers are not swamped by fast senders. To accomplish these goals, the data link layer takes the packets it gets from the network layer and encapsulates them into frames for transmission.', 'M3.md', 0, 'text', 0.7581933079601433)
('IEEE Project 802 has created a sublayer called media access control that defines the specific access method for each LAN.\nFor example, it defines CSMA/CD as the media access method for Ethernet LANs and the token passing method for Token Ring and Token Bus LANs. A part of the framing function is also handled by the MAC layer. The MAC sublayer contains a number of distinct modules; each defines the access method and the framing format specific to the 

In [43]:
# Pure vector search. PostgreSQL has no built-in hybrid search like LanceDB, so
# full-text search would have to be implemented separately and merged with
# these results.
# NOTE(review): get_embedding(query) embeds the query with
# task_type="retrieval_document"; consider a query-specific task type.
query = "Explain Logical Link Control"
query_embedding = get_embedding(query)

# The statement has two %s placeholders, both bound to the same query vector.
search_params = (query_embedding,) * 2
cur.execute(f"""
    SELECT text, filename, chunk_index, chunk_type,
           1 - (embedding <=> %s::vector) as similarity
    FROM {TABLE_NAME}
    ORDER BY embedding <=> %s::vector
    LIMIT 10
""", search_params)

results = cur.fetchall()
print("Vector search results:")
for i, row in enumerate(results, start=1):
    text, filename, chunk_index, chunk_type, similarity = row
    print(f"\n--- Result {i} (similarity: {similarity:.4f}) ---")
    print(f"File: {filename}, Chunk: {chunk_index}, Type: {chunk_type}")
    # Only the first 200 characters of each chunk are shown.
    print(f"Text: {text[:200]}...")


Vector search results:

--- Result 1 (similarity: 0.7179) ---
File: M3.md, Chunk: 36, Type: text
Text: 1. Perform control of access to media
2. Perform unique a addressing to station directly connected to LANS
3. 3.Error detection...

--- Result 2 (similarity: 0.7003) ---
File: M3.md, Chunk: 34, Type: text
Text: Data link control handles framing, flow control, and error control.
In IEEE Project 802, flow control, error control,and part of the framing duties are collected intoone sublayer called thelogicallink...

--- Result 3 (similarity: 0.6736) ---
File: M3.md, Chunk: 0, Type: text
Text: The data link layer uses the services of the physical layer to send and receive bits over communication channels.
1. Providing a well-defined service interface to the network layer.
It has a number of...

--- Result 4 (similarity: 0.6682) ---
File: M3.md, Chunk: 72, Type: text
Text: The physical layer corresponds to the OSI physical layer.
The data link layer in all the 802 protocols is split into tw