In [1]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
import os
from pathlib import Path
from typing import List
from settings import AIConfig
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions

In [16]:
source = "./data/M2.pdf"  # file path or URL

# source = "https://arxiv.org/pdf/2408.09869"
picture_desc_api_option = PictureDescriptionApiOptions(
    url="http://localhost:11434/v1",
    prompt="Describe the content of this image in a single paragraph.",
    params=dict(model="ollama:ministral-3:8b", temperature=0.2),
    timeout=60
)

# Configure PdfPipelineOptions for OCR with Tesseract CLI
pipeline_options = PdfPipelineOptions(
    accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CUDA),
    do_picture_description=True,
    picture_description_api_option=picture_desc_api_option,
    generate_picture_images=True,
    enable_remote_services=True,
    # do_ocr=False,
    images_scale=2,
    # ocr_options=TesseractCliOcrOptions(lang=["eng"])
)

# Initialize DocumentConverter with the configured options
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            
        )
    }
)



In [17]:
import torch
print(torch.cuda.is_available())
print(torch.version.cuda)

True
12.1


In [4]:
doc = converter.convert(source)
markdown = doc.document.export_to_markdown()
output_path = "./data/M3.md"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(markdown)
print(f"Saved markdown to {output_path}")

2025-12-06 19:53:37,205 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-06 19:53:37,610 - INFO - Going to convert document batch...
2025-12-06 19:53:37,612 - INFO - Processing document M2.pdf
2025-12-06 19:54:47,841 - INFO - Finished converting document M2.pdf in 70.80 sec.


Saved markdown to ./data/M3.md


In [5]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from docling.datamodel.base_models import InputFormat
MD_FILE_PATH = "./data/M3.md"  # Your converted markdown file
DB_URI = "./lancedb_data"    # This will create a local folder
TABLE_NAME = "engineering_docs"

embedding_func = get_registry().get("sentence-transformers").create()

class DocChunk(LanceModel):
    """
    Pydantic schema for LanceDB.
    """
    text: str = embedding_func.SourceField() # The text to be embedded
    vector: Vector(embedding_func.ndims()) = embedding_func.VectorField() # type: ignore # Auto-generated
    
    # Metadata fields
    filename: str
    chunk_index: int
    chunk_type: str  # e.g., "text", "table", "code"

db = lancedb.connect("./lancedb_data")
table_name = "engineering_notes"

table = db.create_table(table_name, schema=DocChunk, mode="overwrite")
def process_and_store_md(file_path: str):
    print(f"Processing: {file_path}")
    converter = DocumentConverter()
    result = converter.convert(file_path)
    doc = result.document
    chunker = HybridChunker(
        max_tokens=512,
        merge_peers=True,
    )

    chunk_iter = chunker.chunk(doc)
    data_to_ingest = []
    for i, chunk in enumerate(chunk_iter):
        headers = [h for h in chunk.meta.headings]
        hierarchy_path = " > ".join(headers) if headers else "Root"
        content_type = "mixed"
        if "```" in chunk.text:
            content_type = "code"
        elif "|" in chunk.text and "-|-" in chunk.text:
            content_type = "table"

        entry = {
            "text": chunk.text,
            "filename": os.path.basename("M2.md"),
            "chunk_type": content_type,
            "chunk_index": i,
        }
        data_to_ingest.append(entry)
    if data_to_ingest:
        table.add(data_to_ingest)
        print(f"Successfully added {len(data_to_ingest)} chunks to LanceDB.")
    else:
        print("No chunks generated.")
    table.create_fts_index("text")

2025-12-06 19:56:24,720 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.77it/s]


In [6]:
process_and_store_md(MD_FILE_PATH)

2025-12-06 19:56:59,629 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-12-06 19:56:59,631 - INFO - Going to convert document batch...
2025-12-06 19:56:59,632 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-12-06 19:56:59,632 - INFO - Processing document M3.md


Processing: ./data/M3.md


2025-12-06 19:57:06,388 - INFO - Finished converting document M3.md in 6.75 sec.
Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
Batches: 100%|██████████| 3/3 [00:02<00:00,  1.33it/s]


Successfully added 91 chunks to LanceDB.


In [7]:
res = table.search("Explain selective repeat").distance_type("cosine").limit(5)
res.to_pandas()['text']

Batches: 100%|██████████| 1/1 [00:00<00:00,  9.61it/s]


0    The main issue is complexity at the sender and...
1    1-persistent CSMA - When a station has data to...
2    This technique allows data frames to contain a...
3    In Go back N, sender window size is N and rece...
4    A method for doubling the capacity of an ALOHA...
Name: text, dtype: object

In [12]:
from lancedb.rerankers import RRFReranker
reranker = RRFReranker()
results = (
    table.search(
        "Explain Logical Link Control",
        query_type="hybrid",
        vector_column_name="vector",
        fts_columns="text",
    )
    .rerank(reranker)
    .limit(10)
    .to_pandas()
)

print("Hybrid search results:")
print(results)

Batches: 100%|██████████| 1/1 [00:00<00:00, 132.77it/s]

Hybrid search results:
                                                text  \
0  The physical layer corresponds to the OSI phys...   
1  - physical radio layer, deals with radio trans...   
2  1. A framing method - that shows the end of on...   
3  PPP -Point to Point Protocol\nHDLC - High leve...   
4  Data link control handles framing, flow contro...   
5  Networks can be divided into two categories:\n...   
6  The data link layer uses the services of the p...   
7  - Start frame delimiter (SFD)-Alerts each stat...   
8  The data link layer have to detect and, if nec...   
9  The principal service is transferring data fro...   

                                              vector filename  chunk_index  \
0  [-0.04163005, -0.0043371688, 0.009071809, 0.02...    M2.md           74   
1  [-0.07774799, -0.041657556, -0.07229016, 0.038...    M2.md           86   
2  [-0.04260041, 0.031152837, -0.042637832, -0.05...    M2.md           28   
3  [-0.058674913, -0.05213298, -0.08557983, -0.0


