In [46]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions, PictureDescriptionApiOptions
from docling.datamodel.base_models import InputFormat
import os
from pathlib import Path
from typing import List
from settings import AIConfig
import pandas as pd

In [48]:
source = "./data/System.pdf"  # file path or URL

# source = "https://arxiv.org/pdf/2408.09869"
# picture_desc_api_option = PictureDescriptionApiOptions(
#     url=os.environ.get('OLLAMA_BASE_URL'),
#     prompt="Describe the content of this image in a single paragraph.",
#     params=dict(model="ollama:ministral-3:8b", temperature=0.2),
#     timeout=60
# )

# Configure PdfPipelineOptions for OCR with Tesseract CLI
pipeline_options = PdfPipelineOptions(
    # do_picture_description=True,
    # picture_description_api_option=picture_desc_api_option,
    # generate_picture_images=True,
    # enable_remote_services=True,
    # do_ocr=False,
    # images_scale=2,
    # ocr_options=TesseractCliOcrOptions(lang=["eng"])
)

# Initialize DocumentConverter with the configured options
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options
        )
    }
)



In [49]:
doc = converter.convert(source)
markdown = doc.document.export_to_markdown()
output_path = "./data/System.md"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(markdown)
print(f"Saved markdown to {output_path}")

2025-12-07 00:16:00,404 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 00:16:00,581 - INFO - Going to convert document batch...
2025-12-07 00:16:00,586 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2025-12-07 00:16:00,701 - INFO - Loading plugin 'docling_defaults'
2025-12-07 00:16:00,743 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-07 00:16:01,917 - INFO - Auto OCR model selected ocrmac.
2025-12-07 00:16:01,924 - INFO - Loading plugin 'docling_defaults'
2025-12-07 00:16:01,941 - INFO - Registered layout engines: ['docling_layout_default', 'docling_experimental_table_crops_layout']
2025-12-07 00:16:01,970 - INFO - Accelerator device: 'mps'
2025-12-07 00:16:35,261 - INFO - Loading plugin 'docling_defaults'
2025-12-07 00:16:35,271 - INFO - Registered table structure engines: ['docling_tableformer']
2025-12-07 00:16:36,243 - INFO - Accelerator device

KeyboardInterrupt: 

In [41]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from docling.datamodel.base_models import InputFormat
MD_FILE_PATH = "./data/M2.md"  # Your converted markdown file
DB_URI = "./lancedb_data"    # This will create a local folder
TABLE_NAME = "engineering_docs"

embedding_func = get_registry().get("sentence-transformers").create()

class DocChunk(LanceModel):
    """
    Pydantic schema for LanceDB.
    """
    text: str = embedding_func.SourceField() # The text to be embedded
    vector: Vector(embedding_func.ndims()) = embedding_func.VectorField() # type: ignore # Auto-generated
    
    # Metadata fields
    filename: str
    chunk_index: int
    chunk_type: str  # e.g., "text", "table", "code"

db = lancedb.connect("./lancedb_data")
table_name = "engineering_notes"

table = db.create_table(table_name, schema=DocChunk, mode="overwrite")
def process_and_store_md(file_path: str):
    print(f"Processing: {file_path}")
    converter = DocumentConverter()
    result = converter.convert(file_path)
    doc = result.document
    chunker = HybridChunker(
        max_tokens=800, # Increased slightly as Gemini handles larger contexts well
        merge_peers=True,
    )

    chunk_iter = chunker.chunk(doc)
    data_to_ingest = []
    for i, chunk in enumerate(chunk_iter):
        headers = [h for h in chunk.meta.headings]
        hierarchy_path = " > ".join(headers) if headers else "Root"
        content_type = "mixed"
        if "```" in chunk.text:
            content_type = "code"
        elif "|" in chunk.text and "-|-" in chunk.text:
            content_type = "table"

        entry = {
            "text": chunk.text,
            "filename": os.path.basename("M2.md"),
            "chunk_type": content_type,
            "chunk_index": i,
        }
        data_to_ingest.append(entry)
    if data_to_ingest:
        table.add(data_to_ingest)
        print(f"Successfully added {len(data_to_ingest)} chunks to LanceDB.")
    else:
        print("No chunks generated.")
    table.create_fts_index("text")

Batches: 100%|██████████| 1/1 [00:00<00:00, 83.97it/s]


In [42]:
process_and_store_md(MD_FILE_PATH)

2025-12-06 18:21:57,076 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-12-06 18:21:57,078 - INFO - Going to convert document batch...
2025-12-06 18:21:57,080 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-12-06 18:21:57,083 - INFO - Processing document M2.md


Processing: ./data/M2.md


2025-12-06 18:21:57,449 - INFO - Finished converting document M2.md in 0.38 sec.
Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
Batches: 100%|██████████| 2/2 [00:00<00:00,  4.09it/s]

Successfully added 38 chunks to LanceDB.





In [38]:
res = table.search("Explain spiral model").distance_type("cosine").limit(5)
res.to_pandas()['text']

Batches: 100%|██████████| 1/1 [00:00<00:00, 48.94it/s]


0    - Originally proposed by Barry Boehm the spira...
1    - Scrum is an agile method that focuses on man...
2    - The four basic process activities of specifi...
3    The stages of the waterfall model directly ref...
4    - Product development where a software company...
Name: text, dtype: object

In [None]:
from lancedb.rerankers import RRFReranker
reranker = RRFReranker()
results = (
    table.search(
        "Different types of software development methodologies",
        query_type="hybrid",
        vector_column_name="vector",
        fts_columns="text",
    )
    .rerank(reranker)
    .limit(10)
    .to_list()
)

print("Hybrid search results:")
print(results)

Batches: 100%|██████████| 1/1 [00:00<00:00, 35.10it/s]

Hybrid search results:
                                                text  \
0  - The four basic process activities of specifi...   
1  There are two kinds of software product:\n1. G...   
2  - The principal responsibility of software pro...   
3  - Product development where a software company...   
4  What is software?, Answer = Computer programs ...   
5  - Agile methods are incremental development me...   
6  1. Heterogeneity Increasingly, systems are req...   
7  The systematic approach that is used in softwa...   
8  - Dissatisfaction with the overheads involved ...   
9  In the image, we can see the words "Concurrent...   

                                              vector filename  chunk_index  \
0  [0.008744788, -0.015853986, -0.018221293, -0.0...    M2.md           11   
1  [-0.049120296, -0.051378325, 0.04316983, -0.07...    M2.md            1   
2  [-0.055930406, 0.038180914, 0.007761431, 0.033...    M2.md           32   
3  [0.008349403, -0.0031605244, -0.028572725, -0


