In [None]:
import os
from unstructured.partition.pdf import partition_pdf

def extract_pdf_data(file_path):
    print(f"ðŸš€ Processing: {file_path} (This uses OCR and may take time...)")
    
    # --- CORE EXTRACTION ---
    elements = partition_pdf(
        filename=file_path,
        strategy="hi_res",
        infer_table_structure=True,
        extract_images_in_pdf=True,
        extract_image_block_types=["Image", "Table"],
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3200,
        combine_text_under_n_chars=800,
        languages=["eng", "ind"]
    )
    
    return elements

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import re

def clean_text(text):
    if not text:
        return ""
    
    # Remove Page Markers (e.g., "--- PAGE 100 ---")
    text = re.sub(r'---\s*PAGE\s*\d+\s*---', '', text, flags=re.IGNORECASE)
    
    # Remove Image Placeholders (e.g., "[Image 184]")
    text = re.sub(r'\[Image\s*\d+\]', '', text)
    
    # Fix OCR Currency Glitches (The "RpRp" problem)
    text = re.sub(r'(Rp\.?\s*){2,}', 'Rp ', text, flags=re.IGNORECASE)
    text = re.sub(r'Rp\.(\d)', r'Rp \1', text, flags=re.IGNORECASE)

    # Remove excessive whitespace / headers
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Filter out navigational noise
    noise_phrases = ["Table of CONTENTS", "Back to top", "ERHA ULTIMATE PRICE LIST"]
    for phrase in noise_phrases:
        text = text.replace(phrase, "")
        
    return text.strip()

def enrich_price_data(text):
    price_pattern = r'Rp\s*[\d\.]+(?:,\d{2})?'
    matches = re.findall(price_pattern, text)
    
    if matches:
        return f"[PRICELIST ITEM] {text}"
    return text

In [3]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance
from dotenv import load_dotenv
load_dotenv()

qdrant_host = "http://localhost:6333"
collection_name = os.getenv("QDRANT_COLLECTION")
qdrant_client = QdrantClient(url=qdrant_host)
collections = qdrant_client.get_collections()
existing_collections = [collection.name for collection in collections.collections]

if collection_name not in existing_collections:
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE)
    )

In [4]:
import uuid
from langchain_ollama.embeddings import OllamaEmbeddings
from qdrant_client.models import PointStruct

ollama_embedding = OllamaEmbeddings(
    model=os.getenv("OLLAMA_EMBEDDING_MODEL"),
    temperature=0
)

for fname in os.listdir("../data/pdf"):
    fpath = os.path.join("../data/pdf", fname)
    
    elements = extract_pdf_data(fpath)

    points = []
    print(f"   found {len(elements)} chunks. Embedding...")

    for i, el in enumerate(elements):
        text_content = el.text

        cleaned_text = clean_text(text_content)    
        enriched_price = enrich_price_data(cleaned_text)
        vector = ollama_embedding.embed_query(enriched_price)

        payload = {
            "full_text": text_content,
            "h1": str(el.metadata.to_dict().get("filename", "Unknown File")),
            "type": el.category,
            "page": el.metadata.page_number
        }

        points.append(PointStruct(
            id=str(uuid.uuid4()),
            vector=vector,
            payload=payload
        ))

    if points:
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
        print(f"   âœ… Uploaded {len(points)} vectors.")

ðŸš€ Processing: ../data/pdf/724126666-ERHA-Ultimate-Pricelist-24.pdf (This uses OCR and may take time...)


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


   found 96 chunks. Embedding...
   âœ… Uploaded 96 vectors.
ðŸš€ Processing: ../data/pdf/661627558-Katalog-Dermies-Max-by-Erha-Hiress.pdf (This uses OCR and may take time...)
   found 21 chunks. Embedding...
   âœ… Uploaded 21 vectors.
