<a href="https://colab.research.google.com/github/smdbg/colab/blob/main/Step_3_Vector_DB_open.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
import shutil
from typing import Dict, List, Tuple

# pip install lancedb docling openai tiktoken python-dotenv transformers

import lancedb
import tiktoken
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import OpenAI
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

# --- CONFIGURATION ---
OPENAI_API_KEY = ""
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Initialize OpenAI client
client = OpenAI()

# --- CUSTOM TOKENIZER WRAPPER ---
class OpenAITokenizerWrapper(PreTrainedTokenizerBase):
    """Minimal wrapper for OpenAI's tokenizer to match HuggingFace interface."""

    def __init__(
        self, model_name: str = "cl100k_base", max_length: int = 8191, **kwargs
    ):
        super().__init__(model_max_length=max_length, **kwargs)
        self.tokenizer = tiktoken.get_encoding(model_name)
        self._vocab_size = self.tokenizer.max_token_value

    def tokenize(self, text: str, **kwargs) -> List[str]:
        return [str(t) for t in self.tokenizer.encode(text)]

    def _tokenize(self, text: str) -> List[str]:
        return self.tokenize(text)

    def _convert_token_to_id(self, token: str) -> int:
        return int(token)

    def _convert_id_to_token(self, index: int) -> str:
        return str(index)

    # --- FIX: –î–æ–±–∞–≤—è–º–µ –ª–∏–ø—Å–≤–∞—â–∏—è –º–µ—Ç–æ–¥ __len__ ---
    def __len__(self) -> int:
        return self._vocab_size

    def get_vocab(self) -> Dict[str, int]:
        return dict(enumerate(range(self.vocab_size)))

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    def save_vocabulary(self, *args) -> Tuple[str]:
        return ()

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        return cls()

# --- MAIN PIPELINE ---

# 1. Setup Tokenizer
# text-embedding-3-large –ø–æ–¥–¥—ä—Ä–∂–∞ 8191 —Ç–æ–∫–µ–Ω–∞
tokenizer = OpenAITokenizerWrapper(model_name="cl100k_base", max_length=8191)
MAX_TOKENS = 8191

# 2. Extract Data
print("‚è≥ Converting PDF...")
converter = DocumentConverter()
result = converter.convert("https://arxiv.org/pdf/2408.09869")

# 3. Chunking
print("üî™ Chunking...")
chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=MAX_TOKENS,
    merge_peers=True,
)

chunk_iter = chunker.chunk(dl_doc=result.document)
chunks = list(chunk_iter)
print(f"‚úÖ Created {len(chunks)} chunks.")

# 4. LanceDB Setup
if os.path.exists("data/lancedb"):
    shutil.rmtree("data/lancedb")

db = lancedb.connect("data/lancedb")

# 5. OpenAI Embedding Function (Native)
# LanceDB –∞–≤—Ç–æ–º–∞—Ç–∏—á–Ω–æ —â–µ –∏–∑–ø–æ–ª–∑–≤–∞ –∫–ª—é—á–∞ –æ—Ç os.environ
func = get_registry().get("openai").create(name="text-embedding-3-large")

# 6. Schema Definition
class ChunkMetadata(LanceModel):
    filename: str | None
    page_numbers: List[int] | None
    title: str | None

class Chunks(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()
    metadata: ChunkMetadata

table = db.create_table("docling", schema=Chunks, mode="overwrite")

# 7. Insert Data
print("üöÄ Indexing with OpenAI...")
processed_chunks = [
    {
        "text": chunk.text,
        "metadata": {
            "filename": chunk.meta.origin.filename if chunk.meta.origin else "doc.pdf",
            "page_numbers": [
                page_no
                for page_no in sorted(
                    set(
                        prov.page_no
                        for item in chunk.meta.doc_items
                        for prov in item.prov
                    )
                )
            ]
            or None,
            "title": chunk.meta.headings[0] if chunk.meta.headings else None,
        },
    }
    for chunk in chunks
]

table.add(processed_chunks)

# 8. Verify
print(f"üéâ Done! Rows indexed: {table.count_rows()}")
print(table.to_pandas().head(2))

[32m[INFO] 2025-11-22 22:19:36,841 [RapidOCR] base.py:22: Using engine_name: torch[0m


‚è≥ Converting PDF...


[32m[INFO] 2025-11-22 22:19:36,916 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-11-22 22:19:36,917 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-11-22 22:19:37,330 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-22 22:19:37,335 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2025-11-22 22:19:37,337 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2025-11-22 22:19:37,460 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-22 22:19:37,543 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr

üî™ Chunking...
‚úÖ Created 21 chunks.
üöÄ Indexing with OpenAI...
üéâ Done! Rows indexed: 21
                                                text  \
0  Christoph Auer Maksym Lysak Ahmed Nassar Miche...   
1  This technical report introduces Docling , an ...   

                                              vector  \
0  [0.005165622, -0.006144963, -0.021063859, 0.02...   
1  [0.0008674973, 0.0135612255, -0.024147633, -0....   

                                            metadata  
0  {'filename': '2408.09869v5.pdf', 'page_numbers...  
1  {'filename': '2408.09869v5.pdf', 'page_numbers...  


In [15]:
import lancedb

# --------------------------------------------------------------
# Connect to the database
# --------------------------------------------------------------

uri = "data/lancedb"
db = lancedb.connect(uri)


# --------------------------------------------------------------
# Load the table
# --------------------------------------------------------------

table = db.open_table("docling")


# --------------------------------------------------------------
# Search the table
# --------------------------------------------------------------

result = table.search(query="–≥–∞–∑–æ–±–µ—Ç–æ–Ω–Ωa", query_type="vector").limit(3)
result.to_pandas()

Unnamed: 0,text,vector,metadata,_distance
0,–ù–∞—Å—Ç–æ—è—â–∞—Ç–∞ –∏–Ω—Å—Ç—Ä—É–∫—Ü–∏—è —Ä–µ–≥–ª–∞–º–µ–Ω—Ç–∏—Ä–∞ –∏–∑–∏—Å–∫–≤–∞–Ω–∏—è—Ç...,"[0.027452853, -0.021154394, -0.0029410352, 0.0...",{'filename': 'BOFI 50014.1 –ü—Ä–∏–≥–æ—Ç–≤—è–Ω–µ –∏ –∑–∞–ø—ä–ª–≤...,0.948265
1,–ü—Ä–∏–≥–æ—Ç–≤—è–Ω–µ—Ç–æ –Ω–∞ –≥–∞–∑–æ–±–µ—Ç–æ–Ω–Ω–∞—Ç–∞ —Å–º–µ—Å –≤–∫–ª—é—á–≤–∞ —Å–ª–µ...,"[0.034642067, -0.034101136, -0.009550798, 0.00...",{'filename': 'BOFI 50014.1 –ü—Ä–∏–≥–æ—Ç–≤—è–Ω–µ –∏ –∑–∞–ø—ä–ª–≤...,0.953884
2,"–ü—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–µ–Ω–∏—è —É—á–∞—Å—Ç—ä–∫, –∫—ä–¥–µ—Ç–æ —Å–µ –∏–∑–ø—ä–ª–Ω—è–≤–∞ –ø...","[0.035472944, -0.0016953981, -0.006564234, 0.0...",{'filename': 'BOFI 50014.1 –ü—Ä–∏–≥–æ—Ç–≤—è–Ω–µ –∏ –∑–∞–ø—ä–ª–≤...,0.98089


In [9]:
!zip -r download.zip data

  adding: data/ (stored 0%)
  adding: data/lancedb/ (stored 0%)
  adding: data/lancedb/docling.lance/ (stored 0%)
  adding: data/lancedb/docling.lance/data/ (stored 0%)
  adding: data/lancedb/docling.lance/data/10110010000100111110001155b97640cda0da2bfc6b1749c6.lance (deflated 25%)
  adding: data/lancedb/docling.lance/_transactions/ (stored 0%)
  adding: data/lancedb/docling.lance/_transactions/0-021bd0ab-4b76-4b9d-8ea6-46e168351cb1.txn (deflated 36%)
  adding: data/lancedb/docling.lance/_transactions/1-43110221-7122-453a-8145-69687af7b38b.txn (deflated 9%)
  adding: data/lancedb/docling.lance/_versions/ (stored 0%)
  adding: data/lancedb/docling.lance/_versions/1.manifest (deflated 36%)
  adding: data/lancedb/docling.lance/_versions/2.manifest (deflated 35%)


In [16]:
import os
import shutil
import time
import glob
from typing import Dict, List, Tuple

import lancedb
import tiktoken
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import OpenAI
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

# --- CONFIGURATION ---
OPENAI_API_KEY = ""
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

INPUT_FOLDER = "INPUT"
DB_PATH = "data/lancedb"
MAX_TOKENS = 8191

# Initialize OpenAI client
client = OpenAI()

# --- TOKENIZER WRAPPER ---
class OpenAITokenizerWrapper(PreTrainedTokenizerBase):
    def __init__(self, model_name: str = "cl100k_base", max_length: int = 8191, **kwargs):
        super().__init__(model_max_length=max_length, **kwargs)
        self.tokenizer = tiktoken.get_encoding(model_name)
        self._vocab_size = self.tokenizer.max_token_value

    def tokenize(self, text: str, **kwargs) -> List[str]:
        return [str(t) for t in self.tokenizer.encode(text)]

    def _tokenize(self, text: str) -> List[str]:
        return self.tokenize(text)

    def _convert_token_to_id(self, token: str) -> int:
        return int(token)

    def _convert_id_to_token(self, index: int) -> str:
        return str(index)

    def __len__(self) -> int:
        return self._vocab_size

    def get_vocab(self) -> Dict[str, int]:
        return dict(enumerate(range(self.vocab_size)))

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    def save_vocabulary(self, *args) -> Tuple[str]:
        return ()

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        return cls()

# --- SCHEMA SETUP ---
func = get_registry().get("openai").create(name="text-embedding-3-large")

class ChunkMetadata(LanceModel):
    filename: str | None
    page_numbers: List[int] | None
    title: str | None

class Chunks(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()
    metadata: ChunkMetadata

# --- HELPER FUNCTIONS ---

def get_or_create_table():
    """–°–≤—ä—Ä–∑–≤–∞ —Å–µ —Å –±–∞–∑–∞—Ç–∞."""
    db = lancedb.connect(DB_PATH)

    if "docling" in db.table_names():
        return db.open_table("docling")
    else:
        print("üÜï –°—ä–∑–¥–∞–≤–∞–Ω–µ –Ω–∞ –Ω–æ–≤–∞ —Ç–∞–±–ª–∏—Ü–∞ 'docling'...")
        return db.create_table("docling", schema=Chunks)

def process_single_file(file_path, table, tokenizer):
    filename = os.path.basename(file_path)
    print(f"\nüìÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ –Ω–∞: {filename}")

    # --- DEDUPLICATION CHECK ---
    # –ü—Ä–æ–≤–µ—Ä—è–≤–∞–º–µ –¥–∞–ª–∏ —Ñ–∞–π–ª—ä—Ç –≤–µ—á–µ —Å—ä—â–µ—Å—Ç–≤—É–≤–∞
    existing_files = table.search().where(f"metadata.filename = '{filename}'").limit(1).to_pandas()

    if not existing_files.empty:
        print(f"   üîÑ –§–∞–π–ª—ä—Ç '{filename}' –≤–µ—á–µ —Å—ä—â–µ—Å—Ç–≤—É–≤–∞. –ò–∑—Ç—Ä–∏–≤–∞–Ω–µ –Ω–∞ —Å—Ç–∞—Ä–∏—Ç–µ –∑–∞–ø–∏—Å–∏...")
        table.delete(f"metadata.filename = '{filename}'")
        print("   üóëÔ∏è –°—Ç–∞—Ä–∏—Ç–µ –∑–∞–ø–∏—Å–∏ —Å–∞ –∏–∑—Ç—Ä–∏—Ç–∏. –î–æ–±–∞–≤—è–Ω–µ –Ω–∞ –Ω–æ–≤–∞—Ç–∞ –≤–µ—Ä—Å–∏—è...")
    # ---------------------------

    try:
        # 1. Docling Convert
        converter = DocumentConverter()
        result = converter.convert(file_path)

        # 2. Chunking
        chunker = HybridChunker(
            tokenizer=tokenizer,
            max_tokens=MAX_TOKENS,
            merge_peers=True,
        )
        chunks = list(chunker.chunk(dl_doc=result.document))
        print(f"   üß© –ì–µ–Ω–µ—Ä–∏—Ä–∞–Ω–∏ {len(chunks)} –ø–∞—Ä—á–µ—Ç–∞.")

        if not chunks:
            print("   ‚ö†Ô∏è –§–∞–π–ª—ä—Ç –µ –ø—Ä–∞–∑–µ–Ω –∏–ª–∏ –Ω–µ –º–æ–∂–µ –¥–∞ —Å–µ –ø—Ä–æ—á–µ—Ç–µ.")
            return False

        # 3. Prepare Data
        processed_chunks = [
            {
                "text": chunk.text,
                "metadata": {
                    "filename": filename,
                    "page_numbers": [
                        page_no for item in chunk.meta.doc_items for item_prov in item.prov for page_no in [item_prov.page_no]
                    ] or None,
                    "title": chunk.meta.headings[0] if chunk.meta.headings else None,
                },
            }
            for chunk in chunks
        ]

        # 4. Insert into LanceDB
        print("   üöÄ Indexing to LanceDB...")
        table.add(processed_chunks)
        print("   ‚úÖ –£—Å–ø–µ—à–Ω–æ –∏–Ω–¥–µ–∫—Å–∏—Ä–∞–Ω!")
        return True

    except Exception as e:
        print(f"   ‚ùå –ì–†–ï–®–ö–ê –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–∞—Ç–∞ –Ω–∞ —Ñ–∞–π–ª–∞: {e}")
        return False

# --- MAIN LOOP ---

def main():
    # 1. Setup
    if not os.path.exists(INPUT_FOLDER):
        os.makedirs(INPUT_FOLDER)
        print(f"üìÅ –°—ä–∑–¥–∞–¥–µ–Ω–∞ –ø–∞–ø–∫–∞ '{INPUT_FOLDER}'. –°–ª–∞–≥–∞–π PDF —Ñ–∞–π–ª–æ–≤–µ –≤—ä—Ç—Ä–µ!")

    tokenizer = OpenAITokenizerWrapper(model_name="cl100k_base", max_length=8191)
    table = get_or_create_table()

    print(f"\nüëÄ –ú–æ–Ω–∏—Ç–æ—Ä–∏–Ω–≥ –Ω–∞ –ø–∞–ø–∫–∞ '{INPUT_FOLDER}'...")
    print("–ù–∞—Ç–∏—Å–Ω–∏ Ctrl+C –∑–∞ —Å–ø–∏—Ä–∞–Ω–µ.\n")

    try:
        while True:
            pdf_files = glob.glob(os.path.join(INPUT_FOLDER, "*.pdf"))

            if pdf_files:
                print(f"üîé –ù–∞–º–µ—Ä–µ–Ω–∏ {len(pdf_files)} —Ñ–∞–π–ª–∞ –∑–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∞!")

                for file_path in pdf_files:
                    success = process_single_file(file_path, table, tokenizer)

                    if success:
                        try:
                            os.remove(file_path)
                            print(f"   üóëÔ∏è –ò–∑—Ç—Ä–∏—Ç –æ—Ç Input: {os.path.basename(file_path)}")
                        except OSError as e:
                            print(f"   ‚ö†Ô∏è –ù–µ –º–æ–≥–∞ –¥–∞ –∏–∑—Ç—Ä–∏—è —Ñ–∞–π–ª–∞: {e}")
                    else:
                        new_name = file_path + ".failed"
                        os.rename(file_path, new_name)
                        print(f"   ‚ö†Ô∏è –ü—Ä–µ–∏–º–µ–Ω—É–≤–∞–Ω –Ω–∞ {os.path.basename(new_name)}")

            time.sleep(10)

    except KeyboardInterrupt:
        print("\nüõë –°–ø–∏—Ä–∞–Ω–µ –Ω–∞ —Å–∫—Ä–∏–ø—Ç–∞. –î–æ–≤–∏–∂–¥–∞–Ω–µ!")

if __name__ == "__main__":
    main()


üëÄ –ú–æ–Ω–∏—Ç–æ—Ä–∏–Ω–≥ –Ω–∞ –ø–∞–ø–∫–∞ 'INPUT'...
–ù–∞—Ç–∏—Å–Ω–∏ Ctrl+C –∑–∞ —Å–ø–∏—Ä–∞–Ω–µ.


üõë –°–ø–∏—Ä–∞–Ω–µ –Ω–∞ —Å–∫—Ä–∏–ø—Ç–∞. –î–æ–≤–∏–∂–¥–∞–Ω–µ!


In [18]:
import os
import shutil
import time
import glob
from typing import Dict, List, Tuple

import lancedb
import tiktoken
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import OpenAI
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

# --- LOGGING & WARNINGS SUPPRESSION ---
import logging
import warnings
# –°–∫—Ä–∏–≤–∞–º–µ —Å–ø–∞–º–∞ –æ—Ç Docling –∏ RapidOCR
logging.getLogger("docling").setLevel(logging.ERROR)
logging.getLogger("docling.backend.msword_backend").setLevel(logging.ERROR) # –ó–∞ DOCX –≥—Ä–µ—à–∫–∏—Ç–µ
warnings.filterwarnings("ignore")

# --- CONFIGURATION ---
OPENAI_API_KEY = ""
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

INPUT_FOLDER = "INPUT"
DB_PATH = "data/lancedb"
MAX_TOKENS = 8191

# Initialize OpenAI client
client = OpenAI()

# --- TOKENIZER WRAPPER ---
class OpenAITokenizerWrapper(PreTrainedTokenizerBase):
    def __init__(self, model_name: str = "cl100k_base", max_length: int = 8191, **kwargs):
        super().__init__(model_max_length=max_length, **kwargs)
        self.tokenizer = tiktoken.get_encoding(model_name)
        self._vocab_size = self.tokenizer.max_token_value

    def tokenize(self, text: str, **kwargs) -> List[str]:
        return [str(t) for t in self.tokenizer.encode(text)]

    def _tokenize(self, text: str) -> List[str]:
        return self.tokenize(text)

    def _convert_token_to_id(self, token: str) -> int:
        return int(token)

    def _convert_id_to_token(self, index: int) -> str:
        return str(index)

    def __len__(self) -> int:
        return self._vocab_size

    def get_vocab(self) -> Dict[str, int]:
        return dict(enumerate(range(self.vocab_size)))

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    def save_vocabulary(self, *args) -> Tuple[str]:
        return ()

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        return cls()

# --- SCHEMA SETUP ---
func = get_registry().get("openai").create(name="text-embedding-3-large")

class ChunkMetadata(LanceModel):
    filename: str | None
    # FIX: –ü—Ä–µ–º–∞—Ö–≤–∞–º–µ "| None". LanceDB –ø—Ä–µ–¥–ø–æ—á–∏—Ç–∞ –≤–∏–Ω–∞–≥–∏ List, –¥–æ—Ä–∏ –¥–∞ –µ –ø—Ä–∞–∑–µ–Ω.
    # –¢–æ–≤–∞ –æ–ø—Ä–∞–≤—è –≥—Ä–µ—à–∫–∞—Ç–∞ "pyarrow.lib.DataType object has no attribute value_field"
    page_numbers: List[int]
    title: str | None

class Chunks(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()
    metadata: ChunkMetadata

# --- HELPER FUNCTIONS ---

def get_or_create_table():
    """–°–≤—ä—Ä–∑–≤–∞ —Å–µ —Å –±–∞–∑–∞—Ç–∞."""
    db = lancedb.connect(DB_PATH)

    if "docling" in db.table_names():
        return db.open_table("docling")
    else:
        print("üÜï –°—ä–∑–¥–∞–≤–∞–Ω–µ –Ω–∞ –Ω–æ–≤–∞ —Ç–∞–±–ª–∏—Ü–∞ 'docling'...")
        return db.create_table("docling", schema=Chunks)

def process_single_file(file_path, table, tokenizer):
    filename = os.path.basename(file_path)
    print(f"\nüìÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ –Ω–∞: {filename}")

    # --- DEDUPLICATION CHECK ---
    existing_files = table.search().where(f"metadata.filename = '{filename}'").limit(1).to_pandas()

    if not existing_files.empty:
        print(f"   üîÑ –§–∞–π–ª—ä—Ç '{filename}' –≤–µ—á–µ —Å—ä—â–µ—Å—Ç–≤—É–≤–∞. –ò–∑—Ç—Ä–∏–≤–∞–Ω–µ –Ω–∞ —Å—Ç–∞—Ä–∏—Ç–µ –∑–∞–ø–∏—Å–∏...")
        table.delete(f"metadata.filename = '{filename}'")
        print("   üóëÔ∏è –°—Ç–∞—Ä–∏—Ç–µ –∑–∞–ø–∏—Å–∏ —Å–∞ –∏–∑—Ç—Ä–∏—Ç–∏. –î–æ–±–∞–≤—è–Ω–µ –Ω–∞ –Ω–æ–≤–∞—Ç–∞ –≤–µ—Ä—Å–∏—è...")
    # ---------------------------

    try:
        # 1. Docling Convert
        converter = DocumentConverter()
        result = converter.convert(file_path)

        # 2. Chunking
        chunker = HybridChunker(
            tokenizer=tokenizer,
            max_tokens=MAX_TOKENS,
            merge_peers=True,
        )
        chunks = list(chunker.chunk(dl_doc=result.document))
        print(f"   üß© –ì–µ–Ω–µ—Ä–∏—Ä–∞–Ω–∏ {len(chunks)} –ø–∞—Ä—á–µ—Ç–∞.")

        if not chunks:
            print("   ‚ö†Ô∏è –§–∞–π–ª—ä—Ç –µ –ø—Ä–∞–∑–µ–Ω –∏–ª–∏ –Ω–µ –º–æ–∂–µ –¥–∞ —Å–µ –ø—Ä–æ—á–µ—Ç–µ.")
            return False

        # 3. Prepare Data
        processed_chunks = [
            {
                "text": chunk.text,
                "metadata": {
                    "filename": filename,
                    "page_numbers": [
                        page_no for item in chunk.meta.doc_items for item_prov in item.prov for page_no in [item_prov.page_no]
                    ], # FIX: –ú–∞—Ö–Ω–∞—Ö–º–µ "or None". –°–µ–≥–∞ –≤—Ä—ä—â–∞ [] –∞–∫–æ –µ –ø—Ä–∞–∑–Ω–æ.
                    "title": chunk.meta.headings[0] if chunk.meta.headings else "", # FIX: "" –≤–º–µ—Å—Ç–æ None –∑–∞ –ø–æ-—Å–∏–≥—É—Ä–Ω–æ
                },
            }
            for chunk in chunks
        ]

        # 4. Insert into LanceDB
        print("   üöÄ Indexing to LanceDB...")
        table.add(processed_chunks)
        print("   ‚úÖ –£—Å–ø–µ—à–Ω–æ –∏–Ω–¥–µ–∫—Å–∏—Ä–∞–Ω!")
        return True

    except Exception as e:
        print(f"   ‚ùå –ì–†–ï–®–ö–ê –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–∞—Ç–∞ –Ω–∞ —Ñ–∞–π–ª–∞: {e}")
        return False

# --- MAIN LOOP ---

def main():
    if not os.path.exists(INPUT_FOLDER):
        os.makedirs(INPUT_FOLDER)
        print(f"üìÅ –°—ä–∑–¥–∞–¥–µ–Ω–∞ –ø–∞–ø–∫–∞ '{INPUT_FOLDER}'. –°–ª–∞–≥–∞–π —Ñ–∞–π–ª–æ–≤–µ –≤—ä—Ç—Ä–µ (PDF, DOCX, PPTX, XLSX)!")

    tokenizer = OpenAITokenizerWrapper(model_name="cl100k_base", max_length=8191)
    table = get_or_create_table()

    print(f"\nüëÄ –ú–æ–Ω–∏—Ç–æ—Ä–∏–Ω–≥ –Ω–∞ –ø–∞–ø–∫–∞ '{INPUT_FOLDER}'...")
    print("–ù–∞—Ç–∏—Å–Ω–∏ Ctrl+C –∑–∞ —Å–ø–∏—Ä–∞–Ω–µ.\n")

    # –°–ø–∏—Å—ä–∫ —Å –ø–æ–¥–¥—ä—Ä–∂–∞–Ω–∏ —Ñ–æ—Ä–º–∞—Ç–∏
    extensions = ["*.pdf", "*.docx", "*.pptx", "*.html", "*.md", "*.xlsx"]

    try:
        while True:
            files_to_process = []
            for ext in extensions:
                files_to_process.extend(glob.glob(os.path.join(INPUT_FOLDER, ext)))

            if files_to_process:
                print(f"üîé –ù–∞–º–µ—Ä–µ–Ω–∏ {len(files_to_process)} —Ñ–∞–π–ª–∞ –∑–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∞!")

                for file_path in files_to_process:
                    success = process_single_file(file_path, table, tokenizer)

                    if success:
                        try:
                            os.remove(file_path)
                            print(f"   üóëÔ∏è –ò–∑—Ç—Ä–∏—Ç –æ—Ç Input: {os.path.basename(file_path)}")
                        except OSError as e:
                            print(f"   ‚ö†Ô∏è –ù–µ –º–æ–≥–∞ –¥–∞ –∏–∑—Ç—Ä–∏—è —Ñ–∞–π–ª–∞: {e}")
                    else:
                        new_name = file_path + ".failed"
                        try:
                            os.rename(file_path, new_name)
                            print(f"   ‚ö†Ô∏è –ü—Ä–µ–∏–º–µ–Ω—É–≤–∞–Ω –Ω–∞ {os.path.basename(new_name)}")
                        except:
                            pass # –ê–∫–æ –∏ –ø—Ä–µ–∏–º–µ–Ω—É–≤–∞–Ω–µ—Ç–æ –Ω–µ —Å—Ç–∞–Ω–µ, –ø—Ä–æ—Å—Ç–æ –ø—Ä–æ–¥—ä–ª–∂–∞–≤–∞–º–µ

            time.sleep(10)

    except KeyboardInterrupt:
        print("\nüõë –°–ø–∏—Ä–∞–Ω–µ –Ω–∞ —Å–∫—Ä–∏–ø—Ç–∞. –î–æ–≤–∏–∂–¥–∞–Ω–µ!")

if __name__ == "__main__":
    main()

üìÅ –°—ä–∑–¥–∞–¥–µ–Ω–∞ –ø–∞–ø–∫–∞ 'INPUT'. –°–ª–∞–≥–∞–π —Ñ–∞–π–ª–æ–≤–µ –≤—ä—Ç—Ä–µ (PDF, DOCX, PPTX, XLSX)!

üëÄ –ú–æ–Ω–∏—Ç–æ—Ä–∏–Ω–≥ –Ω–∞ –ø–∞–ø–∫–∞ 'INPUT'...
–ù–∞—Ç–∏—Å–Ω–∏ Ctrl+C –∑–∞ —Å–ø–∏—Ä–∞–Ω–µ.



[32m[INFO] 2025-11-22 23:00:27,930 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-22 23:00:27,975 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-11-22 23:00:27,976 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m


üîé –ù–∞–º–µ—Ä–µ–Ω–∏ 1 —Ñ–∞–π–ª–∞ –∑–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∞!

üìÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ –Ω–∞: BOFI 50014.1 –ü—Ä–∏–≥–æ—Ç–≤—è–Ω–µ –∏ –∑–∞–ø—ä–ª–≤–∞–Ω–µ –Ω–∞ –∏–∑–¥–µ–ª–∏—è—Ç–∞ —Å –≥–∞–∑–æ–±–µ—Ç–æ–Ω–Ωa —Å–º–µ—Å(3).pdf
   üîÑ –§–∞–π–ª—ä—Ç 'BOFI 50014.1 –ü—Ä–∏–≥–æ—Ç–≤—è–Ω–µ –∏ –∑–∞–ø—ä–ª–≤–∞–Ω–µ –Ω–∞ –∏–∑–¥–µ–ª–∏—è—Ç–∞ —Å –≥–∞–∑–æ–±–µ—Ç–æ–Ω–Ωa —Å–º–µ—Å(3).pdf' –≤–µ—á–µ —Å—ä—â–µ—Å—Ç–≤—É–≤–∞. –ò–∑—Ç—Ä–∏–≤–∞–Ω–µ –Ω–∞ —Å—Ç–∞—Ä–∏—Ç–µ –∑–∞–ø–∏—Å–∏...
   üóëÔ∏è –°—Ç–∞—Ä–∏—Ç–µ –∑–∞–ø–∏—Å–∏ —Å–∞ –∏–∑—Ç—Ä–∏—Ç–∏. –î–æ–±–∞–≤—è–Ω–µ –Ω–∞ –Ω–æ–≤–∞—Ç–∞ –≤–µ—Ä—Å–∏—è...


[32m[INFO] 2025-11-22 23:00:28,219 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-22 23:00:28,223 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2025-11-22 23:00:28,225 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2025-11-22 23:00:28,336 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-22 23:00:28,416 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.pth[0m
[32m[INFO] 2025-11-22 23:00:28,417 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.pth[0m


   üß© –ì–µ–Ω–µ—Ä–∏—Ä–∞–Ω–∏ 32 –ø–∞—Ä—á–µ—Ç–∞.
   üöÄ Indexing to LanceDB...
   ‚úÖ –£—Å–ø–µ—à–Ω–æ –∏–Ω–¥–µ–∫—Å–∏—Ä–∞–Ω!
   üóëÔ∏è –ò–∑—Ç—Ä–∏—Ç –æ—Ç Input: BOFI 50014.1 –ü—Ä–∏–≥–æ—Ç–≤—è–Ω–µ –∏ –∑–∞–ø—ä–ª–≤–∞–Ω–µ –Ω–∞ –∏–∑–¥–µ–ª–∏—è—Ç–∞ —Å –≥–∞–∑–æ–±–µ—Ç–æ–Ω–Ωa —Å–º–µ—Å(3).pdf
üîé –ù–∞–º–µ—Ä–µ–Ω–∏ 1 —Ñ–∞–π–ª–∞ –∑–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∞!

üìÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ –Ω–∞: MOST  –°—Ç–∞–Ω–¥–∞—Ä—Ç —Ä–∞—Å—á–µ—Ç–∞ –Ω–æ—Ä–º –ø–æ—Ç—Ä–µ–±–ª–µ–Ω–∏—è –º–∞—Ç–µ—Ä–∏–∞–ª–æ–≤ (–Ω–µ—É—Ç–≤).docx
   üß© –ì–µ–Ω–µ—Ä–∏—Ä–∞–Ω–∏ 11 –ø–∞—Ä—á–µ—Ç–∞.
   üöÄ Indexing to LanceDB...
   ‚úÖ –£—Å–ø–µ—à–Ω–æ –∏–Ω–¥–µ–∫—Å–∏—Ä–∞–Ω!
   üóëÔ∏è –ò–∑—Ç—Ä–∏—Ç –æ—Ç Input: MOST  –°—Ç–∞–Ω–¥–∞—Ä—Ç —Ä–∞—Å—á–µ—Ç–∞ –Ω–æ—Ä–º –ø–æ—Ç—Ä–µ–±–ª–µ–Ω–∏—è –º–∞—Ç–µ—Ä–∏–∞–ª–æ–≤ (–Ω–µ—É—Ç–≤).docx

üõë –°–ø–∏—Ä–∞–Ω–µ –Ω–∞ —Å–∫—Ä–∏–ø—Ç–∞. –î–æ–≤–∏–∂–¥–∞–Ω–µ!
