In [69]:
import os
import fitz  # PyMuPDF
import ocrmypdf
import re
from pathlib import Path
import pymupdf4llm
from llama_index.core.schema import Document
import logging
import weaviate

In [70]:
TEMP_DIR = Path("temp_ocr")
MARKDOWN_DIR = Path("output_chunk")
TEMP_DIR.mkdir(exist_ok=True)
MARKDOWN_DIR.mkdir(exist_ok=True)

# Set up logging
logging.basicConfig(filename='pipeline.log', level=logging.INFO, 
                    format='%(asctime)s:%(levelname)s:%(message)s')


In [71]:
def is_scanned_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        return all(not page.get_text().strip() for page in doc)
    except Exception as e:
        logging.error(f"Error checking if PDF is scanned: {e}")
        return False

In [72]:
def run_ocr(input_path, output_path):
    try:
        logging.info("Running OCR on scanned PDF")
        ocrmypdf.ocr(
            input_file=input_path,
            output_file=output_path,
            rotate_pages=True,
            deskew=True,
            force_ocr=True,
            skip_text=True
        )
        logging.info(f"OCR complete: {output_path}")
    except Exception as e:
        logging.error(f"OCR failed: {e}")
        raise

In [73]:
def remove_tables_from_markdown(md_text):
    try:
        # Remove sections that start with 'tables:' and continue until a double newline
        cleaned = re.sub(r'(?s)^tables:.*?\n\n', '', md_text)
        return cleaned
    except Exception as e:
        logging.error(f"Failed to clean tables from markdown: {e}")
        return md_text

In [74]:
def extract_chunks_with_llamareader(pdf_path, chunk_size=1024, chunk_overlap=128):
    try:
        reader = pymupdf4llm.LlamaMarkdownReader(
            margins=(0, 30, 0, 20),
            max_levels=3,
            body_limit=11,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        docs = reader.load_data(pdf_path)
        llama_docs = [Document(text=remove_tables_from_markdown(doc.text), metadata=doc.extra_info) for doc in docs]

        for i, doc in enumerate(llama_docs):
            out_path = MARKDOWN_DIR / f"{Path(pdf_path).stem}_chunk{i}.md"
            out_path.write_text(doc.text, encoding="utf-8")
            logging.info(f"Chunk {i} saved to {out_path}, headers={doc.metadata.get('header_path')}")

        return llama_docs
    except Exception as e:
        logging.error(f"Failed to extract chunks with LlamaMarkdownReader: {e}")
        return []

In [75]:
def process_pdf_pipeline(pdf_path):
    filename = Path(pdf_path).stem
    ocr_output = TEMP_DIR / f"{filename}_ocr.pdf"

    try:
        if is_scanned_pdf(pdf_path):
            logging.info("Scanned PDF detected")
            run_ocr(pdf_path, ocr_output)
            used_pdf = ocr_output
        else:
            logging.info("Born-digital PDF detected")
            used_pdf = Path(pdf_path)

        chunks = extract_chunks_with_llamareader(used_pdf)
        return chunks
    except Exception as e:
        logging.error(f"Failed to process PDF pipeline: {e}")
        return []


In [76]:
test_pdf = "pdfs/Mbogo_et_al_1996_Med_Vet_Ento.pdf"
results = process_pdf_pipeline(test_pdf)
print(f"✅ Processed {len(results)} chunks from: {test_pdf}")


✅ Processed 0 chunks from: pdfs/Mbogo_et_al_1996_Med_Vet_Ento.pdf
