# Docling Pipelines: All use-cases + LLM Enrichment + Toggles

This notebook lets you:
- Run all scripts in `scripts/` (with short descriptions).
- Toggle Formula Understanding and Picture Description before conversion.
- Select the LLM provider via the `PROVIDER` environment variable (`gemini` or `gpt`) and enrich chunks with structured outputs.

Inputs go in `source/`. Outputs are written to `output/`.

In [None]:
# Install dependencies (safe to re-run)
!pip install -q -r requirements.txt

In [None]:
# Load environment and configure provider
import os
from pathlib import Path
from dotenv import load_dotenv

# Load .env if present. You can copy env.example to .env and fill keys.
if Path('.env').exists():
    load_dotenv('.env')
else:
    # No ./.env in the working directory: fall back to python-dotenv's default
    # lookup, which may still pick up a .env from a parent directory.
    # Variables already set in the process environment are not overridden.
    load_dotenv()

# Provider: 'gemini' or 'gpt' (from env PROVIDER).
# A variable that is set but blank (e.g. `PROVIDER=` left unfilled in .env)
# falls back to the default instead of producing an empty provider/model name.
PROVIDER = os.getenv('PROVIDER', '').strip().lower() or 'gemini'
OPENAI_MODEL = os.getenv('OPENAI_MODEL', '').strip() or 'gpt-4o-mini'
GEMINI_MODEL = os.getenv('GEMINI_MODEL', '').strip() or 'gemini-2.5-flash'

print('Provider:', PROVIDER)
print('OPENAI_MODEL:', OPENAI_MODEL)
print('GEMINI_MODEL:', GEMINI_MODEL)

In [None]:
# Paths: input and output folders live next to the current working directory.
project_root = Path().resolve()
src_dir, out_dir = (project_root / folder for folder in ('source', 'output'))
out_dir.mkdir(parents=True, exist_ok=True)  # output/ is created on demand

for label, location in (
    ('Project root:', project_root),
    ('Source dir:', src_dir),
    ('Output dir:', out_dir),
):
    print(label, location)
print('Source files:', list(src_dir.glob('*')))

## 1) Scripts quick-run (what each does)

- `general_convert.py`: basic PDF/URL conversion to Markdown/JSON.

In [None]:
# Executes the script via IPython's %run magic; the relative path requires the
# kernel's working directory to contain scripts/.
%run scripts/general_convert.py

- `vlm_image_understanding.py`: VLM (SmolDocling) for image-heavy PDFs.

In [None]:
# Executes the script via IPython's %run magic; the relative path requires the
# kernel's working directory to contain scripts/.
%run scripts/vlm_image_understanding.py

- `maths_processing.py`: converts and extracts math snippets heuristically.

In [None]:
# Executes the script via IPython's %run magic; the relative path requires the
# kernel's working directory to contain scripts/.
%run scripts/maths_processing.py

- `contextual_hybrid_chunking.py`: HybridChunker raw + contextualized chunks.

In [None]:
# Executes the script via IPython's %run magic; the relative path requires the
# kernel's working directory to contain scripts/.
%run scripts/contextual_hybrid_chunking.py

- `enrich_formula_understanding.py`: Formula Understanding enrichment (LaTeX/MathML).

In [None]:
# Executes the script via IPython's %run magic; the relative path requires the
# kernel's working directory to contain scripts/.
%run scripts/enrich_formula_understanding.py

- `enrich_picture_description.py`: Picture Description enrichment (VLM captions).

In [None]:
# Executes the script via IPython's %run magic; the relative path requires the
# kernel's working directory to contain scripts/.
%run scripts/enrich_picture_description.py

## 2) One-pass combination: picture description + formula enrichment + contextual chunking + LLM enrichment options
- Toggle Formula/Picture in the cell below.
- Select provider via PROVIDER in .env (gemini/gpt).
- Produces TXT/JSONL of enriched chunks (structured).

In [None]:
# Feature switches for the combined run (edit the values here, or override via env if you prefer)
DO_FORMULA = True   # Formula Understanding; False turns it off
DO_PICTURE = True   # Picture Description; False turns it off
DO_LLM = True       # LLM enrichment; False turns it off

print(f'DO_FORMULA: {DO_FORMULA} | DO_PICTURE: {DO_PICTURE} | DO_LLM: {DO_LLM}')

from datetime import datetime
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from docling.chunking import HybridChunker

# Build the Docling converter; the toggles decide which PDF enrichments are enabled.
pdf_opts = PdfPipelineOptions(
    do_formula_enrichment=bool(DO_FORMULA),
    do_picture_description=bool(DO_PICTURE),
)
# Optional: a picture description preset can be plugged in at this point, e.g.
# from docling.datamodel.pipeline_options import smolvlm_picture_description
# pdf_opts.picture_description_options = smolvlm_picture_description

pdf_format = PdfFormatOption(pipeline_options=pdf_opts)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

# Pick a PDF: take the first one in sorted order so the choice is reproducible
# (glob() yields entries in arbitrary, filesystem-dependent order).
pdf_candidates = sorted(src_dir.glob('*.pdf'))
if not pdf_candidates:
    # Explicit exception rather than `assert`, which is skipped under `python -O`.
    raise FileNotFoundError(f'No PDF found in {src_dir}. Put a PDF into source/')
pdf_path = pdf_candidates[0]
print('Using PDF:', pdf_path)

# Convert the selected PDF with the converter configured above.
dl_doc = converter.convert(str(pdf_path)).document

# Chunk: materialise the chunker's output so it can be enumerated later.
chunker = HybridChunker()
chunks = list(chunker.chunk(dl_doc=dl_doc))

# LLM enrichment (conditional on DO_LLM): build the client for the configured provider.
if DO_LLM:
    # Reject unknown providers up front, before importing any client module.
    if PROVIDER not in ('gpt', 'gemini'):
        raise ValueError(f'Unsupported PROVIDER: {PROVIDER}')

    if PROVIDER == 'gemini':
        from clients.gemini_client import GeminiClient, EnrichedChunk
        llm = GeminiClient(model=GEMINI_MODEL)
    else:  # PROVIDER == 'gpt'
        from clients.openai_client import OpenAIClient, EnrichedChunk
        llm = OpenAIClient(model=OPENAI_MODEL)

# Write outputs: a human-readable TXT file and a JSONL file (one record per chunk).
import json


def _render_txt_block(index, sections):
    """Render one chunk for the TXT file.

    Args:
        index: Position of the chunk in the chunk list.
        sections: Iterable of ``(name, body)`` string pairs, in output order.

    Returns:
        A ``=== index ===`` header, one ``-- name --`` section per pair, and a
        trailing blank line separating it from the next chunk.
    """
    parts = [f'=== {index} ===\n']
    parts.extend(f'-- {name} --\n{body}\n' for name, body in sections)
    parts.append('\n')
    return ''.join(parts)


# File names carry a timestamp so repeated runs do not overwrite each other.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
base = f'{pdf_path.stem}__combo_llm__{PROVIDER}__{ts}' if DO_LLM else f'{pdf_path.stem}__combo_basic__{ts}'
txt_path = out_dir / f'{base}.txt'
jsonl_path = out_dir / f'{base}.jsonl'

with txt_path.open('w', encoding='utf-8') as f_txt, jsonl_path.open('w', encoding='utf-8') as f_jsonl:
    for i, ch in enumerate(chunks):
        raw = ch.text or ''
        structural = chunker.contextualize(chunk=ch)

        if DO_LLM:
            enriched = llm.enrich_chunk(raw, context=structural)  # Pydantic validated
            # The TXT rendering treats every missing (None) field as empty, the
            # same way `title` was already handled; the JSONL keeps raw values.
            # NOTE(review): key_points is assumed to be a list of str - confirm
            # against the EnrichedChunk schema.
            sections = [
                ('title', enriched.title or ''),
                ('summary', enriched.summary or ''),
                ('key_points', '\n'.join('- ' + kp for kp in (enriched.key_points or []))),
                ('enriched_text', enriched.enriched_text or ''),
            ]
            fields = {
                'title': enriched.title,
                'summary': enriched.summary,
                'key_points': enriched.key_points,
                'enriched_text': enriched.enriched_text,
            }
        else:
            # Basic output without LLM enrichment
            sections = [
                ('raw_text', raw),
                ('structural_context', structural),
            ]
            fields = {
                'raw_text': raw,
                'structural_context': structural,
            }

        f_txt.write(_render_txt_block(i, sections))

        # Key order matches the per-branch layout: index, payload, then path/id
        # (both are None when the chunk object has no such attribute).
        record = {
            'index': i,
            **fields,
            'path': getattr(ch, 'path', None),
            'id': getattr(ch, 'id', None),
        }
        f_jsonl.write(json.dumps(record, ensure_ascii=False) + '\n')

print('Wrote:', txt_path)
print('Wrote:', jsonl_path)

## 3) Inspect outputs

In [None]:
# List everything in output/, sorted by path, one file name per line.
output_entries = sorted(out_dir.glob('*'))
for entry in output_entries:
    print(entry.name)