1. Imports and Configs

In [None]:
import os
from pathlib import Path
import io
import json
import numpy as np
from typing import List, Dict, Any, Tuple


# --- On-disk layout ----------------------------------------------------------
# Every artefact path is derived from a single root folder.
DATA_DIR = Path("data")
PAGE_IMAGES_DIR = DATA_DIR / "page_images"
FAISS_INDEX_PATH = DATA_DIR / "faiss_index.bin"
META_PATH = DATA_DIR / "meta.json"

# The root is created first because the nested mkdir is called without
# parents=True and would fail if "data" did not exist yet.
DATA_DIR.mkdir(exist_ok=True)
PAGE_IMAGES_DIR.mkdir(exist_ok=True)

# --- Tunables ----------------------------------------------------------------
IMAGE_DPI = 300    # default rasterisation resolution for pdf_to_page_images
TOP_K_PAGES = 5    # presumably the number of pages to retrieve per query - confirm against callers
BATCH_SIZE = 16    # NOTE(review): not referenced by any visible cell - confirm intended use

2. Convert PDF → page images

In [None]:
from pdf2image import convert_from_path
from PIL import Image

def pdf_to_page_images(pdf_path: str, out_dir: Path, dpi: int = IMAGE_DPI) -> List[Path]:
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Location of the source PDF.
        out_dir: Folder that receives the PNG files. A plain string is accepted
            as well as a ``Path``; the folder (and any missing parents) is
            created when absent.
        dpi: Resolution passed to ``convert_from_path``.

    Returns:
        The written files in page order, named
        ``<pdf stem>_page_<4-digit page number>.png`` with pages counted from 1.
    """
    # Coerce so callers may pass "pages/" instead of Path("pages/"); the
    # original crashed on str because str has no .mkdir().
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    stem = Path(pdf_path).stem
    saved: List[Path] = []
    for page_no, img in enumerate(convert_from_path(pdf_path, dpi=dpi), start=1):
        target = out_dir / f"{stem}_page_{page_no:04d}.png"
        img.save(target, format="PNG")
        # The bitmap is not needed once it is on disk; release it early.
        img.close()
        saved.append(target)
    return saved


3. Embed page images with Jina

In [None]:
import base64
import requests

JINA_API_KEY = os.getenv("JINA_API_KEY")
# Jina's public embedding endpoint is ".../v1/embeddings".
JINA_EMBED_URL = "https://api.jina.ai/v1/embeddings"

def embed_images_with_jina(image_paths: List[Path], model: str = "jina-embeddings-v3") -> np.ndarray:
    """Embed image files with the Jina Embeddings API, one request per image.

    Args:
        image_paths: Image files to embed; each file is read and sent as a
            base64 string.
        model: Jina model identifier sent with every request.
            NOTE(review): the default looks like a text-only model; image
            input presumably needs a multimodal one (e.g. "jina-clip-v2"), and
            queries must be embedded with the same model as the images for
            the similarity scores to be meaningful - confirm before relying on
            the default.

    Returns:
        A float32 array with one row per input image, in input order.

    Raises:
        RuntimeError: No API key is available.
        ValueError: ``image_paths`` is empty.
        requests.HTTPError: The API answered with an error status.
    """
    # Fall back to the live environment so a key exported after this cell was
    # first executed is still picked up.
    api_key = JINA_API_KEY or os.getenv("JINA_API_KEY")
    if not api_key:
        raise RuntimeError("No JINA_API_KEY in env")

    image_paths = list(image_paths)
    if not image_paths:
        # np.vstack([]) would raise a far less helpful ValueError further down.
        raise ValueError("image_paths is empty; nothing to embed")

    headers = {"Authorization": f"Bearer {api_key}"}
    vectors = []
    for path in image_paths:
        with open(path, "rb") as fh:
            b64 = base64.b64encode(fh.read()).decode("utf-8")
        # Images are passed as {"image": <base64>} items of the "input" list.
        payload = {"model": model, "input": [{"image": b64}]}
        resp = requests.post(JINA_EMBED_URL, json=payload, headers=headers, timeout=60)
        resp.raise_for_status()
        vectors.append(np.asarray(resp.json()["data"][0]["embedding"], dtype=np.float32))
    return np.vstack(vectors)

4. Store embeddings in FAISS

In [None]:
import faiss, json

def build_faiss_index(embeddings, meta, index_path="faiss.idx", meta_path="meta.json"):
    """Build an inner-product FAISS index and persist it next to its metadata.

    The vectors are L2-normalised before being added, so inner-product search
    on the index ranks by cosine similarity.

    Args:
        embeddings: 2-D array-like, one row per item.
        meta: One JSON-serialisable record per row of ``embeddings``, in the
            same order. Values JSON cannot encode natively (e.g. ``Path``)
            are stored via ``str()``.
        index_path: Destination of the FAISS index file (str or Path).
        meta_path: Destination of the JSON metadata file (str or Path).

    Raises:
        ValueError: ``embeddings`` is not a non-empty 2-D array, or its row
            count differs from ``len(meta)``.
    """
    # Work on a private float32, C-contiguous copy: normalize_L2 rewrites its
    # argument in place, which would otherwise silently alter the caller's array.
    vectors = np.array(embeddings, dtype=np.float32, order="C")
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty 2-D array")
    if len(meta) != vectors.shape[0]:
        raise ValueError(
            f"meta has {len(meta)} entries but embeddings has {vectors.shape[0]} rows"
        )

    index = faiss.IndexFlatIP(vectors.shape[1])
    faiss.normalize_L2(vectors)
    index.add(vectors)
    # write_index expects a plain string file name, not a Path object.
    faiss.write_index(index, str(index_path))
    with open(meta_path, "w", encoding="utf-8") as f:
        # default=str keeps Path values in the records from breaking the dump.
        json.dump(meta, f, indent=2, default=str)

5. Query → embed with Jina → retrieve relevant pages

In [None]:
def jina_text_embed(query, model="jina-embeddings-v3"):
    """Embed one text query with the Jina Embeddings API.

    Args:
        query: The text to embed.
        model: Jina model identifier sent with the request. It should match
            the model that produced the vectors being searched; embeddings
            from different models are not comparable.

    Returns:
        A float32 array of shape ``(1, dim)``.

    Raises:
        RuntimeError: No API key is available.
        requests.HTTPError: The API answered with an error status.
    """
    # Fail early with a clear message instead of sending "Bearer None".
    api_key = JINA_API_KEY or os.getenv("JINA_API_KEY")
    if not api_key:
        raise RuntimeError("No JINA_API_KEY in env")

    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {"model": model, "input": [query]}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.post(JINA_EMBED_URL, json=payload, headers=headers, timeout=60)
    resp.raise_for_status()
    vector = np.asarray(resp.json()["data"][0]["embedding"], dtype=np.float32)
    return vector.reshape(1, -1)

def retrieve(query, k=5, index_path="faiss.idx", meta_path="meta.json"):
    """Return the stored records most similar to a text query.

    Args:
        query: Text to search for; embedded with ``jina_text_embed``.
        k: Maximum number of neighbours to request from the index.
        index_path: FAISS index file to load (str or Path).
        meta_path: JSON file holding one record per indexed vector.

    Returns:
        A list of ``{"score": float, "meta": record}`` dicts in the order
        FAISS returned them. The list is shorter than ``k`` when the index
        holds fewer than ``k`` vectors.
    """
    q_emb = jina_text_embed(query)
    faiss.normalize_L2(q_emb)
    # read_index expects a plain string file name, not a Path object.
    index = faiss.read_index(str(index_path))
    scores, ids = index.search(q_emb, k)
    with open(meta_path, encoding="utf-8") as f:
        meta = json.load(f)
    # FAISS fills unused result slots with id -1. Indexing meta[-1] would
    # silently return the LAST record as a bogus hit, so drop those slots.
    return [
        {"score": float(score), "meta": meta[int(idx)]}
        for score, idx in zip(scores[0], ids[0])
        if idx >= 0
    ]

6. Parse retrieved pages with unstructured

In [None]:
from unstructured.partition.auto import partition
import pytesseract
from PIL import Image

def parse_page(image_path):
    """Extract the text of one page image.

    ``unstructured``'s ``partition`` is tried first; if it raises, the page is
    OCR'd directly with ``pytesseract`` instead.

    Args:
        image_path: Location of the page image.

    Returns:
        ``{"path": image_path, "text": <extracted text>}``. With the primary
        parser the text is the non-empty element texts joined by newlines.
    """
    result = {"path": image_path, "text": ""}
    try:
        elements = partition(filename=str(image_path))
        texts = [el.text for el in elements if getattr(el, "text", None)]
        result["text"] = "\n".join(texts)
    except Exception:
        # Best-effort fallback. `Exception` (rather than a bare `except:`)
        # lets KeyboardInterrupt / SystemExit still stop the notebook.
        # The context manager closes the image file handle after OCR.
        with Image.open(image_path) as img:
            result["text"] = pytesseract.image_to_string(img)
    return result

7. RAPTOR-like index & answer query (local LLMs)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from transformers import pipeline

# Sentence encoder: build_clusters uses it to embed text chunks before clustering.
text_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
# Shared generation pipeline: build_clusters calls it to summarise each cluster
# and answer_query calls it to produce the final answer.
summarizer = pipeline(task="summarization", model="google/flan-t5-small")

def chunk_text(text, size=400, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: Input text; it is split on whitespace.
        size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks
            (``0 <= overlap < size``).

    Returns:
        The chunks in reading order; an empty list for empty or
        whitespace-only text. Chunking stops with the first chunk that
        reaches the last word, so no trailing chunk consisting purely of
        already-covered overlap words is produced.

    Raises:
        ValueError: ``size`` or ``overlap`` is out of range. With
            ``overlap >= size`` the window would never advance and the
            original loop never terminated.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    words = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + size]))
        if start + size >= len(words):
            # This chunk already contains the final word.
            break
    return chunks

def build_clusters(pages):
    """Chunk the page texts, cluster the chunks and summarise each cluster.

    Args:
        pages: Iterable of dicts with a ``"text"`` entry (the shape produced
            by ``parse_page``).

    Returns:
        One summary string per cluster, in order of first appearance of each
        cluster label; an empty list when the pages yield no chunks.
    """
    chunks = [chunk for page in pages for chunk in chunk_text(page["text"])]
    if not chunks:
        return []

    if len(chunks) == 1:
        # Clustering needs at least two samples; a lone chunk is its own
        # cluster. Previously this case requested 2 clusters for 1 sample and
        # raised inside scikit-learn.
        labels = [0]
    else:
        embs = text_model.encode(chunks, normalize_embeddings=True)
        # Roughly sqrt(n) clusters, at least 2 and never more than the
        # number of chunks.
        n_clusters = min(len(chunks), max(2, int(len(chunks) ** 0.5)))
        labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(embs)

    clusters = {}
    for label, chunk in zip(labels, chunks):
        clusters.setdefault(label, []).append(chunk)

    cluster_summaries = []
    for members in clusters.values():
        # Only the first 1000 characters of the joined chunks are summarised.
        text = " ".join(members)[:1000]
        summary = summarizer(text, max_length=100, min_length=20)[0]["summary_text"]
        cluster_summaries.append(summary)
    return cluster_summaries

def answer_query(query, cluster_summaries, max_summaries=3):
    """Answer a question from cluster summaries with the shared pipeline.

    Args:
        query: The user's question.
        cluster_summaries: Summary strings (as returned by ``build_clusters``).
        max_summaries: Upper bound on how many summaries go into the prompt.

    Returns:
        The generated answer text, or a fixed notice when there are no
        summaries to use as context.

    When more summaries are supplied than fit, the ones most similar to the
    query (dot product of normalised ``text_model`` embeddings) are chosen,
    most similar first. Previously the first three were used regardless of
    the question. With ``max_summaries`` or fewer summaries the original
    order is kept.
    """
    summaries = list(cluster_summaries)
    if not summaries:
        # Without context the model could only make something up.
        return "No context available to answer the question."

    if len(summaries) > max_summaries:
        vecs = np.asarray(text_model.encode([query] + summaries, normalize_embeddings=True))
        scores = vecs[1:] @ vecs[0]
        # Stable sort keeps the original order among equally scored summaries.
        best = np.argsort(-scores, kind="stable")[:max_summaries]
        summaries = [summaries[int(i)] for i in best]

    context = "\n\n".join(summaries)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
    return summarizer(prompt, max_length=150, min_length=30)[0]["summary_text"]

8. Usage flow (end-to-end example)

In [None]:
# End-to-end example. All artefacts go to the locations defined in the
# configuration cell so that indexing and retrieval read/write the same files.

# 1. Convert PDF to images
#    (out_dir must be a Path: pdf_to_page_images calls out_dir.mkdir)
pages = pdf_to_page_images("3M_2017_10K.pdf", PAGE_IMAGES_DIR)

# 2. Embed with Jina
embs = embed_images_with_jina(pages)

# 3. Index in FAISS
#    Paths are stored as strings because the metadata is written as JSON.
build_faiss_index(
    embs,
    [{"path": str(p)} for p in pages],
    index_path=str(FAISS_INDEX_PATH),
    meta_path=str(META_PATH),
)

# 4. Retrieve
hits = retrieve(
    "Q4 revenue growth",
    k=TOP_K_PAGES,
    index_path=str(FAISS_INDEX_PATH),
    meta_path=str(META_PATH),
)
print("Top hits:", hits)

# 5. Parse retrieved
parsed = [parse_page(h["meta"]["path"]) for h in hits]

# 6. Build RAPTOR index + answer
clusters = build_clusters(parsed)
print(answer_query("What was the Q4 revenue growth?", clusters))