In [None]:
from pathlib import Path
import os
import uuid
from pdf2image import convert_from_path
import easyocr, pytesseract
import cv2, numpy as np
from tqdm import tqdm
from langdetect import detect
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [None]:
# Input PDF to OCR and index (relative path, resolved against the working directory).
PDF_PATH = Path("../data/0.pdf")
# Sentence-transformers model used to embed both the chunks and the query.
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Directory handed to the Chroma client as its persistence location.
CHROMA_DIR = "chroma_db_notebook"
# Name of the Chroma collection that stores the chunk embeddings.
COLLECTION_NAME = "megher_nb"
# Number of nearest chunks requested per query.
TOP_K = 5

In [None]:
# Rasterise every page of the PDF at 300 DPI.
pages = convert_from_path(str(PDF_PATH), dpi=300)
print("Pages converted:", len(pages))

# Preview the first page as rendered, before any preprocessing.
fig, ax = plt.subplots(figsize=(5, 7))
ax.imshow(pages[0])
ax.axis("off")
ax.set_title("Page 1 (raw)")
plt.show()

In [None]:
def resize_image(img, max_side=2000):
    """Downscale ``img`` so that its longer side is at most ``max_side`` pixels.

    Args:
        img: Image array whose first two axes are (height, width).
        max_side: Maximum allowed length, in pixels, of the longer side.

    Returns:
        ``img`` itself (no copy) when it already fits; otherwise a resized
        image with the aspect ratio preserved.
    """
    h, w = img.shape[:2]
    longest = max(h, w)
    if longest <= max_side:
        return img
    scale = max_side / longest
    # Clamp to 1 px: int() truncation would otherwise produce a 0-sized
    # dimension for very elongated images, which cv2.resize rejects.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

In [None]:
def binarize(img, block_size=35, c=11):
    """Convert an image to a black/white image with adaptive Gaussian thresholding.

    Args:
        img: Either a single-channel image or a 3-channel image in BGR order.
            Every caller in this notebook converts the page to BGR before
            calling, so the grayscale conversion must use the BGR code; the
            previous RGB code swapped the red and blue luminance weights.
        block_size: Size of the pixel neighbourhood used to compute each local
            threshold (OpenCV requires an odd value greater than 1).
        c: Constant subtracted from the weighted neighbourhood mean.

    Returns:
        Single-channel image whose pixels are either 0 or 255.
    """
    if img.ndim == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                 cv2.THRESH_BINARY, block_size, c)

In [None]:
# Run page 1 through the same colour-convert -> resize -> binarize steps
# the OCR loop applies, and show the result.
img0 = resize_image(cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR))
b0 = binarize(img0)

fig, ax = plt.subplots(figsize=(5, 7))
ax.imshow(b0, cmap="gray")
ax.axis("off")
ax.set_title("Page 1 (binarized)")
plt.show()

In [None]:
# One record per OCRed page is appended to this list by the OCR loop.
pages_text = []
# Demo cap: process at most the first 10 pages.
MAX_PAGES = min(10, len(pages))
# EasyOCR reader for Bengali + English.
reader = easyocr.Reader(["bn","en"], gpu=False)  # set gpu=True if available

In [None]:
# Start from an empty list so re-running this cell does not append the same
# pages a second time (which would later yield duplicate chunk ids).
pages_text = []
for i in tqdm(range(MAX_PAGES), desc="OCR"):
    pil = pages[i]
    img = cv2.cvtColor(np.array(pil), cv2.COLOR_RGB2BGR)
    img = resize_image(img)
    img_bin = binarize(img)
    try:
        easy_res = reader.readtext(img_bin, detail=0)
        text_easy = "\n".join(easy_res).strip()
    except Exception as exc:
        # Best effort: keep going with the Tesseract fallback, but report the
        # failure instead of hiding it.
        print(f"EasyOCR failed on page {i+1}: {exc!r}")
        text_easy = ""

    text, engine = text_easy, "easyocr"
    if len(text_easy) < 20:
        # EasyOCR returned (almost) nothing: try Tesseract on the resized,
        # non-binarized image and keep whichever result is longer.
        tesseract_txt = pytesseract.image_to_string(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), lang="ben+eng").strip()
        if len(tesseract_txt) > len(text_easy):
            # Label the engine from the same comparison that picked the text,
            # so identical outputs are no longer reported as "tesseract".
            text, engine = tesseract_txt, "tesseract"
    pages_text.append({"page": i+1, "text": text, "engine": engine})

print("OCRed pages (demo):", len(pages_text))
if pages_text:
    print("Sample (page 1) snippet:", pages_text[0]["text"][:300])
else:
    print("No pages were OCRed.")

In [None]:
def clean_text(s):
    """Normalise raw OCR text into a single whitespace-collapsed line.

    Steps, in order:
      1. Convert CRLF / CR line endings to LF.
      2. Join words hyphenated across a line break (``"-\\n"`` is removed).
      3. Collapse every run of whitespace (including newlines) to one space.
      4. Replace the ASCII pipe ``|`` with the Bengali danda ``।``.

    Args:
        s: Text to clean. Falsy values (``None``, ``""``) are returned as-is.

    Returns:
        The cleaned string, or ``s`` unchanged when it is falsy.
    """
    if not s:
        return s
    # Line endings must be normalised BEFORE de-hyphenation; otherwise
    # "-\r\n" never matches "-\n" and the hyphen plus a space are left behind.
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = s.replace("-\n", "")
    # split()/join both collapses whitespace and trims the ends.
    s = " ".join(s.split())
    return s.replace("|", "।")

# Store a cleaned copy of each page's OCR text under the "clean" key.
for page_rec in pages_text:
    page_rec.update(clean=clean_text(page_rec["text"]))

print("Clean snippet (page1):", pages_text[0]["clean"][:300])

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Split every non-empty cleaned page (chunk_size=800, chunk_overlap=200) and
# wrap each piece in a Document tagged with its page, a "<page>_<index>"
# chunk id, and the OCR engine that produced the text.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
documents = [
    Document(
        page_content=piece,
        metadata={"page": rec["page"], "chunk_id": f"{rec['page']}_{pos}", "engine": rec["engine"]},
    )
    for rec in pages_text
    if rec["clean"]
    for pos, piece in enumerate(splitter.split_text(rec["clean"]))
]

In [None]:
# Stop here with an explicit message when nothing was chunked; indexing
# documents[0] on an empty list would only raise a bare IndexError.
if not documents:
    raise ValueError("No chunks were created - OCR returned no text for the processed pages.")

print("Chunks created (demo):", len(documents))
print("Example metadata:", documents[0].metadata)
print("Example chunk (first 200 chars):\n", documents[0].page_content[:200])

In [None]:
# Number of chunks that will be embedded: the first 200 at most.
N = min(200, len(documents))
# Embedding model named by EMBED_MODEL.
model = SentenceTransformer(EMBED_MODEL)

In [None]:
# Gather the text and metadata of the first N chunks in parallel lists,
# then embed the texts as L2-normalised numpy vectors.
texts, metas = [], []
for doc in documents[:N]:
    texts.append(doc.page_content)
    metas.append(doc.metadata)
embs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True)

In [None]:
# Reduce the chunk embeddings to two principal components so they can be
# drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
XY = pca.fit_transform(embs)  # column 0 -> PC1, column 1 -> PC2

In [None]:
# Scatter the 2-D projection, colouring each chunk by its source page.
fig, ax = plt.subplots(figsize=(8, 6))
pages_list = [meta["page"] for meta in metas]
sc = ax.scatter(XY[:, 0], XY[:, 1], c=pages_list, cmap="tab20", s=30)
fig.colorbar(sc, ax=ax, label="page")
ax.set_title(f"PCA of chunk embeddings (first {N} chunks)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()

In [None]:
# Open the on-disk Chroma store.
# chromadb >= 0.4 rejects the legacy Settings(chroma_db_impl="duckdb+parquet")
# configuration and offers PersistentClient instead; older releases only have
# the Settings-based client, so pick whichever this installation provides.
if hasattr(chromadb, "PersistentClient"):
    client = chromadb.PersistentClient(path=CHROMA_DIR)
else:
    client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory=CHROMA_DIR))

# A single call replaces get/except/create, whose blanket `except Exception`
# also masked errors unrelated to a missing collection.
coll = client.get_or_create_collection(COLLECTION_NAME)

In [None]:
# Write the chunk ids, texts, metadata and precomputed embeddings to Chroma.
ids = [m["chunk_id"] for m in metas]
docs_texts = texts
metadatas = metas
# Prefer upsert so re-running the notebook overwrites the existing ids instead
# of re-submitting them through add(); fall back to add() where upsert is missing.
store = getattr(coll, "upsert", coll.add)
store(ids=ids, documents=docs_texts, metadatas=metadatas, embeddings=embs.tolist())
# persist() exists only on the legacy client; skip it when it is not available.
if hasattr(client, "persist"):
    client.persist()

In [None]:
# count() asks the collection for its size directly; get() would fetch every
# stored record only to measure the length of the id list.
print("Chroma items (approx):", coll.count())

In [None]:
# Embed the query the same way as the chunks (normalised), then fetch the
# nearest chunks from Chroma.
query = "বৃষ্টির বর্ণনা কেন গুরুত্বপূর্ণ ছিল?"
q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
# Do not ask for more neighbours than the collection holds (short PDFs may
# produce fewer than TOP_K chunks); n_results must stay at least 1.
n_results = max(1, min(TOP_K, coll.count()))
res = coll.query(query_embeddings=[q_emb.tolist()], n_results=n_results, include=["metadatas","documents","distances"])


In [None]:
# The response holds one result list per query; only one query was sent,
# so take index 0 of each field and zip them into one record per hit.
hit_fields = zip(res["ids"][0], res["documents"][0], res["metadatas"][0], res["distances"][0])
retrieved = [
    {"id": hit_id, "doc": hit_doc, "meta": hit_meta, "distance": hit_dist}
    for hit_id, hit_doc, hit_meta, hit_dist in hit_fields
]

# Print a one-line header and a 300-character preview for each hit.
for rank, hit in enumerate(retrieved, start=1):
    print(f"Rank {rank} | id={hit['id']} | page={hit['meta']['page']} | distance={hit['distance']:.4f}")
    print(hit["doc"][:300].replace("\n"," ") + "\n" + "-"*60)

In [None]:
# Re-embed the retrieved chunks in ONE batched encode call (previously one
# model call per hit) and compare each against the query embedding.
doc_embs = model.encode([r["doc"] for r in retrieved], convert_to_numpy=True, normalize_embeddings=True)
sims = cosine_similarity([q_emb], doc_embs)[0]

In [None]:
# Bar chart of query-to-chunk cosine similarity, one bar per rank.
ranks = range(1, len(sims) + 1)
fig, ax = plt.subplots(figsize=(7, 3))
ax.bar(ranks, sims)
ax.set_xticks(ranks)
ax.set_xlabel("Rank")
ax.set_ylabel("Cosine similarity")
ax.set_title("Query vs retrieved chunk similarity")
ax.set_ylim(0, 1)
plt.show()

In [None]:
# Tabulate rank, chunk id, source page and similarity for each hit; the
# DataFrame is the cell's last expression so the notebook renders it.
ranking_rows = []
for rank, (hit, score) in enumerate(zip(retrieved, sims), start=1):
    ranking_rows.append({"rank": rank, "id": hit["id"], "page": hit["meta"]["page"], "similarity": float(score)})
pd.DataFrame(ranking_rows)

In [None]:
# Backend switches for the answer-generation step: Ollama is tried first when
# enabled, otherwise OpenAI is used if a key is present.
USE_OLLAMA = False  # set True to use local Ollama API at http://localhost:11434
# Read the key from the environment; an empty string when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

In [None]:
# Build the prompt: retrieved chunks as reference material, followed by the
# question, phrased in Bengali or English depending on the detected language.
context = "\n\n---\n\n".join([r["doc"] for r in retrieved])
detected = detect(query)
if detected.startswith("bn"):
    prompt = f"রেফারেন্স:\n{context}\n\nপ্রশ্ন: {query}\n\nসংক্ষিপ্তভাবে বাংলায় উত্তর দিন:"
else:
    prompt = f"Reference:\n{context}\n\nQuestion: {query}\n\nAnswer briefly in English:"

if USE_OLLAMA:
    import requests
    # /api/generate streams newline-delimited JSON unless "stream" is False,
    # which makes resp.json() fail; the token limit belongs in
    # options.num_predict ("max_tokens" is not an Ollama request field).
    resp = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "llama3", "prompt": prompt, "stream": False, "options": {"num_predict": 512}},
        timeout=60,
    )
    resp.raise_for_status()  # surface HTTP errors instead of parsing an error body
    ans = resp.json().get("response", "")
    print("Ollama:", ans)
elif OPENAI_API_KEY:
    from openai import OpenAI
    # Use a dedicated name: assigning to `client` here would overwrite the
    # Chroma client defined earlier and break re-running the Chroma cells.
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    resp = openai_client.chat.completions.create(model="gpt-4o-mini", messages=[{"role":"user","content":prompt}], max_tokens=512, temperature=0.2)
    ans = resp.choices[0].message.content
    print("LLM answer:\n", ans)
else:
    print("No LLM configured (set OPENAI_API_KEY or USE_OLLAMA=True).")

In [None]:
# Collect one row per hit (rank, id, page, similarity, 400-char snippet) and
# write the table to a CSV file in the data directory.
rows = [
    {"rank": rank, "id": hit["id"], "page": hit["meta"]["page"], "similarity": float(score), "snippet": hit["doc"][:400]}
    for rank, (hit, score) in enumerate(zip(retrieved, sims), start=1)
]
out = Path("../data") / "retrieval_eval.csv"
df = pd.DataFrame(rows)
df.to_csv(out, index=False)


In [None]:
# Report where the CSV was written, then preview its first rows
# (df.head() is the last expression so the notebook renders it).
print("Saved retrieval_eval.csv ->", out)
df.head()

## Next steps (practical)
- If this notebook works, extract the functions into the modular files (`src/ocr_extraction.py`, `src/preprocess.py`, etc.).
- For production ingestion, use a background worker (Celery/RQ) for large PDFs.
- Improve OCR by tuning the `binarize` parameters, adding a deskew step, or enabling the GPU for EasyOCR (`gpu=True`).
- Use GPU for sentence-transformers to speed up embedding large volumes.
