In [23]:
path_to_pdf = r"C:\Users\Rohan\Desktop\Rohan\ML\RAG\Animal-Kingdom.pdf"

In [24]:
from random import randint,random
import os
import fitz  # PyMuPDF
from tqdm.auto import tqdm
#read PDF using PyMuPDF
def read_document(path_to_pdf: str) -> str:
    doc = fitz.open(path_to_pdf)
    texts = []
    for page in doc:
        texts.append(page.get_text("text"))
    return "\n".join(texts)

read_document(path_to_pdf)


'Animal Kingdom \n\n\n\n\n1.\nAnimal Kingdom \nINTRODUCTION\nThere is a vast diversity in the number and types \nof animals in nature. They range from unicellular \nto multicellular. They are found in the deepest \nof the oceans, in snow-covered mountains, from \nthe poles to the equator. There are differences in \nthe structure and forms of the different animals. \nBut there are some fundamental features in them \nwhich show resemblance such as arrangement \nof cells, body plan, symmetry, segmentation, \ncoelom, germ layer, body temperature, skeleton \nand notochord, types of digestive, respiratory, \ncirculatory, excretory, and reproductive systems.\nWe need to classify organisms to make our study \neasier and to study the interrelationship between \ndifferent groups.\nBASES OF CLASSIFICATION\nHabitat\n\t\ny\nOn the basis of habitat, animals are divided into \nthe following types:\n\t\n⚪\nAquatic-Organisms that are found in water. \nThey can be: \n\t\n\x90\nMarine-Echinoderms, many s

In [25]:
import spacy
nlp = spacy.load("en_core_web_sm")

def split_text(text: str, chunk_size: int = 500):
    # cleaning
    text = text.replace("\n", " ")
    text = " ".join(text.split())
    text = text.replace("⚪", "").replace("y", "")
    if text and text[0].isdigit():
        text = text.split(" ", 1)[1]
    
    # sentence segmentation
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    
    chunks, current_chunk, current_size = [], [], 0
    for sentence in sentences:
        if not sentence.endswith("."):
            sentence += "."
        sentence_size = len(sentence)
        
        if current_size + sentence_size > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_size = sentence_size
        else:
            current_chunk.append(sentence)
            current_size += sentence_size
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

split_text(read_document(path_to_pdf))

['Animal Kingdom 1. Animal Kingdom INTRODUCTION There is a vast diversit in the number and tpes of animals in nature. The range from unicellular to multicellular. The are found in the deepest of the oceans, in snow-covered mountains, from the poles to the equator. There are differences in the structure and forms of the different animals.',
 'But there are some fundamental features in them which show resemblance such as arrangement of cells, bod plan, smmetr, segmentation, coelom, germ laer, bod temperature, skeleton and notochord, tpes of digestive, respirator, circulator, excretor, and reproductive sstems. We need to classif organisms to make our stud easier and to stud the interrelationship between different groups.',
 'BASES OF CLASSIFICATION Habitat  On the basis of habitat, animals are divided into the following tpes:  Aquatic-Organisms that are found in water. The can be: \x90 Marine-Echinoderms, man sponges and coelenterates. \x90 Fresh water-Prawns, some fishes and molluscs. Te

In [27]:
#!pip install chromadb sentence-transformers
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client with persistence
client = chromadb.PersistentClient(path="chroma_db")

# Configure sentence transformer embedding function
# Use a smaller, more memory-efficient model
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2", device="cpu"
)

# Create or get existing collection
collection = client.get_or_create_collection(
    name="documents_collection",
    embedding_function=sentence_transformer_ef
)


In [28]:
def process_document(file_path: str):
    """Process a single document and prepare it for ChromaDB"""
    try:
        # Read the document
        content = read_document(path_to_pdf)
        # Split into chunks
        chunks = split_text(content)

        # Prepare metadata
        file_name = os.path.basename(file_path)
        metadatas = [{"source": file_name, "chunk": i} for i in range(len(chunks))]
        ids = [f"{file_name}_chunk_{i}" for i in range(len(chunks))]

        return ids, chunks, metadatas
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return [], [], []

process_document(path_to_pdf)

Error processing C:\Users\Rohan\Desktop\Rohan\ML\RAG\Animal-Kingdom.pdf: Unable to allocate 22.2 MiB for an array with shape (20203, 288) and data type float32


([], [], [])

In [29]:
def add_to_collection(collection, ids, texts, metadatas, batch_size: int = 100):
    if not texts:
        return
    for i in range(0, len(texts), batch_size):
        j = min(i + batch_size, len(texts))
        collection.add(documents=texts[i:j], metadatas=metadatas[i:j], ids=ids[i:j])

def process_and_add_documents(collection, folder_path: str):
    files = [os.path.join(folder_path, f) for f in os.listdir(folder_path)
             if os.path.isfile(os.path.join(folder_path, f)) and f.lower().endswith(".pdf")]
    for fp in files:
        print(f"Processing {os.path.basename(fp)}...")
        ids, texts, metas = process_document(fp)
        add_to_collection(collection, ids, texts, metas)
        print(f"Added {len(texts)} chunks.")


In [30]:
def semantic_search(collection, query: str, n_results: int = 2):
    """Perform semantic search on the collection"""
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )
    return results

def get_context_with_sources(results):
    """Extract context and source information from search results"""
    # Combine document chunks into a single context
    context = "\n\n".join(results['documents'][0])

    # Format sources with metadata
    sources = [
        f"{meta['source']} (chunk {meta['chunk']})" 
        for meta in results['metadatas'][0]
    ]

    return context, sources

# 0) Are there vectors in this collection?
print("Collection name:", collection.name)
print("Vector count:", collection.count())

Collection name: documents_collection
Vector count: 24


In [32]:
# 1) Ingest ONE file quickly (page-aware) 
# (If you already have process_document/add_to_collection defined, reuse those)
ids, texts, metas = process_document(path_to_pdf)
add_to_collection(collection, ids, texts, metas)
print("After ingest, count:", collection.count())

# 2) Query with diagnostics
query = "Amphibians  Meiolecithal"
results = semantic_search(collection, query, n_results=5)

# 3) Inspect the raw structure
print({k: len(v[0]) if v else 0 for k, v in results.items() if isinstance(v, list)})

# 4) Handle empty results cleanly
if not results.get("documents") or not results["documents"][0]:
    print("No results found. Possible causes:")
    print(" - Collection is empty or not the one you ingested into")
    print(" - Persistence path points to an empty DB (check 'chroma_db' folder)")
    print(" - Different embedding_function used now vs at ingest time")
else:
    # Show nicely
    for doc, meta, dist in zip(results["documents"][0],
                               results["metadatas"][0],
                               results["distances"][0]):
        print(f"[{meta.get('source')} | chunk {meta.get('chunk')} | sim={1-dist:.3f}]")
        print(doc[:300], "...\n")



Error processing C:\Users\Rohan\Desktop\Rohan\ML\RAG\Animal-Kingdom.pdf: Unable to allocate 22.2 MiB for an array with shape (20203, 288) and data type float32
After ingest, count: 24
{'ids': 5, 'documents': 5, 'included': 9, 'metadatas': 5, 'distances': 5}
[mini-Animal-Kingdom.pdf | chunk 23 | sim=0.396]
Examples: Eggs of Amphibians  Meiolecithal:. Yolk is present in the entire ooplasm, except for a little disc-shaped space in the ctoplasm for the nucleus. Examples: Eggs of Reptiles Definition Metamorphosis: Drastic changes through which the larva of an organism, structurall and morphologicall modif ...

[mini-Animal-Kingdom.pdf | chunk 16 | sim=0.385]
It is of three tpes:. Schizocoelom: It develops due to the split in the mesoderm sheet. Examples: Arthropods, molluscs, annelids  Enterocoelom: The mesoderm arises from the wall of the embronic gut as a hollow outgrowth and forms the coelom. Examples: Echinoderms, chordates  Haemocoelomates: In Arth ...

[mini-Animal-Kingdom.pdf | chunk