# RAG-Based Application with PDF Processing

In [1]:
import requests
import json
import fitz  # pymupdf
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient

## Extracting the PDF Text

In [2]:
def extract_text_from_pdf(path):
    """Read a PDF and return its plain text, one entry per page.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        A list of ``(page_index, text)`` tuples in document order.
        ``page_index`` is zero-based because it comes from ``enumerate``.
    """
    # Opening the document as a context manager guarantees it is closed (and
    # its file handle released) even if text extraction fails part-way through.
    with fitz.open(path) as doc:
        # IMPORTANT: callers unpack each entry as (page_number, text).
        return [(index, page.get_text("text")) for index, page in enumerate(doc)]


## Divide the Text into Smaller Chunks

In [3]:
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split. Words are obtained with ``str.split()``, so
            any run of whitespace is a separator and chunks are re-joined
            with single spaces.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            non-negative and strictly smaller than ``chunk_size``.

    Returns:
        A list of chunk strings. Empty or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` would make the sliding
            window stand still or move backwards (which previously caused an
            infinite loop).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # contain only words already present in this chunk, so stop here
        # instead of emitting a redundant tail.
        if start + chunk_size >= len(tokens):
            break
    return chunks

## Processing the PDF for Storage in the Database

In [4]:
def ingest_pdf(pdf_path):
    """Turn a PDF into parallel lists of chunks, ids and metadata.

    Each page returned by ``extract_text_from_pdf`` is split with
    ``chunk_text``; every resulting chunk gets an id built from the PDF path,
    page number and chunk position, plus a metadata dict carrying the same
    three values.

    Args:
        pdf_path: Path of the PDF to process; also used verbatim in the ids
            and as the ``"source"`` metadata value.

    Returns:
        A tuple ``(chunks, ids, metadata)`` of three equally long lists.
    """
    documents, record_ids, record_metadata = [], [], []

    for page_number, page_text in extract_text_from_pdf(pdf_path):
        for position, piece in enumerate(chunk_text(page_text)):
            documents.append(piece)
            record_ids.append(f"{pdf_path}_page{page_number}_chunk{position}")
            record_metadata.append(
                {"source": pdf_path, "page": page_number, "chunk": position}
            )

    return documents, record_ids, record_metadata

## Setting Up the Vector Database and Embedding Model

In [5]:
# Chroma client created with the ./chroma_db path.
client = PersistentClient(path="./chroma_db")
# Fetch the "pdf_data" collection, creating it on first use. The metadata asks
# for the cosine space, which is why query distances are later converted to
# similarities with 1 - distance.
collection = client.get_or_create_collection(
    name="pdf_data",
    metadata={"hnsw:space": "cosine"}
)
# Sentence-embedding model used for both document chunks and queries.
model = SentenceTransformer("all-MiniLM-L6-v2")  # local

## Adding PDFs to the Vector Database

In [6]:
def add_pdf_to_chroma(pdf_path):
    """Chunk, embed and store a PDF in the ``pdf_data`` Chroma collection.

    Uses the module-level ``model`` for embeddings and ``client`` for storage.
    Records are written with ``upsert`` so that re-running the cell for the
    same PDF refreshes its chunks instead of colliding with existing ids.

    Args:
        pdf_path: Path of the PDF to ingest.

    Returns:
        None. A status line is printed in either case.
    """
    chunks, ids, metadata = ingest_pdf(pdf_path)

    # A PDF without extractable text (e.g. scanned images) produces no chunks;
    # there is nothing to embed or store, so skip the database call.
    if not chunks:
        print(f"PDF '{pdf_path}' contains no extractable text; nothing added.")
        return

    embeddings = model.encode(chunks, show_progress_bar=True)

    collection = client.get_or_create_collection(
        name="pdf_data",
        metadata={"hnsw:space": "cosine"}
    )

    collection.upsert(
        documents=chunks,
        metadatas=metadata,
        ids=ids,
        embeddings=embeddings.tolist()
    )
    print(f"PDF '{pdf_path}' added successfully!")


## Retrieving the Relevant Data from the Database

In [7]:
def retrieve_relevant_chunks(query, model, collection, top_k=5, min_similarity=0.25):
    """Fetch the chunks most similar to a query from the vector collection.

    Args:
        query: The user's question as plain text.
        model: Object with an ``encode(list_of_texts)`` method returning one
            embedding per text.
        collection: Object with a ``query(query_embeddings=..., n_results=...,
            include=...)`` method returning a dict with ``"documents"``,
            ``"metadatas"`` and ``"distances"`` entries.
        top_k: Maximum number of chunks to request.
        min_similarity: Threshold applied to the *best* hit. If even the best
            similarity is below it, nothing is returned; otherwise all hits
            are returned, including any that individually fall below it.

    Returns:
        A tuple ``(documents, metadatas, similarities)`` of parallel lists, or
        three empty lists when there are no hits or none is similar enough.
    """
    query_emb = model.encode([query])[0]
    # Pass a plain list, matching how embeddings are stored at ingest time.
    query_vector = query_emb.tolist() if hasattr(query_emb, "tolist") else list(query_emb)

    results = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

    docs = results["documents"][0]
    metas = results["metadatas"][0]
    dists = results["distances"][0]   # cosine distance

    # Convert distance -> similarity
    sims = [1 - d for d in dists]

    # No hits at all (e.g. an empty collection) would make max() raise, so
    # treat it the same as "nothing relevant found".
    if not sims or max(sims) < min_similarity:
        return [], [], []

    return docs, metas, sims

## Building the Prompt for LLM

In [8]:
def build_prompt(chunks, question):
    """Assemble the grounded-answer prompt sent to the LLM.

    Args:
        chunks: Iterable of retrieved text chunks; each is placed under a
            numbered ``--- Chunk N ---`` header (numbering starts at 1).
        question: The user's question, inserted after the context.

    Returns:
        The full prompt string: the answering rules, the numbered context,
        the question, and a trailing ``ANSWER:`` cue.
    """
    context = "".join(
        f"\n--- Chunk {number} ---\n{body}\n"
        for number, body in enumerate(chunks, start=1)
    )

    return f"""
You are a helpful assistant.
You MUST follow these rules:

1. You are ONLY allowed to answer using the context.
2. If the answer is not explicitly found in the context, you MUST reply exactly:
   "I don't know based on the provided documents."
3. Do NOT use your own knowledge.
4. Do NOT guess. Do NOT assume.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""


## Using Ollama Local API for LLM

In [9]:
def ask_ollama(prompt, model_name="mistral", timeout=300):
    """Send a prompt to the local Ollama server and return the generated text.

    The request is made in streaming mode: the server replies with one JSON
    object per line, and the ``"response"`` fragments are concatenated.

    Args:
        prompt: The full prompt text.
        model_name: Name of the Ollama model to use.
        timeout: Seconds passed to ``requests`` as the request timeout, so an
            unresponsive server cannot hang the notebook indefinitely.

    Returns:
        The concatenated model output as a single string.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If a streamed line carries an ``"error"`` field.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model_name,
        "prompt": prompt
    }

    fragments = []
    # The context manager releases the streamed connection even if parsing
    # fails or the loop exits early.
    with requests.post(url, json=payload, stream=True, timeout=timeout) as response:
        # Surface HTTP failures instead of silently returning an empty answer.
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            data = json.loads(line)
            if "error" in data:
                raise RuntimeError(f"Ollama error: {data['error']}")
            fragments.append(data.get("response", ""))
    return "".join(fragments)

## Final Answer Function

In [10]:
def ask_question(question, top_k=5):
    """Answer a question using only content retrieved from the stored PDFs.

    Uses the module-level ``model`` and ``collection`` for retrieval.

    Args:
        question: The user's question as plain text.
        top_k: Maximum number of chunks to retrieve for the prompt.

    Returns:
        The LLM's answer, or the fixed "I don't know" message when retrieval
        finds nothing relevant (in which case the LLM is not called).
    """
    # STEP 1 - retrieve chunks using vector search.
    # The third value holds similarity scores (not ids); it and the metadata
    # are currently unused here.
    chunks, _metadatas, _similarities = retrieve_relevant_chunks(
        query=question,
        model=model,
        collection=collection,
        top_k=top_k
    )

    if not chunks:
        return "I don't know based on the provided documents."

    # STEP 2 - build LLM prompt using retrieved chunks
    prompt = build_prompt(chunks, question)

    # STEP 3 - ask the Ollama model
    return ask_ollama(prompt)


## Adding PDFs to Database

In [13]:
# Ingest the PDF: its text is chunked, embedded and stored in the collection.
add_pdf_to_chroma('book.pdf')

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

PDF 'book.pdf' added successfully!


In [14]:
# End-to-end check: retrieve matching chunks and let the local LLM answer from them.
response = ask_question("How can we create an array in JAVA?")
print(response)

 An array in Java can be created by using an array initializer as shown in Chunk 1:
```java
type[] var-name = {value1, value2, ...};
```
For example, to store the number of days in each month, the following code creates an initialized array of integers:
```java
int[] month_days = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
```
