## Document Q&A system

1. Read a PDF
2. Chunk the text
3. Create embeddings
4. Store them in ChromaDB
5. Answer user questions using retrieved chunks

```
pip install PyPDF2 chromadb openai python-dotenv
```

In [None]:
import os
import PyPDF2
import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI
from dotenv import load_dotenv

# Let python-dotenv populate the process environment, then look up the
# AvalAI key; the lookup yields None when the variable is not set.
load_dotenv()
aval_api_key = os.environ.get("AVALAI_API_KEY")


In [None]:
# -------------------------
# Step 1: Read PDF
# -------------------------
def read_pdf(file_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of all pages in document order, with a newline
        appended after each page. An empty string when the PDF has no pages.
    """
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # The reader was built from this file handle, so the pages are
        # extracted before the handle closes. A single join replaces the
        # repeated `+=` that rebuilt the growing string once per page.
        return "".join(page.extract_text() + "\n" for page in reader.pages)


In [None]:
# -------------------------
# Step 2: Chunk text
# -------------------------
def chunk_text(text, chunk_size=300, overlap=0):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Words are re-joined with single spaces, so the original whitespace
    (newlines, tabs, repeated spaces) is not preserved inside a chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words repeated from the end of one chunk at the
            start of the next. Must satisfy ``0 <= overlap < chunk_size``.
            The default of 0 produces disjoint chunks.

    Returns:
        A list of chunk strings in document order; empty when ``text``
        contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must be in [0, chunk_size), got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already emitted (possible when overlap > 0).
        if start + chunk_size >= len(words):
            break
    return chunks


In [None]:
# -------------------------
# Step 3: Setup ChromaDB & Embeddings
# -------------------------
# Initialize Chroma client
client = chromadb.Client()

# Embedding function that talks to the OpenAI-compatible AvalAI endpoint
embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=aval_api_key,
    api_base="https://api.avalai.ir/v1",
    model_name="text-embedding-3-small",
)

collection_name = "qa_collection"

# Start every run from an empty collection: drop a leftover one first.
# Depending on the installed Chroma release, list_collections() yields either
# Collection objects (with a .name) or plain name strings, so accept both.
existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if collection_name in existing_names:
    client.delete_collection(collection_name)

# Create the collection that will hold the document chunks
collection = client.create_collection(
    name=collection_name,
    embedding_function=embedding_fn,
)



In [None]:
# -------------------------
# Step 4: Add PDF chunks to ChromaDB
# -------------------------
def add_chunks_to_chroma(chunks):
    """Store text chunks in the module-level ``collection``.

    Each chunk is stored under its position as a string id ("0", "1", ...)
    with a ``{"source": "chunk_<position>"}`` metadata entry.

    Args:
        chunks: Iterable of chunk strings. An empty iterable is a no-op.

    Returns:
        None.
    """
    # Materialize once so any iterable (not only a list) can be indexed.
    chunks = list(chunks)
    if not chunks:
        # Nothing to store (e.g. the PDF yielded no text); skip the call
        # instead of handing Chroma empty document/id lists.
        return

    positions = range(len(chunks))
    collection.add(
        documents=chunks,
        metadatas=[{"source": f"chunk_{i}"} for i in positions],
        ids=[str(i) for i in positions],
    )

In [None]:
# -------------------------
# Step 5: Ask questions using LLM
# -------------------------
def ask_question(question, n_results=3, model="gpt-4o-mini"):
    """Answer a question using chunks retrieved from the module-level collection.

    The question is used as the query text against ``collection``; the
    returned chunks are joined into a context block and sent, together with
    the question, to the language model through the AvalAI endpoint.

    Args:
        question: The user's question.
        n_results: How many chunks to retrieve as context (default 3).
        model: Name of the model used to generate the answer.

    Returns:
        The text of the model's response.
    """
    results = collection.query(query_texts=[question], n_results=n_results)
    # Exactly one query text was sent, so its matches are the first entry.
    retrieved_chunks = results["documents"][0]

    # Blank lines between chunks keep them visually separate in the prompt.
    context = "\n\n".join(retrieved_chunks)

    client_openai = OpenAI(
        api_key=aval_api_key,
        base_url="https://api.avalai.ir/v1",
    )

    user_prompt = f"Answer the question using the context below:\n\nContext:\n{context}\n\nQuestion: {question}"
    response = client_openai.responses.create(
        model=model,
        input=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_prompt},
        ],
    )

    return response.output_text


In [None]:
# -------------------------
# Step 6: Main workflow
# -------------------------
if __name__ == "__main__":
    # Try one of these documents or put your own document
    pdf_file = "documents/attention_all_you_need.pdf"
    # pdf_file = "documents/react.pdf"

    print("Reading PDF...")
    text = read_pdf(pdf_file)

    print("Chunking text...")
    chunks = chunk_text(text, chunk_size=300)

    print("Adding chunks to ChromaDB...")
    add_chunks_to_chroma(chunks)

    # Interactive loop: one question per iteration until the user quits.
    while True:
        try:
            question = input("\nAsk a question (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt ends the session like 'exit'
            # instead of surfacing a traceback.
            print()
            break

        if not question:
            # Blank input: ask again rather than sending an empty query.
            continue
        if question.lower() == "exit":
            break
        answer = ask_question(question)

        print("\nQuestion:\n", question)
        print("\nAnswer:\n", answer)
