## A more advanced Document Q&A

1. Set up ChromaDB with persistent storage
2. Read multiple PDFs from a directory and add metadata:
   ```python
   documents_info = [
       {"path": "attention_all_you_need.pdf", "category": "attention"},
       {"path": "react.pdf", "category": "policy"},
       ...
   ]
   ```
3. Reduce the dimensionality of the vectors to 2D using t-SNE (t-distributed stochastic neighbor embedding)
4. Visualize the vector store
5. Answer questions about any of the documents
6. Use the LangChain library


We use LangChain. Install the required packages:
```
pip install langchain langchain-openai langchain-chroma langchain-text-splitters langchain-community langchain-classic

pip install plotly scikit-learn PyPDF2 python-dotenv
```

In [None]:
import os

import numpy as np
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_classic.chains import RetrievalQA

import PyPDF2
import plotly.graph_objects as go
from sklearn.manifold import TSNE

from dotenv import load_dotenv

# Load variables from a .env file into the process environment, then read
# the AvalAI API key (None when the variable is not set).
load_dotenv()
aval_api_key = os.environ.get("AVALAI_API_KEY")


In [None]:
# ===============================
# 2. CONFIGURATION
# ===============================

# Directory on disk where the Chroma database is kept between runs.
persist_directory = "./chroma_db"

# Embedding client; requests go to the AvalAI endpoint given by base_url
# and are authenticated with the key read from the environment.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    base_url="https://api.avalai.ir/v1",
    api_key=aval_api_key,
)



In [None]:
# ===============================
# 3. LOAD MULTIPLE PDF DOCUMENTS
# ===============================

# Each entry names a PDF on disk and the category label that is stored in
# the metadata of every chunk taken from that file.
documents_info = [
    {"path": "documents/attention_all_you_need.pdf", "category": "attention"},
    {"path": "documents/react.pdf", "category": "policy"},
    # Add more PDFs here...
]

all_documents = []
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

for info in documents_info:
    filepath = info["path"]
    category = info["category"]

    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        continue

    # Extract text page by page; collect the pieces and join them once.
    page_texts = []
    with open(filepath, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page_number, page in enumerate(reader.pages, start=1):
            try:
                page_texts.append(page.extract_text() or "")
            except Exception as exc:
                # Best effort: one unreadable page should not abort the whole
                # file, but report it so missing content is not a surprise.
                print(f"Skipping page {page_number} of {filepath}: {exc}")
    text = "".join(page_texts)

    # Split into chunks
    chunks = splitter.split_text(text)
    if not chunks:
        # E.g. a scanned PDF without a text layer.
        print(f"No extractable text in: {filepath}")
        continue

    # Add metadata + store as Document objects
    all_documents.extend(
        Document(
            page_content=chunk,
            metadata={"category": category, "source": filepath},
        )
        for chunk in chunks
    )

print(f"Loaded {len(all_documents)} document chunks.")


In [None]:
# ===============================
# 4. BUILD CHROMADB VECTOR STORE
# ===============================

vectorstore = Chroma(
    collection_name="pdf_docs",
    embedding_function=embedding_model,
    persist_directory=persist_directory
)

# The store is persistent, so re-running this cell without ids would insert
# every chunk again under a new random id. Deterministic ids (source file +
# position of the chunk within that file) make repeated runs overwrite the
# same entries instead of duplicating them.
chunk_ids = []
chunks_per_source = {}
for doc in all_documents:
    source = doc.metadata["source"]
    position = chunks_per_source.get(source, 0)
    chunks_per_source[source] = position + 1
    chunk_ids.append(f"{source}::chunk-{position}")

if all_documents:
    vectorstore.add_documents(all_documents, ids=chunk_ids)
    print("ChromaDB persisted.")
else:
    # Nothing was loaded; avoid sending an empty batch to the embedding API.
    print("No document chunks to add.")


In [None]:
# Fetch a single stored vector and report its dimensionality.

collection = vectorstore._collection  # private handle to the store's collection
sample = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Prework: pull every stored vector with its text and metadata, and derive
# one marker colour per chunk from its category.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']

# Entries without metadata or without a category are labelled 'unknown'
# rather than raising.
doc_types = [
    (metadata or {}).get('category', 'unknown')
    for metadata in result['metadatas']
]

# Categories missing from this map (e.g. PDFs added later with a new label)
# fall back to gray instead of raising ValueError.
category_colors = {'attention': 'green', 'policy': 'red'}
colors = [category_colors.get(t, 'gray') for t in doc_types]

In [None]:
# ===============================
# 5. TSNE VISUALIZATION
# ===============================

# t-SNE requires perplexity < number of samples. scikit-learn's default is 30,
# so cap it for small collections; larger collections keep the default.
perplexity = min(30, len(vectors) - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per chunk, coloured by category,
# with the category and the first 100 characters of text shown on hover.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles for a 2D plot belong on the layout itself; `scene` only
# applies to 3D traces and would leave these axes unlabelled.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()


In [None]:
# ===============================
# 6. Q&A OVER ALL DOCUMENTS
# ===============================

# Chat model served through the same AvalAI endpoint as the embeddings;
# temperature=0 keeps answers as repeatable as possible.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=aval_api_key,
    base_url="https://api.avalai.ir/v1",
    temperature=0
)

# Retrieval chain: fetches relevant chunks from the vector store and hands
# them to the LLM together with the question.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever()
)

query = "What is the main idea behind the attention mechanism?"
# invoke() returns a dict holding the original "query" and the generated
# "result"; only the latter is the answer text.
answer = qa.invoke(query)

print("\nQUESTION:", query)
print("ANSWER:", answer["result"])
