# Load Existing ChromaDB Embeddings

This notebook shows how to connect to an existing ChromaDB collection and query its stored embeddings from a new notebook, without re-creating the database.

**Prerequisites:** You must have already created the ChromaDB database using `GENAI_PROJECT_CHROMADB.ipynb`

## Step 1: Install Required Packages

In [None]:
# Install packages if needed
!pip install -q chromadb==0.4.18
!pip install -q sentence-transformers==2.2.2
!pip install -q openai==1.12.0

print("‚úÖ Packages installed!")

## Step 2: Import Libraries

In [None]:
import getpass
import os
from collections import Counter

import chromadb
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer

print("‚úÖ Libraries imported!")

## Step 3: Connect to Existing ChromaDB

In [None]:
# Connect to the SAME directory where you created the database
persist_directory = os.path.expanduser("~/FinancialAI/chromadb")

# Fail early with a clear message when the path is wrong, instead of letting
# the client initialise a fresh, empty store at a mistyped location and only
# failing later on the collection lookup.
if not os.path.isdir(persist_directory):
    raise FileNotFoundError(
        f"ChromaDB directory not found: {persist_directory}. "
        "Create the database with GENAI_PROJECT_CHROMADB.ipynb first, "
        "or correct `persist_directory`."
    )

print(f"üìÅ Connecting to ChromaDB at: {persist_directory}")

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path=persist_directory)

# Get the existing collection (don't create a new one!).
# get_collection (unlike get_or_create_collection) errors out if the
# collection is absent, which is what we want here.
collection = chroma_client.get_collection(name="financial_filings")

print(f"‚úÖ Connected to collection: {collection.name}")
print(f"üìä Total documents in collection: {collection.count()}")

## Step 4: Load the Same Embedding Model

In [None]:
# The query encoder must be the very same model that produced the stored
# embeddings; otherwise query vectors and stored vectors are not comparable.
print("üì• Loading FinBERT model...")
embedder = SentenceTransformer("ProsusAI/finbert")

print("‚úÖ Model loaded!")
embedding_dim = embedder.get_sentence_embedding_dimension()
print(f"üìê Embedding dimension: {embedding_dim}")

## Step 5: Retrieve All Documents (Optional)

In [None]:
# Fetch the collection's contents with no filter and no limit.
all_data = collection.get()

# Report how many entries came back for each field of the result.
print(f"üìö Retrieved Data:")
for label, field in (("Documents", 'documents'), ("IDs", 'ids'), ("Metadata", 'metadatas')):
    print(f"   {label}: {len(all_data[field])}")

# Preview the first record, if the collection is not empty.
if all_data['documents']:
    print(f"\nüìÑ Example Document:")
    print(f"   ID: {all_data['ids'][0]}")
    print(f"   Metadata: {all_data['metadatas'][0]}")
    preview = all_data['documents'][0][:200]
    print(f"   Text (first 200 chars): {preview}...")

## Step 6: Search with Semantic Query

In [None]:
# Example: Search for relevant documents
query = "What are the main revenue sources?"

print(f"üîç Searching for: '{query}'\n")

# Generate embedding for the query.
# No explicit `device=`: encode() then runs on the device the model was
# loaded on, so this cell also works on machines without a CUDA GPU.
query_embedding = embedder.encode([query])

# Search ChromaDB (the collection expects plain lists, hence .tolist())
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=5  # Get top 5 results
)

# Display results. Each field in `results` holds one list per query; we sent
# a single query, so index [0] selects its hits.
print(f"üìä Found {len(results['documents'][0])} relevant documents:\n")
print("=" * 70)

for i, (doc, metadata, distance) in enumerate(zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0]
), 1):
    # Guard against records whose metadata is missing (None).
    metadata = metadata or {}
    print(f"\nResult {i}:")
    print(f"Company: {metadata.get('company', 'N/A')}")
    print(f"Section: {metadata.get('section', 'N/A')}")
    print(f"Year: {metadata.get('year', 'N/A')}")
    print(f"Distance: {distance:.4f}")
    print(f"Text: {doc[:300]}...")
    print("-" * 70)

## Step 7: Filter by Metadata

In [None]:
# Example: Search only in specific company or section
query = "What are the risk factors?"

print(f"üîç Searching with metadata filter: '{query}'\n")

# No explicit `device=`: encode() then runs on the device the model was
# loaded on, so this cell also works on machines without a CUDA GPU.
query_embedding = embedder.encode([query])

# Search with metadata filter
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=5,
    where={"section": "Risk Factors"}  # Filter by section
    # Other filter examples:
    # where={"company": "APPLE INC"}
    # where={"year": "2023"}
    # where={"source": "user_upload"}
)

print(f"üìä Found {len(results['documents'][0])} documents from Risk Factors section\n")

# One query was sent, so index [0] selects its hits.
for i, (doc, metadata) in enumerate(zip(
    results['documents'][0],
    results['metadatas'][0]
), 1):
    # Guard against records whose metadata is missing (None).
    metadata = metadata or {}
    print(f"\n{i}. {metadata.get('company', 'N/A')} ({metadata.get('year', 'N/A')})")
    print(f"   {doc[:200]}...")

## Step 8: Get Specific Documents by ID

In [None]:
# If you know specific document IDs you can fetch them directly.
# First, let's see what IDs exist. A dedicated name is used here so the
# `all_data` variable assigned in other cells is not replaced by a
# 10-record sample when cells are re-run out of order.
id_sample = collection.get(limit=10)  # Get first 10 records
sample_ids = id_sample['ids']

print("üìã First 10 Document IDs:")
for i, doc_id in enumerate(sample_ids, 1):
    print(f"   {i}. {doc_id}")

# Retrieve specific documents by ID (skipped when the collection is empty)
if sample_ids:
    specific_ids = sample_ids[:3]  # Get first 3

    specific_docs = collection.get(ids=specific_ids)

    print(f"\nüìÑ Retrieved {len(specific_docs['documents'])} specific documents")

## Step 9: View Collection Statistics

In [None]:
# Get comprehensive statistics
all_data = collection.get()

print("=" * 70)
print("   CHROMADB COLLECTION STATISTICS")
print("=" * 70)

print(f"\nTotal Documents: {collection.count()}")

# Analyze metadata
if all_data['metadatas']:
    # Treat a missing metadata entry (None) like an empty dict.
    metadatas = [m or {} for m in all_data['metadatas']]

    # Count chunks per company in a single pass. The same 'Unknown' default is
    # used for grouping and counting, so records without a company are
    # reported with their real count instead of 0.
    company_counts = Counter(m.get('company', 'Unknown') for m in metadatas)
    sections = {m.get('section', 'Unknown') for m in metadatas}
    years = {m.get('year', 'Unknown') for m in metadatas}
    sources = {m.get('source', 'Unknown') for m in metadatas}

    print(f"\nUnique Companies: {len(company_counts)}")
    print(f"Unique Sections: {len(sections)}")
    print(f"Unique Years: {len(years)}")
    # str() + sorted(): tolerate non-string values and keep the order stable.
    print(f"Sources: {', '.join(sorted(str(s) for s in sources))}")

    print(f"\nCompanies in database:")
    for company in sorted(company_counts, key=str)[:20]:  # Show first 20
        print(f"   ‚Ä¢ {company}: {company_counts[company]} chunks")

    if len(company_counts) > 20:
        print(f"   ... and {len(company_counts) - 20} more companies")

print("\n" + "=" * 70)

## Step 10: Ask Questions with OpenAI (RAG)

In [None]:
# Provide your OpenAI API key without writing it into the notebook.
# If OPENAI_API_KEY is already set in the environment it is used as-is;
# otherwise you are prompted for it (input is not echoed and not saved in
# the notebook). The old placeholder value is treated as "not set".
if os.getenv('OPENAI_API_KEY') in (None, '', 'your-api-key-here'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass("Enter your OpenAI API key: ")

client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def ask_question(question, top_k=5, model="gpt-3.5-turbo"):
    """
    Ask a question using RAG (Retrieval-Augmented Generation).

    Embeds the question with the notebook-level `embedder`, retrieves the
    `top_k` closest chunks from the notebook-level `collection`, and asks an
    OpenAI chat model (through the notebook-level `client`) to answer using
    only that retrieved context. Progress messages and the final answer are
    printed as a side effect.

    Args:
        question: Natural-language question to answer.
        top_k: Number of chunks to retrieve and pass to the model as context.
        model: Name of the OpenAI chat model used to generate the answer.

    Returns:
        The message content of the first choice in the chat completion
        response.
    """
    print(f"‚ùì Question: {question}\n")
    print("üîç Searching for relevant context...\n")
    
    # Generate query embedding.
    # No explicit `device=`: encode() then runs on the device the model was
    # loaded on, so this also works on machines without a CUDA GPU.
    query_embedding = embedder.encode([question])
    
    # Search ChromaDB
    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=top_k
    )
    
    # Build context: one labelled section per retrieved chunk. A single query
    # was sent, so index [0] selects its hits.
    context_parts = []
    for i, (doc, metadata) in enumerate(zip(
        results['documents'][0],
        results['metadatas'][0]
    ), 1):
        # Guard against records whose metadata is missing (None).
        metadata = metadata or {}
        source = f"{metadata.get('company', 'Unknown')} | {metadata.get('section', 'Unknown')}"
        context_parts.append(f"[Source {i}: {source}]\n{doc}")
    
    context = "\n\n---\n\n".join(context_parts)
    
    # Generate answer
    prompt = f"""You are an expert financial analyst.

Context from financial documents:
{context}

Question: {question}

Instructions:
1. Answer ONLY using information from the context
2. Cite which company and section you're using
3. Be specific with numbers and metrics
4. If information is not in context, say so

Your analysis:"""
    
    print("üí≠ Generating answer...\n")
    
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert financial analyst."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        max_tokens=800
    )
    
    answer = response.choices[0].message.content
    
    print("=" * 70)
    print("üìä ANSWER")
    print("=" * 70)
    print(answer)
    print("=" * 70)
    
    return answer

# Example usage: uncomment one of the lines below to run it. Each call
# queries the ChromaDB collection and sends a chat-completion request to the
# OpenAI API, printing and returning the answer.
# answer = ask_question("What are the main business activities of these companies?")
# answer = ask_question("Compare the revenue growth strategies")
# answer = ask_question("What are the key risk factors mentioned?")

print("‚úÖ RAG function ready! Use: ask_question('your question')")

## Quick Reference: Common Operations

```python
# 1. Get all documents
all_data = collection.get()

# 2. Get limited documents
limited_data = collection.get(limit=100)

# 3. Get specific documents by ID
specific = collection.get(ids=["id1", "id2"])

# 4. Semantic search
embedding = embedder.encode(["your query"])
results = collection.query(
    query_embeddings=embedding.tolist(),
    n_results=5
)

# 5. Search with metadata filter
results = collection.query(
    query_embeddings=embedding.tolist(),
    n_results=5,
    where={"company": "APPLE INC"}
)

# 6. Count documents
total = collection.count()

# 7. Delete documents by ID
collection.delete(ids=["id_to_delete"])

# 8. Update documents
collection.update(
    ids=["id1"],
    documents=["new text"],
    metadatas=[{"key": "value"}]
)
```