<a href="https://colab.research.google.com/github/springboardmentor1234x-stack/Internal-Chatbot-with-RBAC/blob/Reethika-A/reethika_rbac_embeding_chunking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import chromadb
from sentence_transformers import SentenceTransformer
import uuid

text_1 = """Quarterly Financial Report - FinSolve Technologies Inc. 2024 Executive Summary
In 2024, FinSolve Technologies Inc. delivered exceptional financial performance,
achieving significant year-over-year YoY growth across all quarters. Revenue increased
from $2.1 billion in Q1 to $2.6 billion in Q4."""

text_2 = """fueled by strong performance in existing markets and early traction in Southeast Asia.
Gross Margin 60%, driven by higher-margin product offerings and operational efficiencies.
Net Income $275 million, up 12% YoY."""

text_3 = """focused on securing long-term capital for growth. Risks Mitigation Risk Vendor cost
inflation in Latin America. Mitigation Renegotiated terms with local suppliers to secure
favorable pricing and reduce costs."""

documents = [text_1, text_2, text_3]
ids = ["finsolve_001", "finsolve_002", "finsolve_003"]

metadatas = [
    {"source": "financial_report", "department": "Finance", "access_roles": "executive_leadership, finance_manager, senior_auditor, board_member"},
    {"source": "financial_report", "department": "Finance", "access_roles": "executive_leadership, finance_manager, senior_auditor, board_member"},
    {"source": "financial_report", "department": "Finance", "access_roles": "executive_leadership, finance_manager, senior_auditor, board_member"}
]

print("1. Generating Embeddings...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(documents)
print(f"   - Embedding Vector Shape: {embeddings.shape}")
print(f"   - First 10 values of Chunk 1: {embeddings[:10]}\n")

print("2. Initializing ChromaDB Client and Collection...")
client = chromadb.Client()
collection_name = "Finance_Department"

try:
    client.delete_collection(name=collection_name)
    print(f"   - Deleted existing collection '{collection_name}'.")
except ValueError:
    print(f"   - Collection '{collection_name}' does not exist, creating new one.")
    pass

collection = client.create_collection(name=collection_name)
print(f"   - Collection '{collection.name}' created successfully.")

print("3. Inserting Data with Embeddings and Metadata...")
collection.add(
    ids=ids,
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas
)
print(f"   - Successfully inserted {collection.count()} documents.\n")

query_text = "What are the risks regarding vendor inflation?"
print(f"4. Querying: '{query_text}'")

query_embedding = model.encode([query_text])

results = collection.query(
    query_embeddings=query_embedding,
    n_results=1
)

print("\n--- Query Results ---")
print(f"Matched ID: {results['ids']}")
print(f"Metadata:   {results['metadatas']}")
print(f"Content:    {results['documents']}")

1. Generating Embeddings...
   - Embedding Vector Shape: (3, 384)
   - First 10 values of Chunk 1: [[ 0.00320291 -0.02146034  0.03344022 ... -0.06450392  0.06353835
   0.00875066]
 [ 0.0381811  -0.03147685 -0.00432307 ... -0.11669988  0.00779852
  -0.07578726]
 [ 0.02215847  0.01153829 -0.02211834 ... -0.13942073  0.06690983
   0.02842092]]

2. Initializing ChromaDB Client and Collection...
   - Deleted existing collection 'Finance_Department'.
   - Collection 'Finance_Department' created successfully.
3. Inserting Data with Embeddings and Metadata...
   - Successfully inserted 3 documents.

4. Querying: 'What are the risks regarding vendor inflation?'

--- Query Results ---
Matched ID: [['finsolve_003']]
Metadata:   [[{'access_roles': 'executive_leadership, finance_manager, senior_auditor, board_member', 'source': 'financial_report', 'department': 'Finance'}]]
Content:    [['focused on securing long-term capital for growth. Risks Mitigation Risk Vendor cost\ninflation in Latin America