# **Setting Up Chroma Cloud API Key**

In [None]:
from google.colab import userdata
CHROMA_API_KEY=userdata.get('CHROMA_API_KEY')

# **Sample Testing of chroma DB cloud**

In [None]:
!pip install chromadb

In [3]:
import chromadb

client = chromadb.CloudClient(
  api_key=CHROMA_API_KEY,
  tenant='20406c6d-8908-4b88-9823-4e6f3da11854',
  database='dev_db'
)

In [4]:
!pip install -q transformers sentence-transformers chromadb


In [5]:
# Sentence transformer library is used to create word embeddings that captures semantic meanings of the text
from sentence_transformers import SentenceTransformer


In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Sentence-BERT model

# Example text to generate embeddings for
texts = [
    "This is an example sentence.",
    "Chroma DB stores embeddings efficiently.",
    "Hugging Face provides pre-trained models for embeddings."
]

In [15]:
embeddings = model.encode(texts)

In [9]:
collection = client.create_collection(name="my_embeddings")

In [11]:
# Add the embeddings to Chroma DB
for i, embedding in enumerate(embeddings):
    collection.add(
        ids=[str(i)],                    # Unique ID for each document (you can use any unique string)
        metadatas=[{"text": texts[i]}],  # Optional metadata
        documents=[texts[i]],            # Text associated with embedding
        embeddings=[embedding]           # The actual embedding vector
    )

print("Embeddings successfully stored in Chroma DB.")


Embeddings successfully stored in Chroma DB.


In [16]:
# Now you can query the stored embeddings
query = "What is Chroma DB?"
query_embedding = model.encode([query])  # Generate embedding for the query

# Search for the most similar embeddings in the collection
results = collection.query(
    query_embeddings=query_embedding,
    n_results=1  # number of similar results you want
)

print("Query Results:", results)

Query Results: {'ids': [['1']], 'distances': [[0.6530509]], 'embeddings': None, 'metadatas': [[{'text': 'Chroma DB stores embeddings efficiently.'}]], 'documents': [['Chroma DB stores embeddings efficiently.']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


# **Tutorial Follow Up**

In [None]:
!pip install -q langchain langchain-community langchain-text-splitters chromadb requests

In [None]:
!pip install --upgrade langchain langchain_community

In [5]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [6]:
!unzip -q new_articles.zip -d new_articles

# **Setting up Environment**

In [None]:
# Gemini API setup
GEMINI_API_KEY = userdata.get("GEMINI_API_KEY")
GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-001:embedContent"


# **Importing Libraries**

In [None]:
import requests
from typing import List
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# **Loading Data**

In [9]:
loader = DirectoryLoader("/content/new_articles/", glob = "./*.txt", loader_cls= TextLoader)

In [10]:
document = loader.load()

In [None]:
document

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [12]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
text = text_splitter.split_documents(document)

In [None]:
text

In [17]:
len(text)

233

In [27]:
text[4]

Document(metadata={'source': '/content/new_articles/05-04-microsoft-doubles-down-on-ai-with-new-bing-features.txt'}, page_content='In another knock against Microsoft, the company just a few months ago laid off the ethics and society team within its larger AI organization. The move left Microsoft without a dedicated team to ensure its AI principles are closely tied to product design.\n\nBird, though, asserts that meaningful progress has been made and that these sorts of AI issues aren’t solved overnight — public though Bing Chat may be. Among other measures, a team of human moderators is in place to watch for abuse, she said, such as users attempting to use Bing Chat to generate phishing emails.\n\nBut — as members of the press weren’t given the chance to interact with the latest version of Bing beyond curated demos — I can’t say to what extent all that’s made a difference. It’ll doubtless become clear once more folks get their hands on it.')

# **Getting the embeddings using gemini and storing them into ChromaDB**

In [None]:
# Custom Gemini Embedding class
class GeminiEmbedding(Embeddings):
    def __init__(self, api_key: str):
        self.api_key = api_key

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self._embed(text)

    def _embed(self, text: str) -> List[float]:
        headers = {"Content-Type": "application/json"}
        params = {"key": self.api_key}
        payload = {"model": "models/gemini-embedding-001", "content": {"parts":[{"text": text}]}}
        response = requests.post(GEMINI_API_URL, json=payload, headers=headers, params=params)

        if response.status_code != 200:
            raise Exception(f"Gemini API error: {response.text}")

        data = response.json()
        return data["embedding"]["values"]

# Setup
embedding = GeminiEmbedding(api_key=GEMINI_API_KEY)

# Create Chroma DB
persist_directory = "db"
vectordb = Chroma.from_documents(documents=document, embedding=embedding, persist_directory=persist_directory)

print("Documents embedded into Chroma using Gemini")


# **Raw Query without LLM**

In [29]:
query = "How much money did Microsoft raise?"
results = vectordb.similarity_search(query, k=3) # retrieves the top k=3 most similar chunks.
for r in results:
    print(r.page_content[:200])


It’s that time of week again, folks — Week in Review (WiR) time. For those new to the scene, WiR is TechCrunch’s regular newsletter that recaps the biggest tech stories over the past few days. There’s
Microsoft doubles down on AI with new Bing features The company's betting the farm on generative AI

Microsoft is embarking on the next phase of Bing’s expansion. And — no surprise — it heavily revolv
The best way to avoid a down round is to found an AI startup

As we see unicorns slash staff and the prevalence of down rounds spike, it may seem that the startup ecosystem is chock-full of bad news a


# **RAG Pipeline**
It's like Search the knowledge base, then have Gemini explain the answer in natural language, citing the docs

In [18]:
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [32]:
retriever.search_type

'similarity'

In [33]:
retriever.search_kwargs

{'k': 3}

In [20]:
!pip install -q langchain-google-genai


In [21]:

from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GEMINI_API_KEY)

In [22]:
from langchain.chains import RetrievalQA

# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)


In [30]:
query = "How much money did Microsoft raised?"
response = qa_chain.invoke({"query": query})

print("Answer:", response["result"])
print("\nSources:")
for doc in response["source_documents"]:
    print("-", doc.metadata.get("source"))


Answer: This article doesn't state how much money Microsoft raised.  It discusses Microsoft's investments in AI and the expansion of Bing, but not funding rounds for Microsoft itself.

Sources:
- /content/new_articles/05-06-amazon-launches-free-channels-check-marks-come-to-gmail-and-openai-raises-more-moolah.txt
- /content/new_articles/05-04-microsoft-doubles-down-on-ai-with-new-bing-features.txt
- /content/new_articles/05-06-ai-startups-q1-investments.txt
