In [3]:
# setting the environment variables, the keys
import sys
import os

%load_ext autoreload
%autoreload 2

sys.path.insert(0, os.path.abspath('..'))

from config import set_environment
# for the keys - as explained early in chapter 2
set_environment("GOOGLE_API_KEY")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings_model = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")


In [5]:
text1 = "The cat is on the mat"
text2 = "A feeline rested on the carpet"
text3 = "Python is programming language"

embeddings = embeddings_model.embed_documents([text1, text2, text3])

print(f"Number of documents: {len(embeddings)}")
print(f"Dimensions per embedding: {len(embeddings[0])}")

Number of documents: 3
Dimensions per embedding: 3072


In [6]:

embeddings_model_text_004 = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
text1 = "The cat is on the mat"
text2 = "A feeline rested on the carpet"
text3 = "Python is programming language"

embeddings_004 = embeddings_model_text_004.embed_documents([text1, text2, text3])

print(f"Number of documents: {len(embeddings_004)}")
print(f"Dimensions per embedding: {len(embeddings_004[0])}")

Number of documents: 3
Dimensions per embedding: 768


## Vector Setup

In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [13]:
embeddings_vec_model = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Create some sample documents with explicit IDs
docs = [
    Document(page_content="Content about language models", metadata={"id": "doc_1"}),
    Document(page_content="Information about vector databases", metadata={"id": "doc_2"}),
    Document(page_content="Details about retrieval systems", metadata={"id": "doc_3"})
]

vector_store = Chroma(embedding_function = embeddings_vec_model)

vector_store.add_documents(docs)

result = vector_store.similarity_search("How do retreival system works", k = 2)
print(result)

[Document(id='a6c952ef-1451-41ce-a2cb-e50b7ceadd37', metadata={'id': 'doc_3'}, page_content='Details about retrieval systems'), Document(id='efad16d1-c74a-4630-a390-bab45ac95bfb', metadata={'id': 'doc_3'}, page_content='Details about retrieval systems')]


In [14]:
found_docs = vector_store.similarity_search("retrieval", k=1)
print(f"Found documents: {len(found_docs)}")

Found documents: 1


In [16]:
used_doc_ids = {doc.metadata["id"] for doc in found_docs}
print(used_doc_ids)

{'doc_3'}


In [17]:
remaining_docs = [doc for doc in docs if doc.metadata["id"] not in used_doc_ids]
print(f"Remaining documents available for MMR: {len(remaining_docs)}")

Remaining documents available for MMR: 2


In [23]:
if len(remaining_docs) > 0:
    mmr_results = vector_store.max_marginal_relevance_search(
        "retrieval systems",
        k=5,  # Only request what's available
        fetch_k=1,  # Only fetch what's available
        lambda_mult=0.5
    )
    print(mmr_results)

[Document(id='a0df51ce-8b3d-4083-bbeb-f71f37fd8034', metadata={'id': 'doc_3'}, page_content='Details about retrieval systems')]
