# Retrieval: Vectorstore-Backed Retriever

In [1]:
%load_ext dotenv
%dotenv

In [2]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings 
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [3]:
import os

# Folder holding the persisted Chroma collection, relative to the notebook's
# working directory.
PERSIST_DIRECTORY = "./intro-to-ds-lectures"

# The API key is read from the environment (the %dotenv cell above is expected
# to have loaded it from a local .env file); it is never hard-coded here.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    # os.getenv returns None for a missing variable; stop here with a clear
    # message instead of handing None to the embedding client.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Add it to your .env file (or export it) "
        "and re-run the notebook from the top."
    )

# Must be the same embedding model that was used when the collection was
# built, otherwise query vectors and stored vectors are not comparable.
embedding = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=api_key,
)

# NOTE: a missing directory is not an error for Chroma - it starts a fresh,
# empty store at that path - so a wrong working directory would make every
# later retrieval come back empty without any exception. Check explicitly.
if not os.path.isdir(PERSIST_DIRECTORY):
    raise FileNotFoundError(
        f"Chroma persist directory {PERSIST_DIRECTORY!r} was not found "
        f"(current working directory: {os.getcwd()!r}). Build the vector "
        "store first, or start the notebook from the folder that contains it."
    )

vectorstore = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embedding,
)

In [4]:
# Sanity check: how many document chunks does the persisted collection hold?
stored_records = vectorstore.get()
len(stored_records['documents'])

41

In [5]:
# Search settings for the retriever: return 3 documents per query, with
# lambda_mult tuning the MMR search (see LangChain's `as_retriever`
# documentation for the exact meaning of the value).
mmr_search_kwargs = {'k': 3, 'lambda_mult': 0.7}

retriever = vectorstore.as_retriever(search_type='mmr', search_kwargs=mmr_search_kwargs)

In [6]:
# Display the retriever's repr to confirm the search type and search kwargs.
retriever

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000001B16544CD70>, search_type='mmr', search_kwargs={'k': 3, 'lambda_mult': 0.7})

In [7]:
# Natural-language query that is passed to the retriever below.
question = "What software do data scientists use?"

In [8]:
# Run the query through the retriever; the result is a list of Document
# objects (displayed in the following cell).
retrieved_docs = retriever.invoke(question)

In [9]:
# Inspect the raw Document objects (id, metadata and page_content).
retrieved_docs

[Document(id='926358a8-24e0-4b18-85c0-9a074afe8603', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'),
 Document(id='aed9fd41-2ba1-4216-b95b-007a2eba891f', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='It’s actually a software framework which was designed to address the complexity of big data and its computational intensity. Most notably, Hadoop distributes the computational tasks on multiple computers which is basically the way to handle big data nowadays. Power BI, SaS, Qlik, and especially Tableau are top-notch examples of softwa

In [10]:
# Print each retrieved chunk followed by the title of the lecture it came from.
for doc in retrieved_docs:
    content = doc.page_content
    lecture = doc.metadata['Lecture Title']
    print(f"Page Content: {content}\n----------\nLecture Title:{lecture}\n")

Page Content: Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: It’s actually a software framework which was designed to address the complexity of big data and its computational intensity. Most notably, Hadoop distributes the computational tasks on multiple computers which is basically the way to handle big data nowadays. Power BI, SaS, Qlik, and especially Tableau are top-notch examples of software designed for business intelligence visualizations
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: Analytics is essentially the application of logical and computational reasoning to the component parts obtained in an analysis. And in doing this you are looking for 