# Generation: Stuffing Documents

In [1]:
# Load the python-dotenv IPython extension, then read the project's .env file
# into the process environment. The os.getenv("GEMINI_API_KEY") calls in later
# cells rely on this (presumably the key is defined in that .env file — confirm).
%load_ext dotenv
%dotenv

In [2]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings 
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings 
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain_core.output_parsers import StrOutputParser

In [6]:
import os

# Gemini API key, read from the environment (None when the variable is unset).
api_key = os.environ.get("GEMINI_API_KEY")

# Embedding client handed to the vector store.
# NOTE(review): this presumably has to be the same embedding model that was
# used when the persisted collection was built — confirm.
embedding = GoogleGenerativeAIEmbeddings(
    google_api_key=api_key,
    model="models/embedding-001",
)

# Chroma store backed by the local "./intro-to-ds-lectures" directory.
vectorstore = Chroma(
    embedding_function=embedding,
    persist_directory="./intro-to-ds-lectures",
)

In [7]:
# Sanity check: how many document texts the store currently holds.
stored_documents = vectorstore.get()["documents"]
len(stored_documents)

41

In [8]:
# Retrieve with search_type "mmr", returning k=3 documents per query.
# lambda_mult is the MMR relevance/diversity knob (see the as_retriever docs
# for its exact direction); 0.7 is the value used throughout this notebook.
mmr_search_kwargs = {"k": 3, "lambda_mult": 0.7}

retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)

In [9]:
# Prompt text with two placeholders: {question} (the user's question) and
# {context} (whatever the retriever step supplies). The trailing instruction
# asks the model to list the source lecture titles after its answer.
TEMPLATE = """
Answer the following question:
{question}

To answer the question, use only the following context:
{context}

At the end of the response, specify the name of the lecture this context is taken from in the format:
Resources: *Lecture Title*
where *Lecture Title* should be substituted with the title of all resource lectures.
"""

# Build the prompt object; the input variables are inferred from the braces.
prompt_template = PromptTemplate.from_template(template=TEMPLATE)

In [11]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [12]:
# Gemini chat model configuration.
#   temperature=0         -> least-random sampling setting
#   max_output_tokens=100 -> hard cap on the length of each reply
#   model_kwargs seed     -> NOTE(review): presumably meant to make runs
#                            reproducible; confirm the installed
#                            langchain-google-genai version forwards it.
chat = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=os.environ.get("GEMINI_API_KEY"),
    temperature=0,
    max_output_tokens=100,
    model_kwargs={"seed": 365},
)

In [13]:
# Example user question that is fed through the chain below.
question = 'What software do data scientists use?'

In [14]:
# First step fans the input string out to both prompt variables:
#   context  <- documents returned by the retriever for the input
#   question <- the input itself, passed through unchanged
# The resulting mapping is then piped into the prompt template.
prompt_inputs = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
)

chain = prompt_inputs | prompt_template

In [15]:
# Run retrieval + prompt formatting for the example question and display the
# resulting prompt value (no chat model is involved at this stage).
prompt_value = chain.invoke(question)
prompt_value

StringPromptValue(text="\nAnswer the following question:\nWhat software do data scientists use?\n\nTo answer the question, use only the following context:\n[Document(id='926358a8-24e0-4b18-85c0-9a074afe8603', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'), Document(id='aed9fd41-2ba1-4216-b95b-007a2eba891f', metadata={'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need', 'Course Title': 'Introduction to Data and Data Science'}, page_content='It’s actually a software framework which was designed to address the complexity of big data and its computational intensity. Most notably, Hadoop distributes the computational tasks

In [16]:
# Print the fully formatted prompt so its line breaks render and it can be read
# as the model would receive it. The text is taken from a live chain invocation
# rather than from a hand-pasted copy of an earlier output: the pasted literal
# embedded run-specific document ids and was split across two physical lines
# inside a single-quoted string, so it could not survive Restart & Run All.
# Note: the retrieved documents are interpolated with their raw list/Document
# repr here, because the chain does not format the context yet.
print(chain.invoke(question).to_string())


Answer the following question:
What software do data scientists use?

To answer the question, use only the following context:
[Document(id='926358a8-24e0-4b18-85c0-9a074afe8603', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'), Document(id='aed9fd41-2ba1-4216-b95b-007a2eba891f', metadata={'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need', 'Course Title': 'Introduction to Data and Data Science'}, page_content='It’s actually a software framework which was designed to address the complexity of big data and its computational intensity. Most notably, Hadoop distributes the computational tasks on multiple computers which 