# Retrieval: Similarity Search

In [1]:
%load_ext dotenv
%dotenv

In [2]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings 
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [3]:
import os
api_key = os.getenv("GEMINI_API_KEY")

embedding = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=api_key
)

In [4]:
vectorstore = Chroma(persist_directory = "./intro-to-ds-lectures", 
                     embedding_function = embedding)

In [5]:
added_document = Document(page_content='Alright! So… How are the techniques used in data, business intelligence, or predictive analytics applied in real life? Certainly, with the help of computers. You can basically split the relevant tools into two categories—programming languages and software. Knowing a programming language enables you to devise programs that can execute specific operations. Moreover, you can reuse these programs whenever you need to execute the same action', 
                          metadata={'Course Title': 'Introduction to Data and Data Science', 
                                    'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'})

In [6]:
vectorstore.add_documents([added_document])

['cdc7c937-6add-4783-ab34-5c557b027e90']

In [7]:
question = "What programming languages do data scientists use?"

In [8]:
retrieved_docs = vectorstore.similarity_search(query = question, 
                                               k = 5)

In [9]:
retrieved_docs

[Document(id='926358a8-24e0-4b18-85c0-9a074afe8603', metadata={'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need', 'Course Title': 'Introduction to Data and Data Science'}, page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'),
 Document(id='e81da08d-7fb1-4b32-8e19-9d0e288b12b9', metadata={'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need', 'Course Title': 'Introduction to Data and Data Science'}, page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'),
 Document(id='0920f713-7241-4ec2-807f-3835fcda14a3', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programmin

In [10]:
for i in retrieved_docs:
    print(f"Page Content: {i.page_content}\n----------\nLecture Title:{i.metadata['Lecture Title']}\n")

Page Content: Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: What about big data? Apart from R and Python, people working in this area are often proficient in other languages like Java or Scala. These two have not been developed specifically for doing statistical analyses, however they turn out to be very useful when combining data from multiple sources. All right! Let’s finish off with machine learning. When it comes to machin