## Retrieval: Similarity Search

In [3]:
# from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings

import os
os.environ.setdefault("no_proxy", "127.0.0.1,localhost")
os.environ.setdefault("HTTPX_NO_PROXY", "127.0.0.1,localhost")

'127.0.0.1,localhost'

In [2]:
embedding = OllamaEmbeddings(model="llama3.2:3b")

In [4]:
vectorstore = Chroma(persist_directory = "./intro-to-ds-lectures", 
                     embedding_function = embedding)

In [5]:
added_document = Document(page_content='Alright! So… How are the techniques used in data, business intelligence, or predictive analytics applied in real life? Certainly, with the help of computers. You can basically split the relevant tools into two categories—programming languages and software. Knowing a programming language enables you to devise programs that can execute specific operations. Moreover, you can reuse these programs whenever you need to execute the same action', 
                          metadata={'Course Title': 'Introduction to Data and Data Science', 
                                    'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'})

In [6]:
vectorstore.add_documents([added_document])

['54dbd120-ec6c-4c24-bb0d-1133ddf0eddb']

In [7]:
question = "What programming languages do data scientists use?"

In [19]:
retrieved_docs = vectorstore.similarity_search(query = question, k = 5)
retrieved_docs

[Document(id='548dc1bb-d930-4ffe-87ce-ab02863b0a88', metadata={}, page_content='Moreover, you can reuse these programs whenever you need to execute the same action. As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable'),
 Document(id='917f7a30-694d-4105-91b5-4a1dfd56dbb4', metadata={}, page_content='Moreover, you can reuse these programs whenever you need to execute the same action. As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In 

In [20]:
for i in retrieved_docs:
    print(f"Page Content: {i.page_content}\n----------\nLecture Title:{i.metadata['Lecture Title']}\n")

KeyError: 'Lecture Title'

## Retrieval: Maximal Marginal Relevance Search

In [12]:
# from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [13]:
embedding = OllamaEmbeddings(model="llama3.2:3b")

In [14]:
vectorstore = Chroma(persist_directory = "./intro-to-ds-lectures", 
                     embedding_function = embedding)

# added_document = Document(page_content='Alright! So… How are the techniques used in data, business intelligence, or predictive analytics applied in real life? Certainly, with the help of computers. You can basically split the relevant tools into two categories—programming languages and software. Knowing a programming language enables you to devise programs that can execute specific operations. Moreover, you can reuse these programs whenever you need to execute the same action', 
#                         metadata={'Course Title': 'Introduction to Data and Data Science', 
#                                   'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'})

# vectorstore.add_documents([added_document])

In [15]:
question = "What software do data scientists use?"

In [16]:
retrieved_docs = vectorstore.max_marginal_relevance_search(
    query=question, 
    k=3, 
    lambda_mult = 1, 
    filter = {"Lecture Title": "Programming Languages & Software Employed in Data Science - All the Tools You Need"}
)

In [17]:
for i in retrieved_docs:
    print(f"Page Content: {i.page_content}\n----------\nLecture Title:{i.metadata['Lecture Title']}\n")

Page Content: Alright! So… How are the techniques used in data, business intelligence, or predictive analytics applied in real life? Certainly, with the help of computers. You can basically split the relevant tools into two categories—programming languages and software. Knowing a programming language enables you to devise programs that can execute specific operations. Moreover, you can reuse these programs whenever you need to execute the same action
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

