# Retrieval

## Import Libraries

In [1]:
import openai
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import SVMRetriever, TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Setting LLM and Embeddings

In [2]:
# Load the OpenAI API key from a file kept outside the notebook directory.
# The context manager closes the file handle as soon as the key is read, and
# strip() removes the trailing newline/whitespace that editors usually append,
# which would otherwise become part of the key string.
with open('../api_key.txt') as api_key_file:
    api_key = api_key_file.read().strip()

In [3]:
# OpenAI embeddings client, authenticated with the key loaded above.
embeddings = OpenAIEmbeddings(
    openai_api_key=api_key,
)

In [4]:
# Completion model for the LLM-backed retrievers; temperature is pinned to 0.
llm = OpenAI(
    temperature=0,
    openai_api_key=api_key,
)

## Retrieval

In [5]:
# Relative path handed to Chroma as its persist directory.
persist_directory = 'db/chroma/'

In [6]:
# Open the Chroma store located at persist_directory, using the OpenAI
# embeddings client as its embedding function.
vector_db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

In [7]:
# Sanity check: report how many entries the underlying collection holds.
stored_count = vector_db._collection.count()
print(stored_count)

209


### Maximum Marginal Relevance

Maximum marginal relevance strives to achieve both relevance to the query and diversity among the results.

In [8]:
# Three toy sentences: the first two overlap heavily in content, the third
# covers a different aspect (toxicity) of the same mushroom.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [9]:
# Build a small throwaway store from the toy sentences (no persist directory is given).
small_db = Chroma.from_texts(texts=texts, embedding=embeddings)

In [10]:
# Query that matches both "all-white" and "large fruiting body" sentences.
question = (
    "Tell me about all-white mushrooms with large fruiting bodies"
)

In [11]:
# Plain similarity search: ask for the two closest sentences.
small_db.similarity_search(query=question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).', metadata={})]

In [12]:
# MMR search: consider fetch_k=3 candidates and return k=2 of them.
small_db.max_marginal_relevance_search(query=question, k=2, fetch_k=3)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.', metadata={})]

#### Addressing Diversity

In the VectorStores and Embeddings notebook, we encountered one problem: how to enforce diversity in the search results.

In [13]:
# Query run against the lecture store in the two searches that follow.
question = (
    "What did they say about Matlab?"
)

**Using Similarity Search**

In [14]:
# Top-2 hits by plain similarity for the current question.
similarity_search_document = vector_db.similarity_search(query=question, k=2)

In [16]:
# Preview the first 100 characters of the top similarity hit.
similarity_search_document[0].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [17]:
# Preview the first 100 characters of the second similarity hit.
similarity_search_document[1].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

**Using Maximum Marginal Relevance**

Note the difference when using MMR: the second result is no longer a duplicate of the first.

In [18]:
# Same question and k as the similarity search, but selected with MMR.
maximum_marginal_relevance_document = vector_db.max_marginal_relevance_search(
    query=question, k=2
)

In [19]:
# Preview the first 100 characters of the top MMR hit.
maximum_marginal_relevance_document[0].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [20]:
# Preview the first 100 characters of the second MMR hit.
maximum_marginal_relevance_document[1].page_content[:100]

"many biologers are there here? Wow, just a few, not many. I'm surprised. Anyone from \nstatistics? Ok"

### Self Query Retriever

In [21]:
# Query whose wording ("third lecture") implies a metadata constraint.
question = (
    "what did they say about regression in the third lecture?"
)

In [22]:
# Hand-written metadata filter: only keep entries whose "source" is the
# Lecture 03 PDF.
third_lecture_filter = {"source": "documents/MachineLearning-Lecture03.pdf"}
docs = vector_db.similarity_search(query=question, k=3, filter=third_lecture_filter)

In [23]:
# Print the metadata dict of every retrieved document so the page/source of
# each hit can be checked against the filter used in the search.
for doc in docs:
    print(doc.metadata)

{'page': 0, 'source': 'documents/MachineLearning-Lecture03.pdf'}
{'page': 14, 'source': 'documents/MachineLearning-Lecture03.pdf'}
{'page': 4, 'source': 'documents/MachineLearning-Lecture03.pdf'}


**Addressing Specificity: working with metadata using self-query retriever**

In the search above, the metadata filter had to be written by hand. To avoid this, we can use `SelfQueryRetriever`, which uses an LLM to extract:

1. The query string to use for vector search
2. A metadata filter to pass in as well

Most vector databases support metadata filters, so this doesn't require any new databases or indexes.

In [24]:
# Describe the metadata fields the self-query retriever is allowed to filter on.
source_field = AttributeInfo(
    name="source",
    description="The lecture the chunk is from, should be one of `documents/MachineLearning-Lecture01.pdf`, `documents/MachineLearning-Lecture02.pdf`, or `documents/MachineLearning-Lecture03.pdf`",
    type="string",
)
page_field = AttributeInfo(
    name="page",
    description="The page from the lecture",
    type="integer",
)

metadata_field_info = [source_field, page_field]

In [None]:
# Short description of what the stored documents contain; passed to the
# retriever as `document_contents`.
document_content_description = "Lecture notes"

# Build the self-query retriever on top of the lecture store.
# NOTE: the store is named `vector_db` in this notebook; the previous
# `vectordb` reference was undefined and raised NameError on a fresh kernel.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vector_db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)