#### Tuesday, December 19, 2023

[Advanced RAG 01 - Self Querying Retrieval](https://www.youtube.com/watch?v=f4LeWlt3T8Y&t=9s)

https://colab.research.google.com/drive/1MEVagsVRce15lzd_KNb86oAJmQx_WyZp?usp=sharing

This uses OpenAI: Used/Expired $4.45 / $38.00

This all runs.

In [None]:
# Dependencies for this notebook. Uncomment and run once on a fresh environment (e.g. Colab).
# NOTE(review): versions are unpinned — consider pinning to the versions this notebook was last run with.
# !pip -q install langchain huggingface_hub openai google-search-results tiktoken chromadb lark

In [1]:
import os
from getpass import getpass

# Reuse a key that is already present in the environment so that
# "Restart & Run All" does not block on a prompt; otherwise ask for it
# with getpass so the key is not echoed into the notebook output.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("Enter your API key: ")

# Fail here rather than with a less obvious error at the first OpenAI call.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY is empty; an OpenAI API key is required.")

# Export the key through the environment.
# NOTE(review): presumably this is where the OpenAI-backed classes used later look it up — confirm.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
!pip show langchain

## Self-querying Retriever

In [2]:
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embedding model; it is handed to Chroma.from_documents when the vector store is built.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment variable set earlier — confirm.
embeddings = OpenAIEmbeddings()



## Example data with metadata attached

In [3]:
# Example corpus: one Document per wine.
#   page_content - short free-text tasting description
#   metadata     - structured fields (name, year, rating, grape, color, country)
docs = [
    Document(
        page_content="Complex, layered, rich red with dark fruit flavors",
        metadata={"name": "Opus One", "year": 2018, "rating": 96, "grape": "Cabernet Sauvignon", "color": "red", "country": "USA"},
    ),
    Document(
        page_content="Luxurious, sweet wine with flavors of honey, apricot, and peach",
        metadata={"name": "Château d'Yquem", "year": 2015, "rating": 98, "grape": "Sémillon", "color": "white", "country": "France"},
    ),
    Document(
        page_content="Full-bodied red with notes of black fruit and spice",
        metadata={"name": "Penfolds Grange", "year": 2017, "rating": 97, "grape": "Shiraz", "color": "red", "country": "Australia"},
    ),
    Document(
        page_content="Elegant, balanced red with herbal and berry nuances",
        metadata={"name": "Sassicaia", "year": 2016, "rating": 95, "grape": "Cabernet Franc", "color": "red", "country": "Italy"},
    ),
    Document(
        page_content="Highly sought-after Pinot Noir with red fruit and earthy notes",
        metadata={"name": "Domaine de la Romanée-Conti", "year": 2018, "rating": 100, "grape": "Pinot Noir", "color": "red", "country": "France"},
    ),
    Document(
        page_content="Crisp white with tropical fruit and citrus flavors",
        metadata={"name": "Cloudy Bay", "year": 2021, "rating": 92, "grape": "Sauvignon Blanc", "color": "white", "country": "New Zealand"},
    ),
    Document(
        page_content="Rich, complex Champagne with notes of brioche and citrus",
        # Champagne is produced in France; the country field must agree with the
        # description, otherwise country filters return this wine for the wrong country.
        metadata={"name": "Krug Grande Cuvée", "year": 2010, "rating": 93, "grape": "Chardonnay blend", "color": "sparkling", "country": "France"},
    ),
    Document(
        page_content="Intense, dark fruit flavors with hints of chocolate",
        metadata={"name": "Caymus Special Selection", "year": 2018, "rating": 96, "grape": "Cabernet Sauvignon", "color": "red", "country": "USA"},
    ),
    Document(
        page_content="Exotic, aromatic white with stone fruit and floral notes",
        metadata={"name": "Jermann Vintage Tunina", "year": 2020, "rating": 91, "grape": "Sauvignon Blanc blend", "color": "white", "country": "Italy"},
    ),
]

# Build the Chroma vector store from the documents using the embedding model.
# NOTE(review): re-running this cell in the same kernel may insert the documents
# a second time — confirm for the Chroma version in use.
vectorstore = Chroma.from_documents(docs, embeddings)

## Creating our self-querying retriever

In [4]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Schema of the metadata attached to each wine document, written as
# (name, description, type) rows and expanded into AttributeInfo objects.
metadata_field_info = [
    AttributeInfo(name=field, description=meaning, type=kind)
    for field, meaning, kind in (
        ("grape", "The grape used to make the wine", "string or list[string]"),
        ("name", "The name of the wine", "string or list[string]"),
        ("color", "The color of the wine", "string or list[string]"),
        ("year", "The year the wine was released", "integer"),
        ("country", "The name of the country the wine comes from", "string"),
        # Declared as integer; every rating in the example data is a whole number.
        ("rating", "The Robert Parker rating for the wine 0-100", "integer"),
    )
]

# Short description of what each document's page_content holds.
document_content_description = "Brief description of the wine"



In [5]:
# LLM handed to the self-querying retriever.
# NOTE(review): temperature=0 is presumably chosen to keep the generated
# structured queries as repeatable as possible — confirm.
llm = OpenAI(temperature=0)

# Build the retriever from the LLM, the Chroma vector store, the description of
# the document text, and the metadata schema declared in metadata_field_info.
# NOTE(review): verbose=True presumably logs the generated query/filter — confirm.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [6]:
# A plain question with no explicit numeric or country constraint.
# NOTE(review): `color` is a declared metadata field, so "red" may still be
# turned into a color filter rather than a purely semantic match — check the verbose output.
retriever.get_relevant_documents("What are some red wines")

[Document(page_content='Elegant, balanced red with herbal and berry nuances', metadata={'name': 'Sassicaia', 'year': 2016, 'rating': 95, 'grape': 'Cabernet Franc', 'color': 'red', 'country': 'Italy'}),
 Document(page_content='Complex, layered, rich red with dark fruit flavors', metadata={'name': 'Opus One', 'year': 2018, 'rating': 96, 'grape': 'Cabernet Sauvignon', 'color': 'red', 'country': 'USA'}),
 Document(page_content='Highly sought-after Pinot Noir with red fruit and earthy notes', metadata={'name': 'Domaine de la Romanée-Conti', 'year': 2018, 'rating': 100, 'grape': 'Pinot Noir', 'color': 'red', 'country': 'France'}),
 Document(page_content='Intense, dark fruit flavors with hints of chocolate', metadata={'name': 'Caymus Special Selection', 'year': 2018, 'rating': 96, 'grape': 'Cabernet Sauvignon', 'color': 'red', 'country': 'USA'})]

In [7]:

# Free-text request with no metadata constraint. The query previously read
# "fruity nodes", a typo for the tasting term "notes" used in the descriptions.
retriever.get_relevant_documents("I want a wine that has fruity notes")

[Document(page_content='Intense, dark fruit flavors with hints of chocolate', metadata={'name': 'Caymus Special Selection', 'year': 2018, 'rating': 96, 'grape': 'Cabernet Sauvignon', 'color': 'red', 'country': 'USA'}),
 Document(page_content='Crisp white with tropical fruit and citrus flavors', metadata={'name': 'Cloudy Bay', 'year': 2021, 'rating': 92, 'grape': 'Sauvignon Blanc', 'color': 'white', 'country': 'New Zealand'}),
 Document(page_content='Luxurious, sweet wine with flavors of honey, apricot, and peach', metadata={'name': "Château d'Yquem", 'year': 2015, 'rating': 98, 'grape': 'Sémillon', 'color': 'white', 'country': 'France'}),
 Document(page_content='Complex, layered, rich red with dark fruit flavors', metadata={'name': 'Opus One', 'year': 2018, 'rating': 96, 'grape': 'Cabernet Sauvignon', 'color': 'red', 'country': 'USA'})]

In [8]:
# This example specifies a query ("fruity notes") and a filter (rating above 97).
# The query previously read "fruity nodes", a typo for the tasting term "notes".
retriever.get_relevant_documents("I want a wine that has fruity notes and has a rating above 97")

[Document(page_content='Luxurious, sweet wine with flavors of honey, apricot, and peach', metadata={'name': "Château d'Yquem", 'year': 2015, 'rating': 98, 'grape': 'Sémillon', 'color': 'white', 'country': 'France'}),
 Document(page_content='Highly sought-after Pinot Noir with red fruit and earthy notes', metadata={'name': 'Domaine de la Romanée-Conti', 'year': 2018, 'rating': 100, 'grape': 'Pinot Noir', 'color': 'red', 'country': 'France'})]

In [9]:

# The question names only a country, which is one of the declared metadata fields.
retriever.get_relevant_documents("What wines come from Italy?")

[Document(page_content='Elegant, balanced red with herbal and berry nuances', metadata={'name': 'Sassicaia', 'year': 2016, 'rating': 95, 'grape': 'Cabernet Franc', 'color': 'red', 'country': 'Italy'}),
 Document(page_content='Exotic, aromatic white with stone fruit and floral notes', metadata={'name': 'Jermann Vintage Tunina', 'year': 2020, 'rating': 91, 'grape': 'Sauvignon Blanc blend', 'color': 'white', 'country': 'Italy'})]

In [10]:
# Query text ("earthy") combined with a composite filter: a lower and an upper bound on year.
retriever.get_relevant_documents("What's a wine after 2015 but before 2020 that's all earthy")

[Document(page_content='Elegant, balanced red with herbal and berry nuances', metadata={'name': 'Sassicaia', 'year': 2016, 'rating': 95, 'grape': 'Cabernet Franc', 'color': 'red', 'country': 'Italy'}),
 Document(page_content='Highly sought-after Pinot Noir with red fruit and earthy notes', metadata={'name': 'Domaine de la Romanée-Conti', 'year': 2018, 'rating': 100, 'grape': 'Pinot Noir', 'color': 'red', 'country': 'France'}),
 Document(page_content='Full-bodied red with notes of black fruit and spice', metadata={'name': 'Penfolds Grange', 'year': 2017, 'rating': 97, 'grape': 'Shiraz', 'color': 'red', 'country': 'Australia'}),
 Document(page_content='Complex, layered, rich red with dark fruit flavors', metadata={'name': 'Opus One', 'year': 2018, 'rating': 96, 'grape': 'Cabernet Sauvignon', 'color': 'red', 'country': 'USA'})]

## Filter K

We can also use the self-querying retriever to control `k`, the number of documents to fetch.

We can do this by passing `enable_limit=True` to `SelfQueryRetriever.from_llm` when building the retriever.

In [11]:
# Rebuild the retriever with enable_limit=True so that a count mentioned in the
# question can set the number of documents to fetch.
# NOTE: this rebinds `retriever`; any earlier query cell re-run after this one
# will use the limit-enabled retriever.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    enable_limit=True,
    verbose=True,
)

In [12]:
# This example specifies a filter (rating above 97) and a count ("two"),
# which with enable_limit=True is meant to cap the result at k = 2.
retriever.get_relevant_documents("what are two that have a rating above 97")

[Document(page_content='Luxurious, sweet wine with flavors of honey, apricot, and peach', metadata={'name': "Château d'Yquem", 'year': 2015, 'rating': 98, 'grape': 'Sémillon', 'color': 'white', 'country': 'France'}),
 Document(page_content='Highly sought-after Pinot Noir with red fruit and earthy notes', metadata={'name': 'Domaine de la Romanée-Conti', 'year': 2018, 'rating': 100, 'grape': 'Pinot Noir', 'color': 'red', 'country': 'France'})]

In [13]:
# A count ("two") combined with an either/or constraint on country.
# Note the query's casing ("australia", "New zealand") differs from the stored metadata values.
retriever.get_relevant_documents("what are two wines that come from australia or New zealand")

[Document(page_content='Crisp white with tropical fruit and citrus flavors', metadata={'name': 'Cloudy Bay', 'year': 2021, 'rating': 92, 'grape': 'Sauvignon Blanc', 'color': 'white', 'country': 'New Zealand'}),
 Document(page_content='Full-bodied red with notes of black fruit and spice', metadata={'name': 'Penfolds Grange', 'year': 2017, 'rating': 97, 'grape': 'Shiraz', 'color': 'red', 'country': 'Australia'})]