## Thursday, April 11, 2024

mamba activate langchain3

In [1]:
# Example: reuse your existing OpenAI setup
from openai import OpenAI

# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

completion = client.chat.completions.create(
  model="TheBloke/Nous-Hermes-2-SOLAR-10.7B-GGUF/nous-hermes-2-solar-10.7b.Q8_0.gguf",
  messages=[
    {"role": "system", "content": "Always answer in rhymes."},
    {"role": "user", "content": "Introduce yourself."}
  ],
  temperature=0.7,
)

print(completion.choices[0].message)

ChatCompletionMessage(content="Hello there! I'm your guide, and you may know me well,\nFor I assist you with a rhyme, whenever you compel.", role='assistant', function_call=None, tool_calls=None)


In [2]:
# only target the 4090 ...
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

Document Loaders

In [3]:
from langchain.document_loaders import TextLoader

In [4]:
loader = TextLoader("../data/vocab.txt")
document = loader.load()

In [5]:
from langchain.document_loaders.csv_loader import CSVLoader

# id|custom_title|stubhub_title|vividseats_title
loader = CSVLoader(file_path='../data/titledata.csv', csv_args={
    'delimiter': '|',
    'quotechar': '"',
    'fieldnames': ['id', 'custom_title', 'stubhub_title', 'vividseats_title']})
document = loader.load()

First time running the below we get the error ...

* ImportError: pypdf package not found, please install it with `pip install pypdf`

In [None]:
# mamba install conda-forge::pypdf

In [6]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("../data/Llama Getting Started Guide.pdf")
pages = loader.load_and_split()

First time running the next cell we got the error ...

* ValueError: Did not find mathpix_api_key, please add an environment variable `MATHPIX_API_KEY` which contains it, or pass `mathpix_api_key` as a named parameter.

Signed up, but then realized there is no free version of this, so yeah, bye bye MathPix ... !

In [7]:
from langchain.document_loaders import MathpixPDFLoader

loader = MathpixPDFLoader("../data/Llama Getting Started Guide.pdf")
data = loader.load()

ValueError: Did not find mathpix_api_key, please add an environment variable `MATHPIX_API_KEY` which contains it, or pass `mathpix_api_key` as a named parameter.

First time running the next cell produces the error ...

* ImportError: `pdfminer` package not found, please install it with `pip install pdfminer.six`

In [None]:
# mamba install conda-forge::pdfminer
# mamba install conda-forge::pdfminer.six

In [8]:
from langchain.document_loaders import PDFMinerLoader

loader = PDFMinerLoader("../data/Llama Getting Started Guide.pdf")
data = loader.load()

In [9]:
from langchain.document_loaders import PDFMinerPDFasHTMLLoader

loader = PDFMinerPDFasHTMLLoader("../data/Llama Getting Started Guide.pdf")
data = loader.load()

Document Transformers

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# This is a long document we can split up.
with open('../data/state_of_the_union.txt') as f:
    state_of_the_union = f.read()
    
len(state_of_the_union)

38540

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,
)

texts = text_splitter.create_documents([state_of_the_union])
print(texts[0])
print(texts[1])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' metadata={'start_index': 0}
page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' metadata={'start_index': 82}


In [12]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

texts = text_splitter.create_documents([state_of_the_union])
print(texts[0])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'


In [13]:
# This is a long document we can split up.
with open('../data/index.html') as f:
    html_string = f.read()

First time running the next cell produced the error ...

* ImportError: Unable to import lxml, please install with `pip install lxml`.

In [None]:
# pip install lxml

In [14]:
from langchain.text_splitter import HTMLHeaderTextSplitter

# html_string = "Your HTML content here..."
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)
print(html_header_splits[0])

page_content='API Core Experimental Python Docs  \nToggle Menu  \nPrev Up Next  \nLangChain 0.0.339rc1  \nAll modules for which code is available'


In [15]:
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter

url = "https://example.com"
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text_from_url(url)

chunk_size = 500
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
splits = text_splitter.split_documents(html_header_splits)


In [16]:
print(splits[0])

page_content='Example Domain'


In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

python_code = """
def hello_world():
    print("Hello, World!")
hello_world()
"""

In [18]:
# python_splitter = RecursiveCharacterTextSplitter.from_language(
#     language=Language.PYTHON, chunk_size=50
# )

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=200
)

In [19]:
python_docs = python_splitter.create_documents([python_code])
print(python_docs[0])

page_content='def hello_world():\n    print("Hello, World!")\nhello_world()'


In [20]:
js_code = """
function helloWorld() {
  console.log("Hello, World!");
}
helloWorld();
"""

In [21]:
# js_splitter = RecursiveCharacterTextSplitter.from_language(
#     language=Language.JS, chunk_size=60
# )

js_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.JS, chunk_size=200
)

In [22]:
js_docs = js_splitter.create_documents([js_code])
print(js_docs[0])

page_content='function helloWorld() {\n  console.log("Hello, World!");\n}\nhelloWorld();'


Running this next next cell for the first time generates the error ...

* ImportError: Could not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.

In [23]:
# mamba install conda-forge::tiktoken

In [24]:

from langchain.text_splitter import TokenTextSplitter

# text_splitter = TokenTextSplitter(chunk_size=10)
text_splitter = TokenTextSplitter(chunk_size=200)

Running this next cell sucks up all 32gb of ram, then starts sucking up the swap space until it too maxes out .. had to kill the kernel (around the 4 minute mark). 

It also jumps around 100% usage on a single CPU core ... 

So yeah, you probably do not want to run this next cell!

In [None]:
texts = text_splitter.split_text(state_of_the_union)

In [None]:
print(texts[0])

Text Embedding Models

We are going to work with [HuggingFace Embeddings](https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub/)

In [25]:
from langchain_community.embeddings import HuggingFaceEmbeddings

The next cell threw the error ...

* ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.

In [None]:
# mamba install conda-forge::sentence-transformers

In [26]:
# The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster and still offers good quality.
HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
embeddings = HuggingFaceEmbeddings()

In [27]:
embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')

In [28]:
text = "This is a test document."

In [29]:
query_result = embeddings.embed_query(text)

In [30]:
query_result[:3]

[-0.04895174130797386, -0.03986193612217903, -0.021562788635492325]

In [31]:
documents = ["The sky is blue", "The sun is bright", "The sun in the sky is bright"]
document_embeddings = embeddings.embed_documents(documents)


In [32]:
document_embeddings[0][:10]

[-0.0022825205232948065,
 -0.034933365881443024,
 0.002956187818199396,
 -0.04306314140558243,
 -0.03997139632701874,
 -0.035806622356176376,
 -0.03606832027435303,
 -0.009458038955926895,
 -0.05495908111333847,
 0.030772054567933083]

Sentence Transformers

https://sbert.net/docs/pretrained_models.html

I added this to show how to use sentence-transformers outside of HuggingFaceEmbeddings.

The following models have been specifically trained for Semantic Search: Given a question / search query, these models are able to find relevant text passages. For more details, see Usage - Semantic Search.

[all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)

In [33]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences)
print(embeddings)


[[ 0.02250259 -0.07829171 -0.02303074 ... -0.00827929  0.02652689
  -0.00201898]
 [ 0.04170233  0.00109744 -0.0155342  ... -0.02181628 -0.0635936
  -0.00875288]]


In [34]:
document_embeddings = model.encode(documents)


In [35]:
# same as above!
document_embeddings[0][:10]

array([-0.00228252, -0.03493337,  0.00295619, -0.04306314, -0.0399714 ,
       -0.03580662, -0.03606832, -0.00945804, -0.05495908,  0.03077205],
      dtype=float32)

In [36]:
# https://sbert.net/docs/pretrained_models.html

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

query_embedding = model.encode("How big is London")
passage_embedding = model.encode([
    "London has 9,787,426 inhabitants at the 2011 census",
    "London is known for its finacial district",
])

print("Similarity:", util.dot_score(query_embedding, passage_embedding))

# 0.5s .... after it has already been downloaded
# 1m 195s .... first download

Similarity: tensor([[0.5472, 0.6330]])


Vector Stores

Using the example langchain chroma code found at [Chroma](https://python.langchain.com/docs/integrations/vectorstores/chroma/)

In [37]:
from langchain.vectorstores import Chroma

As expected, running the next cell for the first time generates an error for Chroma ...

* ImportError: Could not import chromadb python package. Please install it with `pip install chromadb`.

In [38]:
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

In [39]:
# load the document and split it into chunks
loader = TextLoader("../data/state_of_the_union.txt")
documents = loader.load()

In [40]:
# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

[Sentence Transformers on HuggingFace](https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers/)

* from langchain_community.embeddings import HuggingFaceEmbeddings
* from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

The above 2 different imports are the same!!


In [41]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# The above can ALSO be expressed as ... Confusing, right!? ...  !! Why would they change this???
# embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [42]:
# load it into Chroma
dbChroma = Chroma.from_documents(docs, embedding_function)

In [43]:
# query it
query = "What did the president say about Ketanji Brown Jackson"
docs = dbChroma.similarity_search(query)

In [44]:
# print results
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


The following cells are from the LangChain docs [Facebook AI Similarity Search (Faiss)](https://python.langchain.com/docs/integrations/vectorstores/faiss/)

In [None]:
# mamba install conda-forge::faiss

In [45]:
from langchain_community.vectorstores import FAISS

Use the same docs and embedding_function from the above Chroma example.

In [46]:
dbFAISS = FAISS.from_documents(docs, embedding_function)

In [47]:
print(dbFAISS.index.ntotal)

4


In [48]:
query = "What did the president say about Ketanji Brown Jackson"
docs = dbFAISS.similarity_search(query)

In [49]:
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


As a Retriever

We can also convert the vectorstore into a Retriever class. This allows us to easily use it in other LangChain methods, which largely work with retrievers

In [50]:
retriever = dbFAISS.as_retriever()
docs = retriever.invoke(query)

In [51]:
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


Similarity Search with score

There are some FAISS specific methods. One of them is similarity_search_with_score, which allows you to return not only the documents but also the distance score of the query to them. The returned distance score is L2 distance. Therefore, a lower score is better.

In [52]:
docs_and_scores = dbFAISS.similarity_search_with_score(query)

In [53]:
docs_and_scores[0]

(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../data/state_of_the_union.txt'}),
 1.1972051)

It is also possible to do a search for documents similar to a given embedding vector using similarity_search_by_vector which accepts an embedding vector as a parameter instead of a string.



In [54]:
embedding_vector = embedding_function.embed_query(query)
docs_and_scores = dbFAISS.similarity_search_by_vector(embedding_vector)

In [55]:
docs_and_scores[0]

Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../data/state_of_the_union.txt'})

Retrievers

(We touched briefly on retrievers from the above FAISS example, but the [A Complete LangChain Guide](https://nanonets.com/blog/langchain/) tutorial covers them in more detail, which we will now resume ...)


Retrievers in LangChain are interfaces that return documents in response to an unstructured query. They are more general than vector stores, focusing on retrieval rather than storage. Although vector stores can be used as a retriever's backbone, there are other types of retrievers as well.

To set up a Chroma retriever, you first install it using pip install chromadb. Then, you load, split, embed, and retrieve documents using a series of Python commands. Here's a code example for setting up a Chroma retriever:

In [56]:
# from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

full_text = open("../data/state_of_the_union.txt", "r").read()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_text(full_text)

And once again, we will not be using OpenAIEmbeddings but HuggingFaceEmbeddings

In [57]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')

In [58]:
db = Chroma.from_texts(texts, embeddings)

InvalidDimensionException: Embedding dimension 768 does not match collection dimensionality 384

What!? We did something like this above, right? Read the state_of_the_union.txt file, generate the embeddings, then load them to Chroma ... and it worked! 

So why is this now failing?? ...

Here once again is the full working example from [Chroma](https://python.langchain.com/docs/integrations/vectorstores/chroma/)

In [60]:
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

# load the document and split it into chunks
loader = TextLoader("../data/state_of_the_union.txt")
documents = loader.load()

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
db = Chroma.from_documents(docs, embedding_function)

# query it
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)

# print results
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


Is it because we used a different embedding model? ...

In [61]:
# embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

Yup! That was the problem! ... 'all-MiniLM-L6-v2' works but 'all-mpnet-base-v2' DOES NOT! ... How the hell am I supposed to know this up front??

In [63]:
dbChroma = Chroma.from_texts(texts, embeddings)

In [64]:
retrieverChroma = dbChroma.as_retriever()

In [65]:
retrieved_docs = retrieverChroma.invoke("What did the president say about Ketanji Brown Jackson?")
print(retrieved_docs[0].page_content)

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 

And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. 

We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  

We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.


The MultiQueryRetriever automates prompt tuning by generating multiple queries for a user input query and combines the results. Here's an example of its simple usage:

In [67]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

question = "What are the approaches to Task Decomposition?"
# llm = ChatOpenAI(temperature=0)
# "TheBloke/Nous-Hermes-2-SOLAR-10.7B-GGUF/nous-hermes-2-solar-10.7b.Q8_0.gguf"
llm = ChatOpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio", temperature=0)

In [68]:
# retriever_from_llm = MultiQueryRetriever.from_llm(
#     retriever=db.as_retriever(), llm=llm
# )
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=retrieverChroma, llm=llm
)

In [69]:
unique_docs = retriever_from_llm.get_relevant_documents(query=question)
print("Number of unique documents:", len(unique_docs))

Number of unique documents: 4


Contextual Compression in LangChain compresses retrieved documents using the context of the query, ensuring only relevant information is returned. This involves content reduction and filtering out less relevant documents. The following code example shows how to use Contextual Compression Retriever:

In [70]:
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# llm = OpenAI(temperature=0)
llm = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio", temperature=0)

  warn_deprecated(


In [71]:
compressor = LLMChainExtractor.from_llm(llm)

In [72]:
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retrieverChroma)

In [73]:
compressed_docs = compression_retriever.get_relevant_documents("What did the president say about Ketanji Jackson Brown")
print(compressed_docs[0].page_content)



One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


The EnsembleRetriever combines different retrieval algorithms to achieve better performance. An example of combining BM25 and FAISS Retrievers is shown in the following code:

In [74]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS

[BM25](https://python.langchain.com/docs/integrations/retrievers/bm25/)

First time running the next cell generates the error ...

* ImportError: Could not import rank_bm25, please install with `pip install rank_bm25`.

In [76]:
# pip install rank-bm25

In [78]:
# original code does not even include any reference to this mystery item known as 'doc_list'
# bm25_retriever = BM25Retriever.from_texts(doc_list).set_k(2)

# example from the BM25 docs ...
bm25_retriever = BM25Retriever.from_texts(["foo", "bar", "world", "hello", "foo bar"])

In [89]:
# But for our purposes, we will replace 'doc_list' with 'texts'
# faiss_vectorstore = FAISS.from_texts(doc_list, OpenAIEmbeddings())

# bm25_retriever = BM25Retriever.from_texts(texts).set_k(2) ... this errors => AttributeError: 'BM25Retriever' object has no attribute 'set_k'
bm25_retriever = BM25Retriever.from_texts(texts, k=2)
faiss_vectorstore = FAISS.from_texts(texts, embedding_function)

In [90]:
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})

In [91]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)

In [92]:
docs = ensemble_retriever.get_relevant_documents("apples")
print(docs[0].page_content)

Because I see the future that is within our grasp. 

Because I know there is simply nothing beyond our capacity. 

We are the only nation on Earth that has always turned every crisis we have faced into an opportunity. 

The only nation that can be defined by a single word: possibilities. 

So on this night, in our 245th year as a nation, I have come to report on the State of the Union. 

And my report is this: the State of the Union is strong—because you, the American people, are strong. 

We are stronger today than we were a year ago. 

And we will be stronger a year from now than we are today. 

Now is our moment to meet and overcome the challenges of our time. 

And we will, as one people. 

One America. 

The United States of America. 

May God bless you all. May God protect our troops.


MultiVector Retriever in LangChain allows querying documents with multiple vectors per document, which is useful for capturing different semantic aspects within a document. Methods for creating multiple vectors include splitting into smaller chunks, summarizing, or generating hypothetical questions. For splitting documents into smaller chunks, the following Python code can be used:

In [140]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import TextLoader
import uuid

In [141]:
loaders = [TextLoader("../data/state_of_the_union.txt"), TextLoader("../data/05-04-cma-generative-ai-review.txt")]
docs = [doc for loader in loaders for doc in loader.load()]
len(docs)

2

In [142]:
docs[0]

Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citizen

In [143]:
len(docs[0].page_content)

38540

In [144]:
len(docs[1].page_content)

7493

In [145]:

text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)
len(docs)

5

In [146]:
len(docs[0].page_content)

9947

In [147]:
# vectorstore = Chroma(collection_name="full_documents", 
#                      embedding_function=OpenAIEmbeddings())

vectorstore = Chroma(collection_name="full_documents", 
                     embedding_function=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2'))


In [148]:
store = InMemoryStore()
id_key = "doc_id"
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

In [149]:
# this generates a different guid every time you run it ... 
uuid.uuid4()

UUID('63605f4e-3c75-4d2e-9ecc-6d0ab27d5ff3')

In [150]:
doc_ids = [str(uuid.uuid4()) for _ in docs]
doc_ids

['2f220098-a905-4206-88f9-250713294d1e',
 '4a39a9d7-5db3-43bc-bc45-9131f08c5eff',
 '03286b6d-03b7-443d-8e3f-8eaf00aa7cc0',
 '343a2ff0-b58c-49c7-bf7c-60a848899960',
 '7e8280e8-cc1e-4f33-9948-bbccda8b3852']

In [151]:
len(doc_ids)

5

In [152]:
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [153]:
sub_docs = [sub_doc for doc in docs for sub_doc in child_text_splitter.split_documents([doc])]

In [154]:
len(sub_docs)

217

In [155]:
sub_docs[0]

Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', metadata={'source': '../data/state_of_the_union.txt'})

In [158]:
print(len(sub_docs[0].page_content))

337


In [156]:
sub_docs[8]

Document(page_content='Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. \n\nPlease rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people. \n\nThroughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos.   \n\nThey keep moving.', metadata={'source': '../data/state_of_the_union.txt'})

In [160]:
# I added this line here to fix an issue with the notebook.
doc_ids = [str(uuid.uuid4()) for _ in sub_docs]
# doc_ids

In [162]:
for sub_doc in sub_docs:
    sub_doc.metadata[id_key] = doc_ids[sub_docs.index(sub_doc)]

In [163]:
sub_docs[0]

Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', metadata={'source': '../data/state_of_the_union.txt', 'doc_id': '134fee76-aeb5-4240-b1c3-a5426c310f60'})

In [164]:
retriever.vectorstore.add_documents(sub_docs)

['2a85a400-f509-4b97-803e-e64d5932edd1',
 'd7ec8845-7323-44bd-9488-eb6d4fd4539d',
 '7b5dd0ba-7352-4c68-9e6f-31ee615d2f4d',
 'ffe7d6b9-fa30-4d29-a304-0513be6ace76',
 '32a98953-76b1-4537-80c8-15830afc1463',
 'f100d80b-cfc6-47db-9646-70a30366fe0e',
 'f8a99c49-9dce-47bb-8dfb-300b9207a18e',
 '8fee575f-4006-4983-808f-69f4e5882626',
 'df054105-0678-469f-87fc-55617f3dbf7b',
 'abd7f855-16be-4a11-83bf-b9834c77b85c',
 'beaa83b4-06b8-4b47-acd3-99f810fb5460',
 '6a384110-624a-4315-b014-0d460dc424ba',
 '5f715461-bd1a-4fca-af2b-5e0fc87f251b',
 '14b3f4c8-1b7c-4df1-b0be-e45fc70c8cb0',
 '09aa9852-d159-4cb0-abf1-5b6bd216097d',
 '0e6b8d0f-2daa-425f-b9cb-301ee69821dc',
 'f4d38015-27f6-494d-809a-375c50ac5eec',
 'd05110a4-10db-451f-bb73-af265125f73b',
 '933aec8d-fb63-46bd-a748-bd2143191a51',
 'afb9aae3-341f-442d-a05b-e3815567e1c5',
 'c0cecc3c-a2a6-4cfa-b14c-5b5bdc4f8460',
 'b49fc7a5-26f4-4f33-ab67-b4d2ac7aa6fa',
 'cb574abb-11b0-446c-8a23-94564e074938',
 '362ba3d8-509e-49a8-9432-515e6395c2ab',
 '1767efe1-f257-

In [165]:
retriever.docstore.mset(list(zip(doc_ids, docs)))

Generating summaries for better retrieval due to more focused content representation is another method. Here's an example of generating summaries:

In [190]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.document import Document

In [191]:
# chain = (lambda x: x.page_content) | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}") | ChatOpenAI(max_retries=0) | StrOutputParser()


chatOpenAI = ChatOpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio", temperature=0, max_retries=0)
chain = (lambda x: x.page_content) | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}") |  chatOpenAI | StrOutputParser()
summaries = chain.batch(docs, {"max_concurrency": 5})

In [192]:
summary_docs = [Document(page_content=s, metadata={id_key: doc_ids[i]}) for i, s in enumerate(summaries)]
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

Generating hypothetical questions relevant to each document using LLM is another approach. This can be done with the following code:

In [193]:
functions = [{"name": "hypothetical_questions", "parameters": {"questions": {"type": "array", "items": {"type": "string"}}}}]
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [194]:
# chain = (lambda x: x.page_content) | ChatPromptTemplate.from_template("Generate 3 hypothetical questions:\n\n{doc}") | ChatOpenAI(max_retries=0).bind(functions=functions, function_call={"name": "hypothetical_questions"}) | JsonKeyOutputFunctionsParser(key_name="questions")

chatOpenAI = ChatOpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio", temperature=0, max_retries=0).bind(functions=functions, function_call={"name": "hypothetical_questions"})
chain = (lambda x: x.page_content) | ChatPromptTemplate.from_template("Generate 3 hypothetical questions:\n\n{doc}") | chatOpenAI | JsonKeyOutputFunctionsParser(key_name="questions")

In [195]:
hypothetical_questions = chain.batch(docs, {"max_concurrency": 8})

OutputParserException: Could not parse function call: 'function_call'