In [1]:
!pip install -q langchain langchain-community langchain-chroma langchain-google-genai langchain-huggingface

# Retrieval

In [2]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader('state_of_the_union.txt')

data = loader.load()



In [3]:
data[0].page_content[:100]

'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and th'

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [5]:
from langchain_classic.storage import InMemoryStore
from langchain_chroma import Chroma

In [6]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

embedd_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [7]:
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=embedd_model)

In [8]:
store = InMemoryStore()

In [9]:
from langchain_classic.retrievers import ParentDocumentRetriever
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

In [10]:
retriever.add_documents(data, ids=None)

In [11]:
store.yield_keys()

<generator object InMemoryBaseStore.yield_keys at 0x7f58aa09b5a0>

In [12]:
retrieved_docs= retriever.invoke("What did the president say about Ketanji Brown Jackson")


In [13]:
print(len(retrieved_docs[0].page_content))

38539


# Child -> Parent retrieval

In [14]:
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

In [15]:
store2 = InMemoryStore()

In [16]:
vectorstore2 = Chroma(
    collection_name="full_documents_tree", embedding_function=embedd_model)

In [17]:
tree_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore2,
    docstore=store2,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [18]:
tree_retriever.add_documents(data, ids=None)

In [19]:
retrieved_docs2= tree_retriever.invoke("What did the president say about Ketanji Brown Jackson")

In [20]:
len(retrieved_docs2[0].page_content)

1849