In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import ChatOllama
from langchain_core.documents import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
import uuid
import os

In [None]:
# Settings shared by the cells below: the Postgres connection string and
# collection name for the vector store, and the embedding model.
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "summaries"
embeddings_model = OllamaEmbeddings(model="mxbai-embed-large")

# Load the document from the root path (one level above the kernel's starting
# working directory).
# A bare os.chdir("..") climbs one level further on every execution, so the
# starting directory is remembered the first time this cell runs and the
# target is always derived from it. Re-running the cell is then harmless.
_start_dir = globals().get("_start_dir") or os.getcwd()
os.chdir(os.path.join(_start_dir, os.pardir))

loader = TextLoader("test.txt", encoding="utf-8")
docs = loader.load()

print("length of loaded docs: ", len(docs[0].page_content))


length of loaded docs:  2765


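> **Warning**: Executing `os.chdir("..")` more than once in the same session causes an error, because the call is not idempotent: each execution moves the working directory one level further up, after which `test.txt` can no longer be found.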

In [6]:
# Break the loaded document into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

# Chat model (temperature 0) and the single-slot summary prompt.
llm = ChatOllama(model="llama3.1", temperature=0)
prompt_text = "Summarize the following document: \n\n{doc}"
prompt = ChatPromptTemplate.from_template(prompt_text)


def _page_content(document):
    """Return the text of a chunk so it can fill the prompt's {doc} slot."""
    return document.page_content


# Chain reference: https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/multi_vector/
summarize_chain = {"doc": _page_content} | prompt | llm | StrOutputParser()

# Summarize every chunk, with at most five requests in flight at once.
summaries = summarize_chain.batch(chunks, config={"max_concurrency": 5})

Now, let's define the vector store, which holds the summaries and their embeddings, and the document store, which holds the original chunks:

In [9]:
# Vector store that indexes the summaries (the "child" documents).
# The parent store below lives in memory and the doc_ids are regenerated on
# every run, so summaries persisted in Postgres by an earlier run can never be
# mapped back to a parent document. pre_delete_collection=True drops that
# stale collection first, so similarity search only returns summaries whose
# parents actually exist in the current docstore.
vectorstore = PGVector(
    embeddings=embeddings_model,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
    pre_delete_collection=True,
)
# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# Retriever that searches the summaries in the vector store and returns the
# original documents held in the document store.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Summaries and chunks are paired positionally below, so they must line up.
assert len(summaries) == len(chunks), "expected exactly one summary per chunk"

# One id per chunk (not per summary): the ids key the parent documents.
doc_ids = [str(uuid.uuid4()) for _ in chunks]

# Each summary is linked to its original chunk through the shared doc_id.
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

# Add the document summaries to the vector store for similarity search
retriever.vectorstore.add_documents(summary_docs)

# Store the original chunks in the document store, keyed by the same doc_ids.
# This allows us to first search the summaries efficiently, then fetch the
# full documents when needed.
retriever.docstore.mset(list(zip(doc_ids, chunks)))

# Querying the vector store directly retrieves the summaries
sub_docs = retriever.vectorstore.similarity_search(
    "the everlasting wake", k=2
)

Let's retrieve the relevant full-context documents based on a query:

In [11]:
# Going through the retriever (rather than the vector store) yields the
# larger source chunks instead of their summaries.
retrieved_docs = retriever.invoke(input="the everlasting wake")
retrieved_docs

[Document(metadata={'source': 'test.txt'}, page_content='No fear remains, no chains restrain,\nFor every loss, a gift remains,\nA journey traced in endless spheres,\nAcross the sea of vanished years.\n\nEpilogue: The Everlasting Wake\n\nThe ship is gone, yet still it sails,\nBeyond the stars, beyond the veils,\nA spectral dream in cosmic flight,\nA tale retold in endless night.\n\nFor every heart that dares to roam,\nBeyond the shores that feel like home,\nA voyage waits, a path unknown,\nWhere every soul may find its own.'),
 Document(metadata={'source': 'test.txt'}, page_content='The Forest of Forgotten Names\n\nBeyond the city’s dreaming gate,\nA forest waits in patient state,\nWhere trees bear names upon their bark,\nAnd memories glow in embered dark.\n\nHere whispers drift like autumn leaves,\nOf those the world no longer grieves,\nYet still they linger, still they call,\nIn hopes that someone hears at all.\n\nEach name a tale, each branch a past,\nA legacy too vast to last,\nYet 

Let's go back to the [README file](../README.md#raptor-recursive-abstractive-processing-for-tree-oriented-retrieval).