<a href="https://colab.research.google.com/github/russellemergentai/MistralDocker/blob/main/MultiVectorRetriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install langchain
!pip install langchain-chroma
!pip install langchain-openai
!pip install langchain-text-splitters
!pip install uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import uuid

# --- Colab setup: mount Google Drive, then authenticate with Hugging Face ---
from google.colab import drive
drive.mount('/content/drive')

from huggingface_hub import login
from google.colab import userdata

# Fetch the token stored under the Colab secret name 'HF_TOKEN'.
HF_TOKEN = userdata.get('HF_TOKEN')

# Only attempt the login when a non-empty token was returned;
# otherwise report that no login took place.
if not HF_TOKEN:
    print("Hugging Face login failed")
else:
    login(HF_TOKEN)
    print("logged in to Hugging Face")



Mounted at /content/drive
logged in to Hugging Face


In [12]:
!pip install langchain
!pip install -U langchain-community
!pip install -U langchain-huggingface

from langchain.embeddings import HuggingFaceEmbeddings
from pathlib import Path
from langchain.document_loaders import TextLoader

model_path = "intfloat/e5-large-unsupervised"

embeddings = HuggingFaceEmbeddings(
    model_name=model_path,
    model_kwargs={'device': 'cuda'},
    encode_kwargs={'normalize_embeddings': False}
)

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=500) #A
child_splitter = RecursiveCharacterTextSplitter(chunk_size=250) #B

child_chunks_collection = Chroma( #C
    collection_name="uk_child_chunks",
    embedding_function=embeddings,
)

child_chunks_collection.reset_collection() #D

doc_byte_store = InMemoryByteStore() #E

# Metadata key under which each child chunk records the id of its parent chunk.
doc_key = "doc_id"

multi_vector_retriever = MultiVectorRetriever( #F
    vectorstore=child_chunks_collection,
    byte_store=doc_byte_store,
    # Pass the key explicitly so the retriever looks up parents under the same
    # metadata key the ingestion code writes, rather than relying on the
    # library default happening to match `doc_key`.
    id_key=doc_key,
)

#A Splitter to generate parent coarse chunks from the loaded documents
#B Splitter to generate child granular chunks from parent coarse chunks
#C Vector store collection to host child granular chunks
#D Make sure the collection is empty
#E Document store to host parent coarse chunks
#F Retriever to link parent coarse chunks to child granular chunks

all_documents = []

directory_path="/content/drive/MyDrive/Target"

for file_path in Path(directory_path).rglob('*'): #A
    if file_path.is_file():
        loader = TextLoader(str(file_path), encoding='UTF-8') #B
        documents = loader.load() #C
        all_documents.extend(documents)

coarse_chunks = parent_splitter.split_documents(all_documents) #D

# One random UUID per parent chunk; position i in this list is the id of coarse_chunks[i].
coarse_chunks_ids = [str(uuid.uuid4()) for _ in coarse_chunks]
all_granular_chunks = []

for i, coarse_chunk in enumerate(coarse_chunks): #E
    coarse_chunk_id = coarse_chunks_ids[i]
    granular_chunks = child_splitter.split_documents([coarse_chunk]) #F

    for granular_chunk in granular_chunks:
        granular_chunk.metadata[doc_key] = coarse_chunk_id #G

    # Collect the children once per parent, AFTER tagging them. Doing this
    # inside the loop above would append the whole list once per child and
    # store every child chunk multiple times in the vector store.
    all_granular_chunks.extend(granular_chunks)

multi_vector_retriever.vectorstore.add_documents(all_granular_chunks) #H
multi_vector_retriever.docstore.mset(list(zip(coarse_chunks_ids, coarse_chunks))) #I

#A Walk every path under the target directory, recursively
#B Loader for one text file
#C Documents loaded from that file
#D Split the loaded documents into parent coarse chunks
#E Iterate over the parent coarse chunks
#F Create child granular chunks from each parent coarse chunk
#G Link each child granular chunk to its parent coarse chunk
#H Ingest the child granular chunks into the vector store
#I Ingest the parent coarse chunks into the document store

# Run a sample question through the multi-vector retriever and show what it returns.
retrieved_docs = multi_vector_retriever.invoke(
    "who is summit lead engineer"
)

print(retrieved_docs)

[Document(metadata={'source': '/content/drive/MyDrive/Target/summit.txt'}, page_content='[Summit application]\nVenkata Pothuganti os the SUMMIT Dev LEAD engineer who has 11 staff reporting to him, some of whom are Sameer Mhatre, Debby Butcher and Susan Phillips.\n\nPuru Tella (F) - dev. Onshore virtusa.\n\nAndy Bonnett - PO\n\n**Summit datacentre Hosting*** Rockingham hosts Summit DR, Production and UAT are hosted in Peterborough P1/P2/P3 datacentres. Andy Murray has been working on this, needs to go to P4.'), Document(metadata={'source': '/content/drive/MyDrive/Target/stf.txt'}, page_content='[Finastra vendor]\nFinastra is the vendor for summit - Emmanual Kouri is the Summit account manager\n\n\n[Apex SF application]\n**Apex snow identifier** is AL09252 CatA.\n\n**Apex engineering dev team lead** is ed mandell.\n\n**Apex staff** are: Ganesh - virtusa india (experienced);Milind - permanent london; Shengli - apex contractor london; Laurent - apex contractor london (most experienced pers