In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import faiss
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain

In [2]:
# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key needed by the embedding model below — confirm).
load_dotenv()

True

In [3]:
# Loader for the sample resume PDF; the path is relative to the notebook's working directory.
loader = PyPDFLoader("./data/resume-sample-engineering.pdf")

In [4]:
# Read the PDF into a list of documents and tag each one with a "county"
# metadata entry so that later retrieval can filter on that key.
pages = loader.load()
for page in pages:
    page.metadata["county"] = "San Francisco"

In [5]:
# Display the first loaded page as a sanity check on the extracted text.
pages[0]

Document(page_content='SALLIE N. GINEER  \nsengineer@buffalo.edu, 716 -123-4567 , linkedin.com/in/salliengineer  \n \nEDUCATION   \nBachelor of Science, Mechanical Engineering and Aerospace Engineering  (Double Major), May 20 XX \nUniversity at Buffalo, T he State University of New York  \n• Honors: Dean’s List , GPA: 3.2/4.0  \n \nENGINEERING WORK EXPERIENCE  \nEngineering Intern, FS -Elliott, Export, PA , May 20 XX - August 20 XX  \nProject: Motor Vibration Isolation System  \n• Identif ied critical design requirements and range of  solutions for a group project  utilizing  QFD tools .  \n• Developed Tuned Vibration Neutralizer effective for motors of various sizes and operating frequencies.  \n• Optimize d design parameters  using MatL ab, Simulink and Excel to  run simulations.  \n• Built a scaled prototype with a team of 3 and reduced motor vibrations by more than 99%.  \n• Managed group meetings as team leader and monitored quality of work delivered by each member.  \n \nENGINEER

In [6]:
# Collect the loaded pages into a fresh list; this is the corpus that gets indexed below.
documents = list(pages)

In [7]:
# Display the collected documents.
# NOTE(review): this prints the repr of every page; consider showing a slice instead.
documents

[Document(page_content='SALLIE N. GINEER  \nsengineer@buffalo.edu, 716 -123-4567 , linkedin.com/in/salliengineer  \n \nEDUCATION   \nBachelor of Science, Mechanical Engineering and Aerospace Engineering  (Double Major), May 20 XX \nUniversity at Buffalo, T he State University of New York  \n• Honors: Dean’s List , GPA: 3.2/4.0  \n \nENGINEERING WORK EXPERIENCE  \nEngineering Intern, FS -Elliott, Export, PA , May 20 XX - August 20 XX  \nProject: Motor Vibration Isolation System  \n• Identif ied critical design requirements and range of  solutions for a group project  utilizing  QFD tools .  \n• Developed Tuned Vibration Neutralizer effective for motors of various sizes and operating frequencies.  \n• Optimize d design parameters  using MatL ab, Simulink and Excel to  run simulations.  \n• Built a scaled prototype with a team of 3 and reduced motor vibrations by more than 99%.  \n• Managed group meetings as team leader and monitored quality of work delivered by each member.  \n \nENGINEE

# Parent Document Retriever without a metadata filter

In [11]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma

In [12]:
# Two splitters with different chunk sizes, both handed to the parent-document
# retrievers below: `child_splitter` (chunk_size=500) and `parent_splitter`
# (chunk_size=1500), each with an overlap of 100 between consecutive chunks.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

In [13]:
# Embedding model shared by both vector stores in this notebook.
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
# Chroma collection "docs" backing the unfiltered retriever.
# No persist_directory is passed here.
vectorstore_wo_filter = Chroma(
    collection_name="docs", embedding_function=embedding
)

In [22]:
from langchain.storage._lc_store import create_kv_docstore
from langchain.storage import LocalFileStore

# File-backed key-value store under ./store_location_wo_filter, wrapped by
# create_kv_docstore so it can be used as the retriever's docstore.
fs_wo_filter = LocalFileStore("./store_location_wo_filter")
store_wo_filter = create_kv_docstore(fs_wo_filter)
# Baseline retriever without any metadata filtering.
retriever_wo_filter = ParentDocumentRetriever(
    vectorstore=vectorstore_wo_filter,
    docstore=store_wo_filter,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

In [23]:
# Index the loaded pages through the unfiltered retriever.
# NOTE(review): with ids=None, re-running this cell presumably indexes the same
# content again under new ids — confirm, and guard if duplicates matter.
retriever_wo_filter.add_documents(documents, ids=None)

In [24]:
# NOTE(review): `vectorstore_wo_filter` was created without a persist_directory;
# confirm that persist() actually writes anything (or succeeds) in that configuration.
vectorstore_wo_filter.persist()

In [25]:
# Query the vector store directly (bypassing the retriever) and ask for the top 3 hits.
vectorstore_wo_filter.similarity_search(query="where did sallie interned?", k=3)

[Document(page_content='SALLIE N. GINEER  \nsengineer@buffalo.edu, 716 -123-4567 , linkedin.com/in/salliengineer  \n \nEDUCATION   \nBachelor of Science, Mechanical Engineering and Aerospace Engineering  (Double Major), May 20 XX \nUniversity at Buffalo, T he State University of New York  \n• Honors: Dean’s List , GPA: 3.2/4.0  \n \nENGINEERING WORK EXPERIENCE  \nEngineering Intern, FS -Elliott, Export, PA , May 20 XX - August 20 XX  \nProject: Motor Vibration Isolation System', metadata={'county': 'San Francisco', 'doc_id': 'f98e7b62-592b-4157-9dff-4f84df38fb86', 'page': 0, 'source': './data/resume-sample-engineering.pdf'}),
 Document(page_content='SALLIE N. GINEER  \nsengineer@buffalo.edu, 716 -123-4567 , linkedin.com/in/salliengineer  \n \nEDUCATION   \nBachelor of Science, Mechanical Engineering and Aerospace Engineering  (Double Major), May 20 XX \nUniversity at Buffalo, T he State University of New York  \n• Honors: Dean’s List , GPA: 3.2/4.0  \n \nENGINEERING WORK EXPERIENCE  \n

In [26]:
# Same question, but through the retriever instead of the raw vector store.
# NOTE(review): verify that `k=1` is honoured when passed here; the retriever may
# need the limit configured on construction (e.g. via search_kwargs) instead — confirm.
retriever_wo_filter.get_relevant_documents("where did sallie interned?", k=1)

[Document(page_content='SALLIE N. GINEER  \nsengineer@buffalo.edu, 716 -123-4567 , linkedin.com/in/salliengineer  \n \nEDUCATION   \nBachelor of Science, Mechanical Engineering and Aerospace Engineering  (Double Major), May 20 XX \nUniversity at Buffalo, T he State University of New York  \n• Honors: Dean’s List , GPA: 3.2/4.0  \n \nENGINEERING WORK EXPERIENCE  \nEngineering Intern, FS -Elliott, Export, PA , May 20 XX - August 20 XX  \nProject: Motor Vibration Isolation System  \n• Identif ied critical design requirements and range of  solutions for a group project  utilizing  QFD tools .  \n• Developed Tuned Vibration Neutralizer effective for motors of various sizes and operating frequencies.  \n• Optimize d design parameters  using MatL ab, Simulink and Excel to  run simulations.  \n• Built a scaled prototype with a team of 3 and reduced motor vibrations by more than 99%.  \n• Managed group meetings as team leader and monitored quality of work delivered by each member.  \n \nENGINEE

# Parent Document Retriever with a metadata filter

In [27]:
from typing import List, Dict, Optional
from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
from langchain.schema import Document
from pydantic import Field

def join_recursive_documents(documents: List[Document], start_index: int = 0) -> List[Document]:
    """
    Merge consecutive chunks that share the same ``start_index`` metadata value.

    A run of adjacent documents whose ``metadata["start_index"]`` equals that of
    the first document in the run is collapsed into one document whose
    ``page_content`` is the concatenation of the run, in input order. The merged
    document keeps the metadata of the first document in the run.

    Documents that carry no ``start_index`` metadata are never merged; they are
    passed through unchanged (previously this raised ``KeyError``).

    The input documents are not modified: when a merge is needed, the first
    document of the run is shallow-copied and the copy is extended.

    Args:
        documents: Chunks to merge, in the order they should be joined.
        start_index: Unused; kept so existing callers keep working.

    Returns:
        A new list with one document per run. Unmerged documents are the very
        same objects that were passed in.
    """
    import copy

    joined_docs = []
    run_key = None          # start_index of the run currently at the tail of joined_docs
    tail_is_copy = False    # True once the tail has been replaced by a private copy

    for doc in documents:
        key = doc.metadata.get("start_index")
        if joined_docs and key is not None and key == run_key:
            # Copy lazily, only when we are about to append, so the caller's
            # document is never mutated.
            if not tail_is_copy:
                joined_docs[-1] = copy.copy(joined_docs[-1])
                tail_is_copy = True
            joined_docs[-1].page_content += doc.page_content
        else:
            joined_docs.append(doc)
            run_key = key
            tail_is_copy = False

    return joined_docs

class FilteredParentDocumentRetriever(ParentDocumentRetriever):
    """ParentDocumentRetriever that filters child chunks on metadata before
    resolving them to their parent documents.

    The vector store is searched first; the hits are then filtered in Python
    against ``filter``. Because filtering happens after the search, a query can
    return fewer parents than the search returned chunks (possibly none).
    """

    # Mapping of metadata key -> required value. A list/tuple/set/frozenset value
    # means "any of these"; any other value must match exactly. None or an empty
    # dict disables filtering.
    filter: Optional[Dict] = Field(default=None, description="Filter to apply to the search results.")

    class Config:
        arbitrary_types_allowed = True

    def __init__(self, vectorstore, docstore, child_splitter, parent_splitter, filter: Optional[Dict] = None, **kwargs):
        """Build the retriever.

        Args:
            vectorstore: Store holding the embedded child chunks.
            docstore: Key-value store holding the parent documents.
            child_splitter: Splitter producing the chunks that are embedded.
            parent_splitter: Splitter producing the parent documents.
            filter: Optional metadata filter (see the ``filter`` field).
            **kwargs: Forwarded to ``ParentDocumentRetriever``.
        """
        super().__init__(vectorstore=vectorstore, docstore=docstore, child_splitter=child_splitter, parent_splitter=parent_splitter, **kwargs)
        self.filter = filter

    def _matches_filter(self, metadata: Dict) -> bool:
        """Return True when ``metadata`` satisfies every entry of ``self.filter``."""
        for key, expected in self.filter.items():
            actual = metadata.get(key)
            if isinstance(expected, (list, tuple, set, frozenset)):
                if actual not in expected:
                    return False
            # Scalars are compared with ==. The previous `actual in expected`
            # did a substring test for string values ("San" matched
            # "San Francisco") and raised TypeError when the key was missing.
            elif actual != expected:
                return False
        return True

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the parent documents of the chunks that match ``query`` and
        pass the metadata filter.

        Args:
            query: Free-text search query.

        Returns:
            Parent documents in the order their first matching chunk was
            ranked, each parent at most once. Chunks without a parent id, and
            ids no longer present in the docstore, are skipped.
        """
        docs_and_scores = self.vectorstore.similarity_search_with_score(query)
        child_docs = [doc for doc, _ in docs_and_scores]

        if self.filter:
            child_docs = [doc for doc in child_docs if self._matches_filter(doc.metadata)]

        # Several chunks can point at the same parent; keep the first occurrence
        # so the ranking order is preserved.
        parent_ids = []
        for doc in child_docs:
            parent_id = doc.metadata.get(self.id_key)
            if parent_id is not None and parent_id not in parent_ids:
                parent_ids.append(parent_id)

        if not parent_ids:
            return []

        # The docstore is a key-value store exposing mget(); it is not
        # subscriptable, so `self.docstore[key]` cannot be used here.
        parents = self.docstore.mget(parent_ids)
        return [parent for parent in parents if parent is not None]

In [29]:
# Separate Chroma collection ("docs1") for the filtered retriever; it reuses the
# `embedding` model defined earlier in the notebook.
vectorstore_filter = Chroma(
    collection_name="docs1", embedding_function=embedding
)

In [30]:
# NOTE(review): this points at the same "./store_location_wo_filter" directory
# used by the unfiltered retriever — confirm whether sharing it is intended or
# a copy-paste leftover.
fs_filter = LocalFileStore("./store_location_wo_filter")
store_filter = create_kv_docstore(fs_filter)
# NOTE(review): `retriever_filter` is a plain ParentDocumentRetriever and is not
# used by any cell visible here; no documents are added through it.
retriever_filter = ParentDocumentRetriever(
    vectorstore=vectorstore_filter,
    docstore=store_filter,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

In [31]:
from langchain.vectorstores import Chroma

# Metadata filter matching the "county" tag attached to every loaded page.
# NOTE(review): the name `filter` shadows the Python builtin for the rest of the notebook.
filter={'county': 'San Francisco'}
# Filtered retriever over the "docs1" collection and the file-backed docstore.
retriever = FilteredParentDocumentRetriever(vectorstore_filter, store_filter, child_splitter, parent_splitter, filter=filter)

In [32]:
# NOTE(review): the recorded output is an empty list. No cell visible here adds
# documents to `vectorstore_filter` (neither `retriever` nor `retriever_filter`
# calls add_documents), so there is nothing to match — index `documents` through
# `retriever` before querying.
retriever.get_relevant_documents("where did sallie interned?")

[]