In [27]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
import re
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor,LLMChainFilter, DocumentCompressorPipeline, EmbeddingsFilter
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.chains import RetrievalQA
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [5]:
# Load variables from a local .env file into the process environment
# (presumably the OpenAI credentials used by the cells below; confirm).
load_dotenv()

True

In [6]:
# Relative path to the PDF that is loaded and indexed further down.
doc_path = '../data/pdf.pdf'

In [7]:
# Prompt for query expansion: asks the model for three alternative search
# queries for the user's question.
# The templates are built with `from_template` / `from_messages` so the input
# variable list is inferred from the `{question}` placeholder; previously the
# declared variable ('original_query') did not match the placeholder.
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        'You are a helpful assistant that generates multiple search queries based on a single input query.'
    ),
    HumanMessagePromptTemplate.from_template(
        'Generate multiple search queries related to: {question} \n OUTPUT (3 queries):'
    ),
])

In [8]:
def preprocess_text(text: str) -> str:
    """Normalise raw text for indexing.

    Lowercases the input, discards every non-ASCII character, collapses each
    run of whitespace (newlines and tabs included) into a single space and
    trims leading/trailing spaces.

    Args:
        text: Raw text to normalise.

    Returns:
        The lowercased, ASCII-only, single-spaced text.
    """
    ascii_only = text.lower().encode("ascii", errors="ignore").decode()
    # `\s+` already covers newlines, so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', ascii_only).strip()

In [9]:
# Read the PDF at `doc_path`; `docs` is the list of documents returned by the loader.
loader=PyPDFLoader(doc_path)
docs=loader.load()

In [10]:
# Overwrite each document's text in place with its normalised form
# (lowercase, ASCII-only, single-spaced). The raw text is not kept.
for doc in docs:
    doc.page_content = preprocess_text(doc.page_content)

In [29]:
# Two granularities for the parent/child retriever: large parent chunks and
# small child chunks.
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
)
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30,
)

In [12]:
# Embedding model with library defaults; shared by the vector store and the
# embedding-based filters below.
embeddings = OpenAIEmbeddings()

In [28]:
# In-memory document store passed to the parent-document retriever as its docstore.
store = InMemoryStore()

In [32]:
vectorstore=Chroma.from_documents(docs, embeddings)

In [33]:
parent_document_retriever = ParentDocumentRetriever(
            vectorstore=vectorstore,
            docstore=store,
            child_splitter=child_splitter,
            parent_splitter=parent_splitter,
        )

In [35]:
# BM25 matches tokens exactly, and `docs` were lowercased/ASCII-normalised by
# `preprocess_text`. The same normalisation is therefore used as the tokeniser
# so that queries (e.g. "How ...") are normalised the same way before matching.
keyword_retriever = BM25Retriever.from_documents(
    docs,
    preprocess_func=lambda text: preprocess_text(text).split(),
)
keyword_retriever.k = 3  # number of documents returned per query
keyword_retriever

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001222E079AF0>, k=3)

In [46]:
# Hybrid retrieval: combine the parent-document retriever (weight 0.4) with
# the BM25 keyword retriever (weight 0.6).
ensemble_retriever = EnsembleRetriever(
    retrievers=[parent_document_retriever, keyword_retriever],
    weights=[0.4, 0.6],
)
ensemble_retriever

EnsembleRetriever(retrievers=[ParentDocumentRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001222E079460>, docstore=<langchain_core.stores.InMemoryStore object at 0x0000012229C3F340>, search_kwargs={}, child_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x0000012229C3F130>, parent_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x0000012229C3F370>), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001222E079AF0>, k=3)], weights=[0.4, 0.6])

In [47]:
# Chat model with library defaults (no model name or temperature is set here);
# used for query generation, the LLM-based compressors and the QA chains.
llm = ChatOpenAI()

In [48]:
def _split_queries(raw):
    """Split the model's text output into one query per non-empty line.

    Blank or whitespace-only lines are dropped and surrounding whitespace is
    trimmed, so the result never contains empty queries.
    """
    return [line.strip() for line in raw.split("\n") if line.strip()]


# Query-expansion chain: prompt -> chat model -> plain string -> list of queries.
generate_queries = (
    prompt | llm | StrOutputParser() | _split_queries
)
generate_queries

ChatPromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are a helpful assistant that generates multiple search queries based on a single input query.'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='Generate multiple search queries related to: {question} \n OUTPUT (3 queries):'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000001222E21DD00>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000001222E21D580>, root_client=<openai.OpenAI object at 0x000001222E21D8B0>, root_async_client=<openai.AsyncOpenAI object at 0x000001222E21DDC0>, model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser()
| Runn

In [49]:
# LLM-based compressors built from the shared chat model.
compressor1 = LLMChainExtractor.from_llm(llm)
compressor2 = LLMChainFilter.from_llm(llm)

# Embedding-based filters built from the shared embedding model.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(
    embeddings=embeddings,
    similarity_threshold=0.76,
)

In [50]:
# Variants 1 and 2: wrap the hybrid retriever with each LLM-based compressor.
compression_retriever1 = ContextualCompressionRetriever(base_compressor=compressor1, base_retriever=ensemble_retriever)
compression_retriever2 = ContextualCompressionRetriever(base_compressor=compressor2, base_retriever=ensemble_retriever)

# Variant 3: embedding-only pipeline (split -> drop near-duplicates -> keep relevant chunks).
# `splitter` was not defined anywhere in the notebook, so this cell failed on a
# fresh kernel; it is defined here.
# NOTE(review): chunk_size=300 / chunk_overlap=0 follow LangChain's pipeline
# example and are a guess at the original settings; tune as needed.
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
pipeline_compressor = DocumentCompressorPipeline(transformers=[splitter, redundant_filter, relevant_filter])
compression_retriever3 = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=ensemble_retriever)

In [51]:
# One RetrievalQA chain per compression retriever, all sharing the same chat model.
chain1, chain2, chain3 = (
    RetrievalQA.from_chain_type(llm=llm, retriever=compressed)
    for compressed in (
        compression_retriever1,
        compression_retriever2,
        compression_retriever3,
    )
)

In [52]:
# Ask the same question of all three chains so their answers can be compared.
query = 'How to prevent data contamination?'

response1, response2, response3 = (
    qa_chain.invoke(query) for qa_chain in (chain1, chain2, chain3)
)

In [53]:
# Answer from chain1 (retriever compressed with LLMChainExtractor).
response1['result']

'To prevent data contamination, it is important to establish and maintain data governance practices, implement security measures to protect data integrity, enforce data validation rules, regularly clean and audit data, train staff on data handling best practices, and implement access controls to limit who can modify data. Regularly updating software and systems, creating backup copies of data, and monitoring data changes can also help in preventing data contamination.'

In [54]:
# Answer from chain2 (retriever compressed with LLMChainFilter).
response2['result']

'To prevent data contamination, it is essential to establish and follow best practices in data management. This includes ensuring data integrity, implementing data validation procedures, restricting unauthorized access to data, regularly backing up data, and maintaining a clean data environment. Additionally, conducting regular audits and quality checks can help identify and address any potential causes of data contamination.'

In [55]:
# Answer from chain3 (retriever compressed with the splitter + embedding-filter pipeline).
response3['result']

'To prevent data contamination, it is essential to carefully monitor data collection processes, ensure data integrity and quality controls are in place, and regularly validate and verify the data being used. Employing data anonymization techniques, access controls, and encryption methods can also help prevent unauthorized access and manipulation of data. Regularly updating security measures and training personnel on data handling best practices can further contribute to preventing data contamination.'