In [28]:
import langchain
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
import re
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.load import dumps, loads
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor,LLMChainFilter, DocumentCompressorPipeline, EmbeddingsFilter
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.chains import RetrievalQA

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI clients created later — confirm.
load_dotenv()

True

In [3]:
# Location of the PDF that gets loaded and indexed (relative path).
doc_path = '../data/pdf.pdf'

In [4]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """Merge several ranked document lists into one ranking (Reciprocal Rank Fusion).

    Every appearance of a document adds ``1 / (rank + k)`` to its score, where
    ``rank`` is the document's zero-based position in that list. Documents are
    identified by their ``dumps`` serialisation, so the same document found in
    several lists accumulates a single combined score.

    Args:
        results: Ranked lists of documents, best match first. Each document
            must survive a ``dumps`` / ``loads`` round trip.
        k: Constant added to the rank before taking the reciprocal; larger
            values flatten the difference between high and low ranks.

    Returns:
        A list of ``(document, score)`` tuples ordered by descending score.
        Documents with equal scores keep the order in which they were first
        seen, because the sort is stable.

    Raises:
        ZeroDivisionError: If ``k`` is 0 (the top-ranked document has rank 0).
    """
    fused_scores = {}
    for docs in results:
        for rank, doc in enumerate(docs):
            # Serialise so the document can be used as a dictionary key.
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_key), score) for doc_key, score in ranked]

In [5]:
# Prompt that asks the model to expand one user question into several search queries.
# The single input variable is `question`. It is inferred from the `{question}`
# placeholder instead of being declared by hand, so the declared variables cannot
# disagree with the template text.
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        'You are a helpful assistant that generates multiple search queries based on a single input query.'
    ),
    HumanMessagePromptTemplate.from_template(
        'Generate multiple search queries related to: {question} \n OUTPUT (3 queries):'
    ),
])

In [6]:
def preprocess_text(text: str) -> str:
    """Normalise raw text: lowercase, ASCII-only, single-spaced, trimmed.

    Whitespace is collapsed *before* non-ASCII characters are dropped so that
    Unicode whitespace (for example a non-breaking space) becomes a regular
    space instead of being deleted, which would glue the neighbouring words
    together.

    Args:
        text: The text to clean.

    Returns:
        The lowercased text with non-ASCII characters removed, every run of
        whitespace replaced by one space, and no leading/trailing whitespace.
    """
    text = text.lower()
    # `\s` on a str pattern also matches Unicode whitespace such as U+00A0.
    text = re.sub(r'\s+', ' ', text)
    text = text.encode("ascii", errors="ignore").decode()
    # Dropping a character that sat between two spaces leaves a double space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [7]:
# Read the PDF at `doc_path`; `docs` is whatever the loader returns.
# NOTE(review): presumably one Document per page — confirm against PyPDFLoader.
loader=PyPDFLoader(doc_path)
docs=loader.load()

In [8]:
# Clean every loaded document's text before chunking.
# NOTE: this overwrites `page_content` in place; reload `docs` to get the raw text back.
for doc in docs:
    doc.page_content = preprocess_text(doc.page_content)

In [9]:
# Cut the cleaned documents into small overlapping chunks
# (size 200, overlap 30, measured with the splitter's default length function).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30,
)
chunks = splitter.split_documents(docs)

In [10]:
# Embedding model created with library defaults; shared by the vector store
# and by the embedding-based filters.
embeddings = OpenAIEmbeddings()

In [11]:
# Embed the chunks into a Chroma vector store and expose it as a retriever with k=5.
# NOTE: the variable name is misspelled ("retreiver"); it is kept as-is because
# the ensemble retriever cell refers to it by this name.
vectorstore = Chroma.from_documents(chunks, embeddings)
vectorstore_retreiver = vectorstore.as_retriever(
    search_kwargs={"k": 5},
)
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001E98EA73F40>, search_kwargs={'k': 5})

In [12]:
# Keyword (BM25) retriever built over the same chunks as the vector store; k is set to 3.
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k = 3
keyword_retriever

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001E98C9F2310>, k=3)

In [13]:
# Hybrid retriever combining the vector-store and BM25 retrievers.
# Weights are listed in the same order as `retrievers`: 0.3 vector, 0.7 BM25.
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore_retreiver, keyword_retriever],
    weights=[0.3, 0.7],
)
ensemble_retriever

EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001E98EA73F40>, search_kwargs={'k': 5}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001E98C9F2310>, k=3)], weights=[0.3, 0.7])

In [14]:
# Chat model created with library defaults; reused for query generation,
# LLM-based compression and the QA chains.
llm = ChatOpenAI()

In [16]:
def _split_queries(text: str) -> list[str]:
    """Split the model's newline-separated answer into a list of non-empty queries.

    Blank or whitespace-only lines are dropped and surrounding whitespace is
    stripped, so no empty query is handed to a retriever downstream.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# prompt -> chat model -> plain string -> list of query strings
generate_queries = (
    prompt | llm | StrOutputParser() | _split_queries
)
generate_queries

ChatPromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are a helpful assistant that generates multiple search queries based on a single input query.'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='Generate multiple search queries related to: {question} \n OUTPUT (3 queries):'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000001E98F45F0A0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000001E98C9F2DF0>, root_client=<openai.OpenAI object at 0x000001E98F3A2BB0>, root_async_client=<openai.AsyncOpenAI object at 0x000001E98C9F2370>, model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser()
| Runn

In [25]:
# LLM-based document compressors, both driven by `llm`.
# NOTE(review): going by the class names, the extractor keeps only the relevant
# passages and the filter drops irrelevant documents — confirm against the LangChain docs.
compressor1 = LLMChainExtractor.from_llm(llm)
compressor2 = LLMChainFilter.from_llm(llm)

# Embedding-based filters that reuse the shared `embeddings` model.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
# NOTE(review): 0.76 is a hard-coded threshold; no rationale is visible here.
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)

In [27]:
# Three compression strategies layered over the same hybrid retriever.
# Variants 1 and 2 use the LLM-based compressors.
compression_retriever1 = ContextualCompressionRetriever(
    base_compressor=compressor1,
    base_retriever=ensemble_retriever,
)
compression_retriever2 = ContextualCompressionRetriever(
    base_compressor=compressor2,
    base_retriever=ensemble_retriever,
)

# Variant 3 chains the text splitter with the two embedding filters.
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[splitter, redundant_filter, relevant_filter],
)
compression_retriever3 = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor,
    base_retriever=ensemble_retriever,
)

In [29]:
# One RetrievalQA chain per compression retriever, all sharing the same `llm`,
# so their answers to the same question can be compared.
chain1 = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever1)
chain2 = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever2)
chain3 = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever3)

In [31]:
# The same question is sent to all three chains.
query = 'How to prevent data contamination?'

# Each response is indexed with the 'result' key in the cells that display the answers.
response1 = chain1.invoke(query)
response2 = chain2.invoke(query)
response3 = chain3.invoke(query)

In [38]:
# Answer from chain1 (retriever compressed with LLMChainExtractor).
response1['result']

'To prevent data contamination, it is essential to follow best practices and protocols for data collection, storage, and processing. Implement the following measures:\n\n1. **Data Collection:** Ensure proper training and supervision of data collectors to reduce errors and contamination during the data collection process.\n   \n2. **Quality Control:** Implement quality control measures during data collection, entry, and processing to identify and correct any contamination early on.\n   \n3. **Regular Data Cleaning:** Regularly clean and validate the data to detect and correct any errors or inconsistencies that could lead to contamination.\n   \n4. **Data Storage:** Store data in secure and controlled environments to prevent unauthorized access or tampering that could lead to contamination.\n   \n5. **Data Sharing Protocols:** Establish clear protocols for sharing data to prevent unintentional contamination or misuse of data.\n   \n6. **Documentation:** Maintain detailed documentation of

In [36]:
# Answer from chain2 (retriever compressed with LLMChainFilter).
response2['result']

'To prevent data contamination, it is important to regularly conduct data checks and quality assurance processes on the data you are using. This includes verifying the sources of your data, checking for anomalies or inconsistencies, and ensuring that your data is properly cleaned and validated. Maintaining strict control over data access and monitoring the data inputs can also help in preventing data contamination.'

In [37]:
# Answer from chain3 (retriever compressed with the splitter + embedding-filter pipeline).
response3['result']

"To prevent data contamination, one approach is to carefully vet and verify the data sources you are using. It's important to implement stringent quality checks and validation processes to ensure that the data you are working with is clean and reliable. Additionally, setting up clear guidelines for data collection, annotation, and storage can help reduce the risk of contamination. Regular monitoring and auditing of data sets can also be beneficial in detecting any potential sources of contamination early on."