In [27]:
import langchain
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
import re
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.load import dumps, loads
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from flashrank.Ranker import Ranker

In [2]:
load_dotenv()

True

In [3]:
doc_path = '../data/pdf.pdf'

In [4]:
def reciprocal_rank_fusion(results: list[list], k=60):
    fused_scores = {}
    for docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            previous_score = fused_scores[doc_str]
            fused_scores[doc_str] += 1 / (rank + k)

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    return reranked_results

In [5]:
prompt = ChatPromptTemplate(input_variables=['original_query'],
                            messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[],template='You are a helpful assistant that generates multiple search queries based on a single input query.')),
                            HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['original_query'], template='Generate multiple search queries related to: {question} \n OUTPUT (3 queries):'))])

In [6]:
def preprocess_text(text: str) -> str:

    text = text.lower()
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\n', ' ').strip()
    return text

In [8]:
loader=PyPDFLoader(doc_path)
docs=loader.load()

In [9]:
for doc in docs:
    doc.page_content = preprocess_text(doc.page_content)

In [10]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=30)
chunks = splitter.split_documents(docs)

In [11]:
embeddings = OpenAIEmbeddings()

In [12]:
vectorstore=Chroma.from_documents(chunks,embeddings)
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 5})
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001A5A4060B50>, search_kwargs={'k': 5})

In [13]:
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k = 3
keyword_retriever

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001A5A2E36790>, k=3)

In [15]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,keyword_retriever],weights=[0.3, 0.7])
ensemble_retriever

EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001A5A4060B50>, search_kwargs={'k': 5}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001A5A2E36790>, k=3)], weights=[0.3, 0.7])

In [16]:
llm = ChatOpenAI()

In [17]:
generate_queries = (
    prompt | llm | StrOutputParser() | (lambda x: x.split("\n"))
)
generate_queries

ChatPromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are a helpful assistant that generates multiple search queries based on a single input query.'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='Generate multiple search queries related to: {question} \n OUTPUT (3 queries):'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000001A5A2E36D60>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000001A5A2E36580>, root_client=<openai.OpenAI object at 0x000001A5A2E36940>, root_async_client=<openai.AsyncOpenAI object at 0x000001A5A2E36BE0>, model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser()
| Runn

In [18]:
ragfusion_chain = generate_queries | ensemble_retriever.map() | reciprocal_rank_fusion
langchain.debug = True
ragfusion_chain.input_schema.schema()

C:\Users\Hp\AppData\Local\Temp\ipykernel_10060\249600610.py:3: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  ragfusion_chain.input_schema.schema()


{'properties': {'question': {'title': 'Question', 'type': 'string'}},
 'required': ['question'],
 'title': 'PromptInput',
 'type': 'object'}

In [19]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

full_rag_fusion_chain = (
    {
        "context": ragfusion_chain,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

full_rag_fusion_chain.input_schema.schema()

C:\Users\Hp\AppData\Local\Temp\ipykernel_10060\690582261.py:18: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  full_rag_fusion_chain.input_schema.schema()


{'properties': {'question': {'title': 'Question', 'type': 'string'},
  'root': {'title': 'Root'}},
 'required': ['question', 'root'],
 'title': 'RunnableParallel<context,question>Input',
 'type': 'object'}

In [None]:
ranker = Ranker() # I have used Nano model. Small, medium and large options are also available.

small_ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="/opt")
medium_ranker = Ranker(model_name="rank-T5-flan", cache_dir="/opt")
large_ranker = Ranker(model_name="ms-marco-MultiBERT-L-12", cache_dir="/opt")

In [31]:
compressor = FlashrankRerank()

INFO:flashrank.Ranker:Downloading ms-marco-MultiBERT-L-12...
ms-marco-MultiBERT-L-12.zip: 100%|██████████| 98.7M/98.7M [00:31<00:00, 3.33MiB/s]


In [33]:
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=ensemble_retriever)

In [37]:
response1 = full_rag_fusion_chain.invoke("How to prevent data contamination?")

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "How to prevent data contamination?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "How to prevent data contamination?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "How to prevent data contamination?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence > prompt:ChatPromptTemplate] Entering Prompt run with input:
[0m{
  "input": "How to prevent data contamination?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence > prompt:ChatPromptTemplate] [2ms] Exiting Prompt run with output:
[0m[outputs

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[36;1m[1;3m[llm/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence > llm:ChatOpenAI] [660ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "1. Techniques for data cleanliness and integrity maintenance \n2. Best practices for mitigating data contamination risks \n3. Steps to ensure data purity and accuracy",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "1. Techniques for data cleanliness and integrity maintenance \n2. Best practices for mitigating data contamination risks \n3. Steps to ensure data purity and accuracy",
            "additional_kwargs": {
            

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence > chain:RunnableEach<EnsembleRetriever>] [538ms] Exiting Chain run with output:
[0m[outputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence > chain:reciprocal_rank_fusion] Entering Chain run with input:
[0m[inputs]
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence > chain:reciprocal_rank_fusion] [8ms] Exiting Chain run with output:
[0m[outputs]
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence] [1.21s] Exiting Chain run with output:
[0m[outputs]
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] [1.22s] Exiting Chain run with output:
[0m[outputs]
[32;1m[1;3m[chain/start][0m [1m[c

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[36;1m[1;3m[llm/end][0m [1m[chain:RunnableSequence > llm:ChatOpenAI] [1.59s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "To prevent data contamination, it is important to carefully examine a set of examples, compare annotations provided by humans with samples generated by models through manual scrutiny, and conduct data checks even when using vendors to source annotations. Additionally, employing a process similar to general fine-tuning methods with notable differences related to safety concerns can help mitigate data contamination.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "To prevent data contamination, it

In [38]:
response1

'To prevent data contamination, it is important to carefully examine a set of examples, compare annotations provided by humans with samples generated by models through manual scrutiny, and conduct data checks even when using vendors to source annotations. Additionally, employing a process similar to general fine-tuning methods with notable differences related to safety concerns can help mitigate data contamination.'