In [38]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI, ChatOpenAI
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.chains import RetrievalQA

In [26]:
# Load key/value pairs from a local .env file into the process environment.
# The OpenAI clients in this notebook are constructed without explicit credentials,
# so they presumably read OPENAI_API_KEY from the environment set up here — TODO confirm.
load_dotenv()

True

In [17]:
# Location of the PDF to index, given relative to the current working directory.
doc_path = "../data/pdf.pdf"

In [None]:
# Parse the PDF at `doc_path` into a list of LangChain Document objects.
loader = PyPDFLoader(doc_path)
docs = loader.load()

# Show only how many documents were loaded. Echoing `docs` itself would print the
# full extracted text of the PDF into the cell output and bloat the notebook.
len(docs)

In [None]:
# Split the loaded documents into small, overlapping chunks for indexing.
# chunk_size / chunk_overlap use the splitter's length function
# (characters by default, per the LangChain docs — confirm for the installed version).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30,
)
chunks = splitter.split_documents(docs)

# Report the chunk count rather than echoing every chunk: with such a small
# chunk_size the full list is long enough to drown the rest of the notebook.
len(chunks)

In [27]:
# Embedding model with library-default settings; it is handed to Chroma below to
# vectorise the chunks. No API key is passed explicitly, so it is presumably taken
# from the environment — TODO confirm.
embeddings = OpenAIEmbeddings()

In [31]:
# Embed every chunk and index it in a Chroma vector store.
# NOTE(review): no persist directory or collection name is given; re-running this
# cell in the same kernel may index the chunks a second time — confirm.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

# Dense (semantic) retriever returning the 3 closest chunks per query.
# The name is misspelt ("retreiver") but is kept because other cells refer to it.
vectorstore_retreiver = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001A122E7FBB0>, search_kwargs={'k': 3})

In [35]:
# Sparse (BM25 keyword) retriever built over the same chunks, limited to the
# top 3 matches per query. `k` is forwarded to the retriever's constructor.
keyword_retriever = BM25Retriever.from_documents(chunks, k=3)
keyword_retriever

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001A124AC06A0>, k=3)

In [36]:
# Hybrid retriever: merges the results of the dense and keyword retrievers.
# Weights are positional — 0.3 applies to the vector-store retriever and
# 0.7 to the BM25 retriever, so keyword matches dominate the combined ranking.
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        vectorstore_retreiver,
        keyword_retriever,
    ],
    weights=[0.3, 0.7],
)
ensemble_retriever

EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001A122E7FBB0>, search_kwargs={'k': 3}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001A124AC06A0>, k=3)], weights=[0.3, 0.7])

In [39]:
# Chat model shared by both QA chains below, created with library-default settings
# (no model name or temperature is specified here).
llm = ChatOpenAI()

In [40]:
# Baseline QA chain: answers using only the dense vector-store retriever.
# chain_type="stuff" selects the chain variant that places all retrieved chunks
# into a single prompt.
normal_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore_retreiver,
)

In [41]:
# Hybrid QA chain: same LLM and chain type as the baseline, but context comes
# from the ensemble (vector + BM25) retriever.
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=ensemble_retriever,
)

In [43]:
# Ask the baseline (vector-only) chain the test question. The input is passed as
# an explicit dict under the chain's "query" key; the result dict echoes the
# query and carries the answer under "result".
response1 = normal_chain.invoke({"query": "What is Dataset Contamination?"})
response1

{'query': 'What is Dataset Contamination?',
 'result': 'Dataset contamination refers to the presence of irrelevant, misleading, or incorrect data within a dataset used for training or evaluation in machine learning or data analysis tasks. This contaminated data can negatively impact the performance and accuracy of models trained on the dataset.'}

In [44]:
# Ask the hybrid (vector + BM25) chain the same question. The answer is stored
# under its own name so the baseline answer in `response1` is not overwritten
# and the two results can be compared side by side.
response2 = hybrid_chain.invoke("What is Dataset Contamination?")
response2

{'query': 'What is Dataset Contamination?',
 'result': "Dataset contamination refers to a situation where the training data used to develop a machine learning model contains information or patterns from the evaluation data that the model will later be tested on. This can lead to the model performing better on the evaluation data than it would in a real-world scenario where it encounters previously unseen data. Controlling for dataset contamination is important in order to ensure that the model's performance is reliable and generalizable."}