In [1]:
import os
import re
from collections import defaultdict

from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama

In [2]:
# Folder (relative to the working directory) that is scanned for source PDFs.
pdf_folder_path: str = "./data/"


In [3]:
# Holds the concatenated text of every PDF page; starts out empty.
all_text: str = ""


In [4]:
# Rebuild the corpus from scratch inside this cell so that re-running it never
# appends a second copy of every PDF to an already populated `all_text`.
page_texts = []

# os.listdir() returns entries in arbitrary order; sorting keeps the page order
# (and therefore the chunk boundaries further down) reproducible between runs.
for filename in sorted(os.listdir(pdf_folder_path)):
    # Compare the extension case-insensitively so "PAPER.PDF" is not skipped.
    if not filename.lower().endswith('.pdf'):
        continue
    pdf_path = os.path.join(pdf_folder_path, filename)
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    # Keep the original "text + newline" framing for every loaded document.
    page_texts.extend(doc.page_content + "\n" for doc in docs)

# One join instead of repeated `+=` on an ever-growing string.
all_text = "".join(page_texts)

print(f"Total text length: {len(all_text)} characters.")

Total text length: 565065 characters.


In [5]:
# Splitter configured for a chunk size of 500 with 200 of overlap between
# neighbouring chunks (units follow the splitter's length function —
# presumably characters by default; confirm against the library docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200,
)


In [6]:
# Cut the concatenated PDF text into overlapping pieces.
chunks = text_splitter.split_text(text=all_text)


In [7]:
# Sanity check: how many chunks were produced, and what the first one looks like.
chunk_count = len(chunks)
print(f"Number of chunks created: {chunk_count}")
first_chunk = chunks[0]
print(f"Sample chunk: {first_chunk}")

Number of chunks created: 1835
Sample chunk: Large Language Models: A Survey
Shervin Minaee, Tomas Mikolov, Narjes Nikzad, Meysam Chenaghlu
Richard Socher, Xavier Amatriain, Jianfeng Gao
Abstract—Large Language Models (LLMs) have drawn a
lot of attention due to their strong performance on a wide
range of natural language tasks, since the release of ChatGPT
in November 2022. LLMs’ ability of general-purpose language
understanding and generation is acquired by training billions of


In [8]:
# Wrap each chunk in a Document and tag it with a stable `doc_id`.
# The rank-fusion code in this notebook identifies documents through
# `metadata.get("doc_id")`; without this tag every chunk resolves to None and
# the individual chunks cannot be told apart.
rrf_docs = [
    Document(page_content=chunk, metadata={"doc_id": index})
    for index, chunk in enumerate(chunks)
]


In [9]:
# Embedding model shared by the vector store for both indexing and querying.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [10]:
# Index the chunks into the persisted Chroma collection.
# Explicit, deterministic ids let a re-run overwrite the same records instead
# of appending another full copy of every chunk under fresh random ids.
# NOTE(review): copies written by earlier runs without ids stay in the store;
# delete ./chroma_db/RRF once if the directory was populated before this change.
db2 = Chroma.from_documents(
    rrf_docs,
    embedding=embedding,
    ids=[f"chunk-{index}" for index in range(len(rrf_docs))],
    persist_directory="./chroma_db/RRF",
)


In [11]:
# Re-open the persisted collection from disk with the same embedding function.
db3 = Chroma(
    embedding_function=embedding,
    persist_directory="./chroma_db/RRF",
)


  warn_deprecated(


In [12]:
# Retriever over the re-opened store, configured with k=3 results per query.
retriever = db3.as_retriever(
    search_kwargs=dict(k=3),
)


In [13]:
# LLM client for the "llama3" model, accessed through Ollama.
llm = Ollama(
    model="llama3",
)


In [14]:
def generate_related_queries(original_query, num_queries=3, model=None):
    """Ask an LLM for queries related to ``original_query`` and parse the reply.

    Args:
        original_query: The user's question.
        num_queries: How many related queries to request; also the maximum
            number of queries returned.
        model: Optional LLM to use. It may expose ``invoke(prompt)`` or be
            callable as ``model(prompt)``; either must return the reply text.
            Defaults to the notebook-level ``llm``.

    Returns:
        A list of at most ``num_queries`` non-empty query strings, with
        surrounding whitespace and leading list markers ("1.", "2)", "-",
        "*", "•") removed. Empty when the reply has no usable lines.
    """
    if model is None:
        model = llm

    prompt = f"Generate {num_queries} related queries for: {original_query}"
    # Prefer the `invoke` API when the model offers it; fall back to a plain call.
    invoke = getattr(model, "invoke", None)
    response = invoke(prompt) if callable(invoke) else model(prompt)

    list_marker = re.compile(r"^(?:\d+[.)]|[-*•])\s+")
    listed, unlisted = [], []
    for raw_line in response.splitlines():
        line = raw_line.strip()
        if not line:
            continue  # blank separator lines are not queries
        match = list_marker.match(line)
        if match:
            text = line[match.end():].strip()
            if text:
                listed.append(text)
        else:
            unlisted.append(line)

    # When the reply contains an explicit list, the remaining lines are
    # preamble/closing chatter ("Here are 3 related queries:"), so drop them.
    candidates = listed if listed else unlisted
    return candidates[:max(num_queries, 0)]

In [15]:
def retrieve_results(queries, retriever, top_k=3):
    """Run every query through ``retriever`` and keep its first ``top_k`` hits.

    Args:
        queries: Iterable of query strings.
        retriever: Object exposing ``get_relevant_documents(query)``.
        top_k: Maximum number of results kept per query.

    Returns:
        Dict mapping each query to the list of its leading ``top_k`` results.
    """
    return {
        question: retriever.get_relevant_documents(question)[:top_k]
        for question in queries
    }

In [16]:
def reciprocal_rank_fusion(all_results, k=0):
    """Fuse several ranked result lists into one ranking by summed reciprocal rank.

    A document earns ``1 / (k + rank)`` for each list it appears in (ranks
    start at 1); the contributions are added up across lists.

    Args:
        all_results: Mapping of query -> ranked list of documents. Every
            document must expose ``metadata`` (dict-like or None) and
            ``page_content``.
        k: Non-negative constant added to the rank before inverting. The
            default ``0`` keeps the plain ``1 / rank`` weighting; ``60`` is the
            value commonly used for reciprocal rank fusion.

    Returns:
        List of ``(key, score)`` tuples sorted by descending score. ``key`` is
        the document's ``metadata["doc_id"]`` when present, otherwise its
        ``page_content``.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    combined_scores = defaultdict(float)
    for results in all_results.values():
        for rank, result in enumerate(results, start=1):
            doc_key = (result.metadata or {}).get("doc_id")
            if doc_key is None:
                # Without an id, fall back to the text itself; otherwise every
                # id-less document would be merged under the single key None.
                doc_key = result.page_content
            combined_scores[doc_key] += 1 / (k + rank)

    return sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)

In [17]:
def run_fusion_ranking_pipeline_with_qa(query, retriever, llm, num_related_queries=3, top_k=3):
    """Answer ``query`` from documents selected by multi-query rank fusion.

    Steps: generate related queries, retrieve ``top_k`` documents for each
    query, fuse the per-query rankings, then let the LLM answer from the fused
    documents.

    Args:
        query: The user's question.
        retriever: Retriever passed to ``retrieve_results`` and to the QA chain.
        llm: LLM used to build the QA chain.
        num_related_queries: Number of related queries to request.
        top_k: Number of documents kept per query.

    Returns:
        The answer text produced by the QA chain's document-combining step.
    """
    related_queries = generate_related_queries(query, num_related_queries)

    # Keep the original query first; skip blank lines and repeated queries so
    # the retriever is not called with empty or duplicate input.
    all_queries = [query]
    for candidate in related_queries:
        candidate = candidate.strip()
        if candidate and candidate not in all_queries:
            all_queries.append(candidate)

    all_results = retrieve_results(all_queries, retriever, top_k)
    final_ranked_results = reciprocal_rank_fusion(all_results)

    # Index the retrieved documents under both identifiers the fusion step may
    # report (a `doc_id` from metadata, or the text itself), so the ranking can
    # be mapped back to documents without scanning the whole corpus.
    docs_by_key = {}
    retrieved_in_order = []
    for results in all_results.values():
        for doc in results:
            retrieved_in_order.append(doc)
            doc_id = (doc.metadata or {}).get("doc_id")
            if doc_id is not None:
                docs_by_key.setdefault(doc_id, doc)
            docs_by_key.setdefault(doc.page_content, doc)

    # Fused order first; any retrieved document the ranking could not identify
    # (e.g. one without a usable key) is appended afterwards rather than lost.
    ordered = [docs_by_key[key] for key, _ in final_ranked_results if key in docs_by_key]
    ordered.extend(retrieved_in_order)

    final_documents = []
    seen_contents = set()
    for doc in ordered:
        if doc.page_content not in seen_contents:
            seen_contents.add(doc.page_content)
            final_documents.append(doc)

    qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

    # RetrievalQA fetches its own documents from the retriever and does not
    # read an `input_documents` argument, so the fused documents are handed
    # directly to the chain's document-combining step instead.
    answer = qa_chain.combine_documents_chain.run(
        input_documents=final_documents,
        question=query,
    )

    return answer

In [20]:
# Run the whole pipeline once for a sample question and show the answer.
original_query = "What is Rag model?"
final_answer = run_fusion_ranking_pipeline_with_qa(
    query=original_query,
    retriever=retriever,
    llm=llm,
)
print("Final Answer:", final_answer)