In [1]:
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25

# Install compatible versions of langchain-core and langchain-openai
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install python-dotenv==1.0.1

# new
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

%pip install langchain-google-genai

Found existing installation: langchain-core 0.3.13
Uninstalling langchain-core-0.3.13:
  Successfully uninstalled langchain-core-0.3.13
Found existing installation: langchain-openai 0.2.1
Uninstalling langchain-openai-0.2.1:
  Successfully uninstalled langchain-openai-0.2.1
Found existing installation: langchain-experimental 0.3.2
Uninstalling langchain-experimental-0.3.2:
  Successfully uninstalled langchain-experimental-0.3.2
Found existing installation: langchain-community 0.3.1
Uninstalling langchain-community-0.3.1:
  Successfully uninstalled langchain-community-0.3.1
Found existing installation: langchain 0.3.1
Uninstalling langchain-0.3.1:
  Successfully uninstalled langchain-0.3.1
Found existing installation: chromadb 0.5.11
Uninstalling chromadb-0.5.11:
  Successfully uninstalled chromadb-0.5.11
[0mFound existing installation: python-dotenv 1.0.1
Uninstalling python-dotenv-1.0.1:
  Successfully uninstalled python-dotenv-1.0.1
[0mFound existing installation: rank-bm25 0.2.2
U

In [2]:
import os
os.environ['USER_AGENT'] = 'RAGUserAgent'

import openai
import chromadb

# core
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.prompts import PromptTemplate

# community
from langchain_community.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever

# langchain
import langchain
from langchain import hub
from langchain_openai import ChatOpenAI

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from PyPDF2 import PdfReader
from google.colab import userdata

In [5]:
def format_docs(docs):
  return "\n\n".join(doc.page_content for doc in docs)

def extract_score(llm_output):
  score = 0
  try:
    score = float(llm_output.strip())
  except ValueError:
    pass

  return score

def conditional_answer(x):
  relevance_score = extract_score(x['relevance_score'])
  if relevance_score < 4:
    return "I have no idea"
  else:
    return x['answer']

In [15]:
class RagPipeline:
  def __init__(self, pdf_path='/content/sample_data/google-2023-environmental-report.pdf'):
    os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
    openai.api_key = userdata.get('OPENAI_API_KEY')

    self.llm = ChatOpenAI(model_name='gpt-4o-mini', temperature=0)
    self.gemini_embedding = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
    self.str_output_parser = StrOutputParser()
    self.prompt = hub.pull('jclemens24/rag-prompt')
    self.pdf_path = pdf_path

    self.relevance_prompt_template = PromptTemplate.from_template(
        """
          Given the following question and retrieved context, determine if the context is relevant to the question.
          Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
          Return ONLY the numeric score, without any additional text or explanation.

          Question: {question}
          Retrieved Context: {retrieved_context}

          Relevance Score:
        """
    )


  def _get_sources(self):
    pdf_reader = PdfReader(self.pdf_path)
    text = ""
    for page in pdf_reader.pages:
      text += page.extract_text()

    splits = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=200
    ).split_text(text)

    documents = [Document(page_content=text,metadata={"id": str(i)} ) for i, text in enumerate(splits)]
    return documents

  def _indexing(self):
    documents = self._get_sources()
    vector_stores = Chroma.from_documents(
       documents = documents,
       embedding = self.gemini_embedding,
       collection_name = 'google_environmental_report',
       client = chromadb.Client()
    )

    #  Dense retrievers use dense vector representations to encode documents and queries. These vectors capture semantic meaning, allowing the retriever to understand the context and nuances of the text
    dense_retriever=vector_stores.as_retriever(search_kwargs={"k": 10})
    # Sparse retrievers, like BM25, use sparse vector representations based on term frequency and inverse document frequency (TF-IDF). These vectors are typically high-dimensional but sparse, meaning most of the values are zero
    sparse_retriever = BM25Retriever.from_documents(documents, k=10)

    return dense_retriever, sparse_retriever

  # Custom hybrid search function (as opposed to using LangChain EnsembleRetriever)
  def hybrid_search(self,query, k=10, dense_weight=0.5, sparse_weight=0.5):
    dense_retriever, sparse_retriever = self._indexing()

    # Step 1: Retrieve the top-k documents from both dense search and sparse search.
    dense_docs = dense_retriever.get_relevant_documents(query)[:k]
    dense_doc_ids = [doc.metadata['id'] for doc in dense_docs]
    print("\nCompare IDs:")
    print("dense IDs: ", dense_doc_ids)
    sparse_docs = sparse_retriever.get_relevant_documents(query)[:k]
    sparse_doc_ids = [doc.metadata['id'] for doc in sparse_docs]
    print("sparse IDs: ", sparse_doc_ids)

    # Combine the document IDs and remove duplicates
    all_doc_ids = list(set(dense_doc_ids + sparse_doc_ids))

    # Create dictionaries to store the reciprocal ranks
    dense_reciprocal_ranks = {doc_id: 0.0 for doc_id in all_doc_ids}
    sparse_reciprocal_ranks = {doc_id: 0.0 for doc_id in all_doc_ids}

    # Step 2: Calculate the reciprocal rank for each document in dense and sparse search results.
    for i, doc_id in enumerate(dense_doc_ids):
        dense_reciprocal_ranks[doc_id] = 1.0 / (i + 1)

    for i, doc_id in enumerate(sparse_doc_ids):
        sparse_reciprocal_ranks[doc_id] = 1.0 / (i + 1)

    # Step 3: Sum the reciprocal ranks for each document.
    combined_reciprocal_ranks = {doc_id: 0.0 for doc_id in all_doc_ids}
    for doc_id in all_doc_ids:
        combined_reciprocal_ranks[doc_id] = dense_weight * dense_reciprocal_ranks[doc_id] + sparse_weight * sparse_reciprocal_ranks[doc_id]

    # Step 4: Sort the documents based on their combined reciprocal rank scores.
    sorted_doc_ids = sorted(all_doc_ids, key=lambda doc_id: combined_reciprocal_ranks[doc_id], reverse=True)

    # Step 5: Retrieve the documents based on the sorted document IDs.
    sorted_docs = []
    all_docs = dense_docs + sparse_docs
    for doc_id in sorted_doc_ids:
        matching_docs = [doc for doc in all_docs if doc.metadata['id'] == doc_id]
        if matching_docs:
            doc = matching_docs[0]
            doc.metadata['score'] = combined_reciprocal_ranks[doc_id]
            doc.metadata['rank'] = sorted_doc_ids.index(doc_id) + 1
            if len(matching_docs) > 1:
                doc.metadata['retriever'] = 'both'
            elif doc in dense_docs:
                doc.metadata['retriever'] = 'dense'
            else:
                doc.metadata['retriever'] = 'sparse'
            sorted_docs.append(doc)

    # Step 7: Return the final ranked and sorted list, truncated by the top-k parameter
    return sorted_docs[:k]

  def chaining(self):

    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | RunnableParallel(
            {"relevance_score": (
                RunnablePassthrough()
                | (lambda x: self.relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
                | self.llm
                | self.str_output_parser
            ), "answer": (
                RunnablePassthrough()
                | self.prompt
                | self.llm
                | self.str_output_parser
            )}
        )
        | RunnablePassthrough().assign(final_answer=conditional_answer)
    )

    rag_chain_with_source = RunnableParallel(
        {'context': self.hybrid_search, 'question': RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

    return rag_chain_with_source



In [16]:
user_query="What are Google's environmental initiatives?"
rag = RagPipeline()


# User Query
result = rag.chaining().invoke(user_query)
relevance_score = result['answer']['relevance_score']
final_answer = result['answer']['final_answer']
retrieved_docs = result['context']

print(f"\nOriginal Question: {user_query}\n")
print(f"Relevance Score: {relevance_score}\n")
print(f"Final Answer:\n{final_answer}\n\n")

print("Retrieved Documents:")
for i, doc in enumerate(retrieved_docs, start=1):
    doc_id = doc.metadata['id']
    doc_score = doc.metadata.get('score', 'N/A')
    doc_rank = doc.metadata.get('rank', 'N/A')
    doc_retriever = doc.metadata.get('retriever', 'N/A')
    print(f"Document {i}: Document ID: {doc_id} Score: {doc_score} Rank: {doc_rank} Retriever: {doc_retriever}\n")
    print(f"Content:\n{doc.page_content}\n")


  dense_docs = dense_retriever.get_relevant_documents(query)[:k]



Compare IDs:
dense IDs:  ['307', '308', '276', '117', '17', '69', '12', '0', '22', '70']
sparse IDs:  ['150', '309', '298', '311', '328', '415', '139', '432', '91', '22']

Original Question: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
Google's environmental initiatives include a variety of programs and strategies aimed at promoting sustainability and reducing environmental impact. Key initiatives include:

1. **Employee Engagement**: Google fosters a culture of sustainability by providing employees with opportunities to engage in environmental issues, participate in sustainability courses, and work on projects related to sustainability, such as Project Sunroof.

2. **Supplier Engagement**: Google works with suppliers to build a low-carbon, circular supply chain. This includes helping suppliers reduce energy consumption and greenhouse gas emissions, as well as requiring them to report environmental data.

3. **Community Involvement**: Google engages 