In [2]:
# Environment setup: upgrade pip, remove any previously installed copies of the
# packages used below, then reinstall them at pinned versions.
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25 faiss-cpu weaviate-client langchain-weaviate

# Install compatible versions of langchain-core and langchain-openai
# NOTE(review): the recorded output of this cell shows langchain-core 0.3.15
# being uninstalled, so an earlier run ended with a newer core than the 0.3.6
# pinned here. Presumably a later install in this cell pulled it in — verify
# with `%pip list` after running.
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install python-dotenv==1.0.1
# NOTE(review): this is the only install that passes -q and --user; confirm the
# --user target is intended, since it may differ from where the other packages
# are installed.
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

# new vector stores
%pip install faiss-cpu==1.8.0.post1
%pip install weaviate-client==4.8.1
%pip install langchain-weaviate==0.0.3

# google embeddings
# NOTE(review): unpinned, unlike every other package in this cell; pin a version
# known to work with langchain-core 0.3.x so reruns are reproducible.
%pip install langchain-google-genai

%pip install --upgrade langchain-together==0.2.0

# Restart the kernel after installation

Found existing installation: langchain-core 0.3.15
Uninstalling langchain-core-0.3.15:
  Successfully uninstalled langchain-core-0.3.15
Found existing installation: langchain-openai 0.2.1
Uninstalling langchain-openai-0.2.1:
  Successfully uninstalled langchain-openai-0.2.1
Found existing installation: langchain-experimental 0.3.2
Uninstalling langchain-experimental-0.3.2:
  Successfully uninstalled langchain-experimental-0.3.2
Found existing installation: langchain-community 0.3.1
Uninstalling langchain-community-0.3.1:
  Successfully uninstalled langchain-community-0.3.1
Found existing installation: langchain 0.3.1
Uninstalling langchain-0.3.1:
  Successfully uninstalled langchain-0.3.1
Found existing installation: chromadb 0.5.11
Uninstalling chromadb-0.5.11:
  Successfully uninstalled chromadb-0.5.11
Found existing installation: python-dotenv 1.0.1
Uninstalling python-dotenv-1.0.1:
  Successfully uninstalled python-dotenv-1.0.1
Found existing installation: PyPDF2 3.0.1
Uninstal

Collecting rank_bm25==0.2.2
  Using cached rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Using cached rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Collecting faiss-cpu==1.8.0.post1
  Using cached faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Using cached faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1
Collecting weaviate-client==4.8.1
  Using cached weaviate_client-4.8.1-py3-none-any.whl.metadata (3.6 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-health-checking<2.0.0,>=1.57.0->weaviate-client==4.8.1)
  Using cached protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Using cached weaviate_client-4.8.1-py3-none-any.whl (374 kB)
Using cached protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl (316 kB)
I

In [1]:
import os
from enum import Enum

from PyPDF2 import PdfReader
import chromadb

os.environ['USER_AGENT'] = 'RAGUserAgent'

# openai
import openai

# langchain
import langchain
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_together import ChatTogether
from langchain.retrievers import EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter

# core
from langchain_core.documents.base import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# community
from langchain_community.retrievers import BM25Retriever,WikipediaRetriever
from langchain_community.vectorstores import Chroma,FAISS,Weaviate

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from google.colab import userdata
from tqdm import tqdm

import weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore
from weaviate.embedded import EmbeddedOptions



In [10]:
class ModelType(Enum):
  """Identifiers for the chat model a pipeline should use."""

  OPENAI = "openai"
  LLAMA = "llama"
  MISTRAL = "mistral"

In [2]:
class VectorType(Enum):
  """Identifiers for the supported vector store backends."""

  FAISS = "faiss"
  CHROMA = "chroma"
  WEAVIATE = "weaviate"


In [3]:
class RetrieverType(Enum):
  """Identifiers for the retrieval strategies a pipeline can request."""

  # MMR (maximal marginal relevance) retrieves items relevant to the query
  # while avoiding redundancy among the results.
  MMR = "mmr"
  BM25 = "bm25"
  DEFAULT = "default"
  SIMILARITY = "similarity_score_threshold"

In [4]:
def format_docs(docs):
  """Join the `page_content` of every document in `docs`, separated by a blank line.

  Args:
    docs: iterable of objects exposing a `page_content` string attribute.

  Returns:
    A single string; empty when `docs` is empty.
  """
  contents = [document.page_content for document in docs]
  separator = "\n\n"
  return separator.join(contents)

def extract_score(llm_output):
  """Parse a numeric relevance score out of raw LLM output.

  The whole (stripped) text is tried as a float first. If that fails, the
  first unsigned number found in the text is used instead, so replies such as
  "Relevance Score: 4" or "4/5" still yield 4.0 rather than collapsing to 0.

  Args:
    llm_output: text returned by the model; non-string values are tolerated.

  Returns:
    The parsed score as a float, or 0 when no number can be recovered.
  """
  import re  # local import so this cell does not depend on the import cell

  if not isinstance(llm_output, str):
    # None or an already-numeric value: coerce when possible, never raise.
    try:
      return float(llm_output)
    except (TypeError, ValueError):
      return 0

  text = llm_output.strip()
  try:
    return float(text)
  except ValueError:
    pass

  # Fallback for chatty replies: take the first number that appears.
  # Limitation: for text like "on a 1-5 scale: 3" this picks 1, not 3.
  match = re.search(r"\d+(?:\.\d+)?", text)
  return float(match.group()) if match else 0

def conditional_answer(x):
  """Return the generated answer only when the context was judged relevant.

  Args:
    x: mapping with a 'relevance_score' entry (raw text, parsed by
      extract_score) and an 'answer' entry.

  Returns:
    "I have no idea" when the parsed score is below 4, otherwise x['answer'].
  """
  score = extract_score(x['relevance_score'])
  return "I have no idea" if score < 4 else x['answer']

In [5]:
def get_retriever(vector, documents, embedding, collection_name, description_name, search_type, search_kwargs):
  """Build a vector store via get_vector and wrap it in a retriever.

  Args:
    vector: VectorType selecting the store backend.
    documents: documents to index.
    embedding: embedding model handed to the store.
    collection_name: collection/class name forwarded to get_vector.
    description_name: description forwarded to get_vector.
    search_type: RetrieverType selecting the retrieval mode.
    search_kwargs: options passed to `as_retriever` as `search_kwargs`
      (not forwarded in the MMR branch).

  Returns:
    The object returned by the store's `as_retriever`.
  """
  store = get_vector(vector, documents, embedding, collection_name, description_name)

  if search_type == RetrieverType.MMR:
    # The MMR branch does not forward search_kwargs.
    retriever_options = {"search_type": "mmr"}
  elif search_type == RetrieverType.SIMILARITY:
    retriever_options = {
        "search_type": "similarity_score_threshold",
        "search_kwargs": search_kwargs,
    }
  else:
    # Every other RetrieverType (BM25, DEFAULT) uses the store's default search.
    retriever_options = {"search_kwargs": search_kwargs}

  return store.as_retriever(**retriever_options)


In [6]:
def get_vector(vector,documents, embedding,collection_name,description_name):
  """Create the vector store selected by `vector`.

  Args:
    vector: VectorType member choosing the backend.
    documents: documents to index.
    embedding: embedding model handed to the store.
    collection_name: collection/class name (used by Chroma and Weaviate).
    description_name: class description (used by Weaviate only).

  Returns:
    The vector store built by the matching create_* helper.

  Raises:
    ValueError: if `vector` is not a supported VectorType. Without this the
      function would return None and the caller would fail later with an
      unrelated AttributeError.
  """
  if vector == VectorType.FAISS:
    return create_faiss_vectorstore(documents, embedding)
  if vector == VectorType.CHROMA:
    return create_chroma_vectorstore(documents, embedding, collection_name)
  if vector == VectorType.WEAVIATE:
    return create_weviate_vectorstore(documents,embedding,collection_name,description_name)
  raise ValueError(f"Unsupported vector store type: {vector!r}")

In [7]:
def create_faiss_vectorstore(documents, embedding):
  """Build a FAISS vector store from `documents` using `embedding`.

  Args:
    documents: documents to index.
    embedding: embedding model used to vectorise the documents.

  Returns:
    The store returned by `FAISS.from_documents`.
  """
  faiss_store = FAISS.from_documents(documents=documents, embedding=embedding)
  return faiss_store


In [8]:
def create_chroma_vectorstore(documents, embedding, collection_name):
  """Build a Chroma vector store backed by a fresh `chromadb.Client()`.

  Args:
    documents: documents to index.
    embedding: embedding model used to vectorise the documents.
    collection_name: name of the Chroma collection to write into.

  Returns:
    The store returned by `Chroma.from_documents`.
  """
  store_options = {
      "documents": documents,
      "embedding": embedding,
      "collection_name": collection_name,
      "client": chromadb.Client(),
  }
  return Chroma.from_documents(**store_options)

In [9]:
def create_weviate_vectorstore(documents,
                               embedding,
                               collection_name,
                               description_name):
  """Create a Weaviate class, load `documents` into it and return a store.

  An embedded Weaviate client is started, any existing class named
  `collection_name` is dropped (best effort), the class is recreated with
  `text`, `doc_id` and `source` properties, and every document is inserted
  together with its embedding vector.

  Args:
    documents: iterable of documents; each must expose `page_content` and
      carry "doc_id" and "source" keys in its `metadata`.
    embedding: embedding model; `embed_documents` is used for the stored
      documents and the model is also handed to the returned store.
    collection_name: Weaviate class name, also used as the store's index name.
    description_name: description attached to the Weaviate class.

  Returns:
    The langchain `Weaviate` vector store bound to the populated class.

  Raises:
    KeyError: if a document lacks the "doc_id" or "source" metadata key.
  """
  weaviate_client = weaviate.Client(embedded_options=EmbeddedOptions())

  # Best-effort reset so reruns start from an empty class. Only ordinary errors
  # are ignored; KeyboardInterrupt/SystemExit still propagate.
  try:
    weaviate_client.schema.delete_class(collection_name)
  except Exception as error:
    print(f"Skipping reset of Weaviate class {collection_name!r}: {error}")

  structure = {
      "class": collection_name,
      "description": description_name,
      "properties": [
          {
              "name": "text",
              "dataType": ["text"],
              "description": "The text content of the document"
          },
          {
              "name": "doc_id",
              "dataType": ["string"],
              "description": "Document Id"
          },
          {
              "name": "source",
              "dataType": ["string"],
              "description": "Source of the document"
          }
      ]
  }

  weaviate_client.schema.create_class(structure)
  vector_store= Weaviate(
      client=weaviate_client,
      embedding=embedding,
      index_name=collection_name,
      text_key="text",
      attributes=["doc_id", "source"],
      by_text=False
  )

  # Materialise once so the input can be both embedded and iterated.
  docs = list(documents)
  if docs:
    # Embed all documents through the document-side API in one batched call,
    # rather than calling the query-side `embed_query` once per document.
    vectors = embedding.embed_documents([doc.page_content for doc in docs])

    weaviate_client.batch.configure(batch_size=100)
    with weaviate_client.batch as batch:
      for doc, vector in tqdm(zip(docs, vectors),
                              total=len(docs),
                              desc="Processing documents"):
        batch.add_data_object(
            data_object={
                "text": doc.page_content,
                "doc_id": doc.metadata["doc_id"],
                "source": doc.metadata["source"]
            },
            class_name=collection_name,
            vector=vector
        )

  return vector_store

In [14]:
class RagPipeline:
  """Retrieval-augmented generation pipeline over a single PDF document.

  Typical use: `retrieve()` builds a hybrid (dense + BM25) retriever from the
  PDF, `augment(retriever)` wires it into a chain that scores relevance and
  answers, and `generate(question, chain)` runs the chain and prints the result.
  """

  # Settings used by retrieve(); override on a subclass or instance to tune.
  COLLECTION_NAME = 'Google_Environment_report'
  COLLECTION_DESCRIPTION = 'Google Environment report as of 2023'
  CHUNK_SIZE = 1000
  CHUNK_OVERLAP = 200
  SCORE_THRESHOLD = 0.5
  DENSE_WEIGHT = 0.7
  SPARSE_WEIGHT = 0.3

  def __init__(self, source,vector_type,retriever_type, model_type):
    """Configure API keys, embeddings, the chat model and the prompts.

    Args:
      source: path of the PDF to index.
      vector_type: VectorType selecting the dense vector store.
      retriever_type: RetrieverType selecting the dense retrieval mode.
      model_type: ModelType selecting the chat model.

    Raises:
      ValueError: if `model_type` is not a supported ModelType.
    """
    # Secrets come from the Colab secret store and are exported as environment
    # variables for the client libraries.
    for key_name in ('OPENAI_API_KEY', 'GOOGLE_API_KEY', 'TOGETHER_API_KEY'):
      os.environ[key_name] = userdata.get(key_name)
    openai.api_key = os.environ['OPENAI_API_KEY']

    self.retriever_type = retriever_type
    self.vector_type = vector_type
    self.source = source
    self.str_output_parser = StrOutputParser()
    self.gemini_embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

    if model_type == ModelType.OPENAI:
      self.llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    elif model_type == ModelType.LLAMA:
      self.llm = ChatTogether(model="meta-llama/Llama-3-70b-chat-hf", temperature=0)
    elif model_type == ModelType.MISTRAL:
      self.llm = ChatTogether(model="mistralai/Mixtral-8x22B-Instruct-v0.1", temperature=0)
    else:
      # Fail here instead of leaving self.llm unset and crashing in augment().
      raise ValueError(f"Unsupported model type: {model_type!r}")

    self.prompt = hub.pull('jclemens24/rag-prompt')
    print(self.prompt)

    self.relevance_prompt_template = PromptTemplate.from_template(
        """
          Given the following question and retrieved context, determine if the context is relevant to the question.
          Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
          Return ONLY the numeric score, without any additional text or explanation.

          Question: {question}
          Retrieved Context: {retrieved_context}

          Relevance Score:
        """
    )

    # Metadata key holding the chunk id: the Weaviate store is created with a
    # "doc_id" property, the other stores use "id".
    self.id = "doc_id" if self.vector_type == VectorType.WEAVIATE else "id"

  def retrieve(self):
    """Index the PDF and return a hybrid dense + BM25 ensemble retriever.

    The PDF text is split into overlapping chunks. The chunks are indexed once
    in the configured vector store (through get_retriever) and once in an
    in-process BM25 retriever; the two are combined with fixed weights.

    Returns:
      An EnsembleRetriever over the dense and sparse retrievers.
    """
    # `or ""` guards against a page yielding no text.
    full_text = "".join(
        page.extract_text() or "" for page in PdfReader(self.source).pages
    )

    splitter = RecursiveCharacterTextSplitter(chunk_size=self.CHUNK_SIZE,
                                              chunk_overlap=self.CHUNK_OVERLAP)
    splits = splitter.split_text(full_text)

    dense_documents = [
        Document(page_content=text, metadata={self.id: str(i), "source": "dense"})
        for i, text in enumerate(splits)
    ]
    sparse_documents = [
        Document(page_content=text, metadata={self.id: str(i), "source": "sparse"})
        for i, text in enumerate(splits)
    ]

    # get_retriever builds the vector store itself, so the store is created
    # exactly once. Building it separately as well would embed every chunk
    # twice and could insert duplicates into a shared collection.
    dense_retriever = get_retriever(self.vector_type,
                                    dense_documents,
                                    self.gemini_embeddings,
                                    self.COLLECTION_NAME,
                                    self.COLLECTION_DESCRIPTION,
                                    self.retriever_type,
                                    {"score_threshold": self.SCORE_THRESHOLD})

    sparse_retriever = BM25Retriever.from_documents(sparse_documents)
    return EnsembleRetriever(
        retrievers=[dense_retriever, sparse_retriever],
        weights=[self.DENSE_WEIGHT, self.SPARSE_WEIGHT]
    )

  def augment(self, retriever):
    """Wrap `retriever` in a chain that scores relevance and answers.

    Args:
      retriever: retriever supplying the context documents.

    Returns:
      A runnable that, invoked with a question string, yields a dict with
      "context" (retrieved documents), "question" and "answer", where "answer"
      holds "relevance_score", "answer" and "final_answer".
    """
    relevance_chain = (
        RunnablePassthrough()
        | (lambda x: self.relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
        | self.llm
        | self.str_output_parser
    )
    answer_chain = (
        RunnablePassthrough()
        | self.prompt
        | self.llm
        | self.str_output_parser
    )

    rag_chain_from_docs = (
        # Flatten the retrieved documents into one context string first.
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        # Score relevance and draft the answer from the same inputs.
        | RunnableParallel({"relevance_score": relevance_chain, "answer": answer_chain})
        # Keep the drafted answer only if the context was relevant enough.
        | RunnablePassthrough().assign(final_answer=conditional_answer)
    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "question": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

    return rag_chain_with_source

  def generate(self, question, chain):
    """Run `chain` on `question` and print the score, answer and sources.

    Args:
      question: the user question passed to `chain.invoke`.
      chain: runnable produced by augment().

    Returns:
      None; all results are printed.
    """
    result = chain.invoke(question)
    print(result)
    retrieved_docs = result['context']

    print(f"Original Question: {question}\n")
    print(f"Relevance Score: {result['answer']['relevance_score']}\n")
    print(f"Final Answer:\n{result['answer']['final_answer']}\n\n")
    print("Retrieved Documents:")
    for i, doc in enumerate(retrieved_docs, start=1):
      # self.id already names the right metadata key for the chosen store.
      print(f"Document {i}: Document ID: {doc.metadata[self.id]} source: {doc.metadata['source']}")
      print(f"Content:\n{doc.page_content}\n")


In [17]:
# Build the pipeline over the Google 2023 environmental report: Chroma as the
# dense store, similarity-score-threshold retrieval, Llama as the chat model.
rag = RagPipeline(
    source="/content/sample_data/google-2023-environmental-report.pdf",
    vector_type=VectorType.CHROMA,
    retriever_type=RetrieverType.SIMILARITY,
    model_type=ModelType.LLAMA,
)

input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'jclemens24', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '1a1f3ccb9a5a92363310e3b130843dfb2540239366ebe712ddd94982acc06734'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]




In [None]:
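# Alternative: FAISS-backed pipeline (uncomment to use).
#rag = RagPipeline(source='/content/sample_data/google-2023-environmental-report.pdf',vector_type=VectorType.FAISS,retriever_type=RetrieverType.SIMILARITY,model_type=ModelType.LLAMA)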

In [None]:
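# Alternative: Weaviate-backed pipeline (uncomment to use).
#rag = RagPipeline(source='/content/sample_data/google-2023-environmental-report.pdf',vector_type=VectorType.WEAVIATE,retriever_type=RetrieverType.SIMILARITY,model_type=ModelType.LLAMA)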

In [18]:
# Index the report, wrap the retriever in the RAG chain, then ask one question
# and print the relevance score, final answer and retrieved chunks.
retriever = rag.retrieve()
augmentor = rag.augment(retriever=retriever)
rag.generate(
    question="What are Google's environmental initiatives?",
    chain=augmentor,
)


{'context': [Document(metadata={'id': '307', 'source': 'dense'}, page_content='Employee \nengagement\nSustainability is part of our culture, and we give our \nemployees opportunities to engage on environmental issues and put their passions into practice.\nTo celebrate Earth Day, we host an annual virtual event \nfor employees to learn more about what Google is doing to empower people to make more sustainable choices, support our partners and customers, and operate our business more sustainably. Throughout the year, we invite thought leaders and experts to speak about the latest sustainability trends, and we post some of these talks publicly on our Talks at Google YouTube channel. Employees can also access online sustainability courses and can join global and local internal community groups focused on sustainability topics. Some employees also take on a 20% project —an opportunity to work'), Document(metadata={'id': '150', 'source': 'sparse'}, page_content='sustainability, and we’re par

In [None]:
# Extra pinned dependency, installed quietly; presumably required by the
# WikipediaRetriever demo in the next cell — confirm.
%pip install --upgrade --quiet wikipedia==1.4.0

In [None]:
def WikipediaExtraction(query="What are the top 10 haunted places in Europe?"):
  """Query Wikipedia through WikipediaRetriever and print the first hit.

  Args:
    query: search text sent to the retriever; defaults to the demo question.

  Returns:
    None; the title, summary, source and page content of the first returned
    document are printed, or a short notice when nothing is returned.
  """
  retriever = WikipediaRetriever(load_max_docs=10)
  # `invoke` is the current retriever entry point; `get_relevant_documents`
  # is deprecated.
  docs = retriever.invoke(query)
  if not docs:
    # Avoid an IndexError on docs[0] when the search finds nothing.
    print(f"No Wikipedia documents found for: {query}")
    return

  first_doc = docs[0]
  metadata_title = first_doc.metadata['title']
  metadata_summary = first_doc.metadata['summary']
  metadata_source = first_doc.metadata['source']
  page_content = first_doc.page_content

  print("First document returned:\n")
  print(f"Title: {metadata_title}\n")
  print(f"Summary: {metadata_summary}\n")
  print(f"Source: {metadata_source}\n")
  print(f"Page content:\n\n{page_content}\n")

# Run the Wikipedia retrieval demo and print the first returned document.
WikipediaExtraction()