In [None]:
# Upgrade pip itself before installing anything else
%pip install --upgrade pip

# Uninstall every package this notebook pins, so the pinned versions below
# are installed from a clean slate
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25 faiss-cpu weaviate-client langchain-weaviate

# Install pinned versions of the LangChain packages
# (core, openai, experimental, community and the langchain meta-package)
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install python-dotenv==1.0.1
# NOTE(review): this is the only install that passes `-q --user`; confirm the
# user-site install is intentional and not a leftover
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

# New vector stores: FAISS and Weaviate (client plus LangChain integration)
%pip install faiss-cpu==1.8.0.post1
%pip install weaviate-client==4.8.1
%pip install langchain-weaviate==0.0.3

# Google embeddings
# NOTE(review): unpinned, unlike every other package in this cell; consider
# pinning a version for reproducibility
%pip install langchain-google-genai

# Restart the kernel after installation so that the imports in the following
# cells pick up the versions installed here

In [None]:
import os
from enum import Enum

from PyPDF2 import PdfReader
import chromadb

os.environ['USER_AGENT'] = 'RAGUserAgent'

# openai
import openai

# langchain
import langchain
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain.retrievers import EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter

# core
from langchain_core.documents.base import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# community
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Chroma,FAISS,Weaviate

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from google.colab import userdata
from tqdm import tqdm

import weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore
from weaviate.embedded import EmbeddedOptions



In [None]:
def format_docs(docs):
  """Join the `page_content` of every document into one string.

  Args:
    docs: Iterable of objects exposing a `page_content` string attribute.

  Returns:
    The page contents in iteration order, separated by a blank line
    (two newline characters). An empty iterable gives an empty string.
  """
  separator = "\n\n"
  page_texts = [document.page_content for document in docs]
  return separator.join(page_texts)

def extract_score(llm_output):
  """Parse a numeric relevance score out of raw LLM output.

  The whole (stripped) string is tried first. If that is not a number, the
  first number embedded in the text is used, so replies such as
  "Relevance Score: 4" or "4/5" still yield 4.0.

  Args:
    llm_output: Text returned by the LLM. Non-string values are passed to
      `float()` directly.

  Returns:
    The score as a float, or 0 when no finite number can be extracted
    (including `None`, empty text, "nan" and "inf").
  """
  # Local imports keep this helper self-contained within the notebook cell.
  import math
  import re

  if isinstance(llm_output, str):
    text = llm_output.strip()
    try:
      score = float(text)
    except ValueError:
      # Fall back to the first integer/decimal literal found in the text.
      match = re.search(r"[-+]?\d+(?:\.\d+)?", text)
      score = float(match.group()) if match else 0
  else:
    try:
      score = float(llm_output)
    except (TypeError, ValueError):
      score = 0

  # NaN compares False against every threshold, which would let an
  # unparseable score pass a `score < 4` gate; treat NaN/inf as "no score".
  return score if math.isfinite(score) else 0

def conditional_answer(x):
  """Return the generated answer only when the context was judged relevant.

  Args:
    x: Mapping with a 'relevance_score' entry (raw LLM text, parsed with
      `extract_score`) and an 'answer' entry.

  Returns:
    "I have no idea" when the parsed score is below 4, otherwise
    `x['answer']`. The 'answer' entry is only read in the latter case.
  """
  parsed_score = extract_score(x['relevance_score'])
  return "I have no idea" if parsed_score < 4 else x['answer']

In [None]:
def get_vector(vector,documents, embedding,collection_name,description_name):
  """Build the vector store selected by `vector`.

  Args:
    vector: `VectorType` member choosing the backend.
    documents: Documents to index.
    embedding: Embedding object forwarded to the backend builder.
    collection_name: Collection/class name (used by Chroma and Weaviate).
    description_name: Class description (used by Weaviate only).

  Returns:
    The vector store returned by the matching `create_*_vectorstore` helper.

  Raises:
    ValueError: If `vector` is not one of the supported `VectorType` members.
  """
  if vector == VectorType.FAISS:
    return create_faiss_vectorstore(documents, embedding)
  if vector == VectorType.CHROMA:
    return create_chroma_vectorstore(documents, embedding, collection_name)
  if vector == VectorType.WEAVIATE:
    return create_weviate_vectorstore(documents,embedding,collection_name,description_name)

  # Fail here with a clear message instead of returning None and letting the
  # caller crash later on `None.as_retriever(...)`.
  raise ValueError(f"Unsupported vector store type: {vector!r}")

In [None]:
def create_faiss_vectorstore(documents, embedding):
  """Build a FAISS vector store from `documents`.

  Args:
    documents: Documents to index.
    embedding: Embedding object handed to `FAISS.from_documents`.

  Returns:
    Whatever `FAISS.from_documents` returns for these arguments.
  """
  store_kwargs = {"documents": documents, "embedding": embedding}
  return FAISS.from_documents(**store_kwargs)


In [None]:
def create_chroma_vectorstore(documents, embedding, collection_name):
  """Build a Chroma vector store holding exactly `documents`.

  Any collection already registered under `collection_name` on the client is
  deleted first, mirroring what the Weaviate builder in this notebook does
  with its class. This keeps the cell idempotent: re-running it does not
  append a second copy of every chunk to a collection left over from an
  earlier run in the same kernel.

  Args:
    documents: Documents to index.
    embedding: Embedding object handed to `Chroma.from_documents`.
    collection_name: Name of the Chroma collection to (re)create.

  Returns:
    The vector store returned by `Chroma.from_documents`.
  """
  chroma_client = chromadb.Client()

  # list_collections() entries are read via `.name` when present and used
  # as-is otherwise, so both object- and name-returning clients work.
  existing_names = {
      getattr(entry, "name", entry)
      for entry in chroma_client.list_collections()
  }
  if collection_name in existing_names:
    chroma_client.delete_collection(collection_name)

  return Chroma.from_documents(
      documents = documents,
      embedding = embedding,
      collection_name = collection_name,
      client = chroma_client
  )

In [None]:
def create_weviate_vectorstore(documents,
                               embedding,
                               collection_name,
                               description_name):
  """Recreate a Weaviate class, load `documents` into it and return the store.

  A client is started with `EmbeddedOptions()`, any existing class named
  `collection_name` is dropped (best effort), the class is created with
  `text`, `doc_id` and `source` properties, and every document is written
  through the client's batch interface together with its embedding vector.

  All chunk texts are embedded with a single `embed_documents` call (the
  document-side entry point of the LangChain embeddings interface) instead of
  one `embed_query` call per chunk.

  Args:
    documents: Documents whose metadata contains 'doc_id' and 'source'.
    embedding: Embedding object exposing `embed_documents`.
    collection_name: Name of the Weaviate class to (re)create.
    description_name: Human-readable description stored on the class.

  Returns:
    The `Weaviate` vector store bound to the new class.

  Raises:
    KeyError: If a document lacks the 'doc_id' or 'source' metadata entry.
    ValueError: If the embedding returns a different number of vectors than
      there are documents.
  """
  # Materialise once: the documents are traversed twice and counted.
  documents = list(documents)

  weaviate_client = weaviate.Client(embedded_options=EmbeddedOptions())
  try:
    weaviate_client.schema.delete_class(collection_name)
  except Exception:
    # Best effort: the class may simply not exist yet. Only ordinary errors
    # are ignored; KeyboardInterrupt/SystemExit still propagate.
    pass
  structure = {
      "class": collection_name,
      "description": description_name,
      "properties": [
          {
              "name": "text",
              "dataType": ["text"],
              "description": "The text content of the document"
          },
          {
              "name": "doc_id",
              "dataType": ["string"],
              "description": "Document Id"
          },
          {
              "name": "source",
              "dataType": ["string"],
              "description": "Source of the document"
          }
      ]
  }

  weaviate_client.schema.create_class(structure)
  vector_store= Weaviate(
      client=weaviate_client,
      embedding=embedding,
      index_name=collection_name,
      text_key="text",
      attributes=["doc_id", "source"],
      by_text=False
  )

  texts = [doc.page_content for doc in documents]
  # Skip the embedding call entirely when there is nothing to embed.
  vectors = embedding.embed_documents(texts) if texts else []
  if len(vectors) != len(documents):
    # zip() below would silently drop documents on a mismatch.
    raise ValueError(
        f"Expected {len(documents)} embedding vectors, got {len(vectors)}"
    )

  weaviate_client.batch.configure(batch_size=100)
  with weaviate_client.batch as batch:
    for doc, vector in tqdm(zip(documents, vectors),
                            total=len(documents),
                            desc="Processing documents"):
      properties = {
          "text": doc.page_content,
          "doc_id": doc.metadata["doc_id"],
          "source": doc.metadata["source"]
      }
      batch.add_data_object(
          data_object=properties,
          class_name=collection_name,
          vector=vector
      )

  return vector_store

In [None]:
class VectorType(Enum):
  """Vector store backends selectable when building a pipeline."""

  FAISS = "faiss"        # handled by create_faiss_vectorstore
  CHROMA = "chroma"      # handled by create_chroma_vectorstore
  WEAVIATE = "weaviate"  # handled by create_weviate_vectorstore


In [None]:
class RagPipeline:
  """Retrieval-augmented generation pipeline over a single PDF.

  Intended call order: `retriever = rag.retrieve()`, then
  `chain = rag.augment(retriever)`, then `rag.generate(question, chain)`.
  Retrieval combines a dense retriever (backed by the vector store selected
  with `vector_type`) and a sparse BM25 retriever in an ensemble.
  """

  def __init__(self, source,vector_type):
    """Store the configuration and create the LLM, embedding and prompts.

    Args:
      source: PDF location handed to `PdfReader` in `retrieve`.
      vector_type: `VectorType` member selecting the dense vector store.

    Side effects: reads the OPENAI_API_KEY and GOOGLE_API_KEY Colab secrets,
    exports them as environment variables, pulls the answer prompt from the
    LangChain hub and prints it.
    """
    # Fetch the OpenAI secret once and reuse it for both consumers.
    openai_api_key = userdata.get('OPENAI_API_KEY')
    os.environ['OPENAI_API_KEY'] = openai_api_key
    os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')
    openai.api_key = openai_api_key

    self.vector_type = vector_type
    self.source = source
    self.str_output_parser = StrOutputParser()
    self.gemini_embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
    self.llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    self.prompt = hub.pull('jclemens24/rag-prompt')
    print(self.prompt)

    self.relevance_prompt_template = PromptTemplate.from_template(
        """
          Given the following question and retrieved context, determine if the context is relevant to the question.
          Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
          Return ONLY the numeric score, without any additional text or explanation.

          Question: {question}
          Retrieved Context: {retrieved_context}

          Relevance Score:
        """
    )

    # Metadata key under which each chunk's id is stored. The Weaviate class
    # created in this notebook names that property "doc_id".
    if self.vector_type == VectorType.WEAVIATE:
        self.id = "doc_id"
    else:
        self.id = "id"

  def _build_documents(self, splits, source_label):
    """Wrap each text chunk in a Document tagged with its index and source."""
    return [
        Document(page_content=text,
                 metadata={self.id: str(i), "source": source_label})
        for i, text in enumerate(splits)
    ]

  def retrieve(self):
    """Read the PDF, chunk it and build the dense + sparse ensemble retriever.

    Returns:
      An `EnsembleRetriever` weighting the dense retriever 0.7 and the BM25
      retriever 0.3.

    Raises:
      ValueError: If no text chunks could be extracted from the PDF.
    """
    # join() avoids repeated string concatenation; `or ""` keeps the join
    # safe should a page yield no text at all.
    full_text = "".join(
        (page.extract_text() or "") for page in PdfReader(self.source).pages
    )

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                              chunk_overlap=200)
    splits = splitter.split_text(full_text)
    if not splits:
        # Fail early with a clear message rather than somewhere inside the
        # vector store or BM25 setup.
        raise ValueError(
            f"No extractable text found in {self.source!r}; "
            "cannot build retrievers"
        )

    # Same chunks for both retrievers; only the "source" tag differs so the
    # origin of each retrieved document can be reported later.
    dense_documents = self._build_documents(splits, "dense")
    sparse_documents = self._build_documents(splits, "sparse")

    vectorstore = get_vector(self.vector_type,
                             dense_documents,
                             self.gemini_embeddings,
                             'Google_Environment_report',
                             'Google Environment report as of 2023')

    dense_retriever = vectorstore.as_retriever(search_kwargs={'k': 10})
    sparse_retriever = BM25Retriever.from_documents(sparse_documents)
    ensemble_retriever = EnsembleRetriever(
        retrievers=[dense_retriever, sparse_retriever],
        weights=[0.7, 0.3]
    )

    return ensemble_retriever

  def augment(self, retriever):
    """Assemble the RAG chain around `retriever`.

    The chain retrieves context for the question, formats it with
    `format_docs`, then runs two branches in parallel on the same input: one
    asks the LLM for a relevance score, the other produces the answer.
    `conditional_answer` then adds `final_answer` based on both.

    Args:
      retriever: Runnable that maps the question to a list of documents.

    Returns:
      A runnable whose result has the keys 'context' (retrieved documents),
      'question' and 'answer' (a dict with 'relevance_score', 'answer' and
      'final_answer').
    """
    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | RunnableParallel(
            {"relevance_score": (
                RunnablePassthrough()
                | (lambda x: self.relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
                | self.llm
                | self.str_output_parser
            ), "answer": (
                RunnablePassthrough()
                | self.prompt
                | self.llm
                | self.str_output_parser
            )}
        )
        | RunnablePassthrough().assign(final_answer=conditional_answer)

    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "question": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

    return rag_chain_with_source

  def generate(self, question, chain):
    """Run `chain` on `question` and print the result.

    Prints the raw result, the question, the relevance score, the final
    answer and every retrieved document with its id, source and content.

    Args:
      question: Question passed to `chain.invoke`.
      chain: Runnable built by `augment`.
    """
    result = chain.invoke(question)
    print(result)
    retrieved_docs = result['context']

    print(f"Original Question: {question}\n")
    print(f"Relevance Score: {result['answer']['relevance_score']}\n")
    print(f"Final Answer:\n{result['answer']['final_answer']}\n\n")
    print("Retrieved Documents:")
    for i, doc in enumerate(retrieved_docs, start=1):
        # self.id already names the right metadata key for the chosen store
        # ('doc_id' for Weaviate, 'id' otherwise), so no manual edit is needed.
        print(f"Document {i}: Document ID: {doc.metadata[self.id]} source: {doc.metadata['source']}")
        print(f"Content:\n{doc.page_content}\n")


In [None]:
# Option: build the pipeline on the Chroma vector store.
# This notebook has three cells that each rebind `rag` (Chroma, FAISS,
# Weaviate); whichever ran last is the pipeline used afterwards.
rag = RagPipeline(source='/content/sample_data/google-2023-environmental-report.pdf',vector_type=VectorType.CHROMA)

In [None]:
# Option: build the pipeline on the FAISS vector store.
# This notebook has three cells that each rebind `rag` (Chroma, FAISS,
# Weaviate); whichever ran last is the pipeline used afterwards.
rag = RagPipeline(source='/content/sample_data/google-2023-environmental-report.pdf',vector_type=VectorType.FAISS)

In [None]:
# Option: build the pipeline on the Weaviate vector store.
# This notebook has three cells that each rebind `rag` (Chroma, FAISS,
# Weaviate); whichever ran last is the pipeline used afterwards.
rag = RagPipeline(source='/content/sample_data/google-2023-environmental-report.pdf',vector_type=VectorType.WEAVIATE)

In [None]:
# Run the three pipeline stages on whichever `rag` was built last:
# build the ensemble retriever from the PDF, ...
retriever = rag.retrieve()
# ... wrap it in the scoring + answering chain, ...
augmentor = rag.augment(retriever)
# ... then invoke the chain and print the answer and retrieved documents.
rag.generate("What are Google's environmental initiatives?",augmentor)
