In [None]:
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25 faiss-cpu weaviate-client langchain-weaviate

# Install pinned, mutually compatible versions of the LangChain packages
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install python-dotenv==1.0.1
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

# new vector stores
%pip install faiss-cpu==1.8.0.post1
%pip install weaviate-client==4.8.1
%pip install langchain-weaviate==0.0.3

# google embeddings
%pip install langchain-google-genai

%pip install --upgrade langchain-together==0.2.0
%pip install beautifulsoup4==4.12.3
%pip install python-docx==1.1.2
%pip install docx2txt==0.8
%pip install jq==1.8.0


# Restart the kernel after installation

In [70]:
import os
from pathlib import Path
from enum import Enum

from PyPDF2 import PdfReader
import chromadb

os.environ['USER_AGENT'] = 'RAGUserAgent'

# openai
import openai

# langchain
import langchain
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_together import ChatTogether
from langchain.retrievers import EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter


# core
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.documents.base import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser,JsonOutputParser
from langchain_core.outputs import Generation

# community
from langchain_community.retrievers import BM25Retriever,WikipediaRetriever
from langchain_community.vectorstores import Chroma,FAISS,Weaviate
from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import JSONLoader

# experimental
from langchain_experimental.text_splitter import SemanticChunker


from langchain_google_genai import GoogleGenerativeAIEmbeddings
from google.colab import userdata
from tqdm import tqdm

import weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore
from weaviate.embedded import EmbeddedOptions

from bs4 import BeautifulSoup
import docx
import json

In [71]:
class ModelType(Enum):
  """Chat-model backends that RagPipeline can be configured with."""

  OPENAI = "openai"
  LLAMA = "llama"
  MISTRAL = "mistral"

class VectorType(Enum):
  """Vector-store implementations selectable through get_vector()."""

  FAISS = "faiss"
  CHROMA = "chroma"
  WEAVIATE = "weaviate"

class RetrieverType(Enum):
  """Search strategies understood by get_retriever()."""

  # MMR retrieves items relevant to a query while avoiding redundancy.
  MMR = "mmr"
  BM25 = "bm25"
  DEFAULT = "default"
  SIMILARITY = "similarity_score_threshold"

class SplitterType(Enum):
  """Text-splitting strategies available when chunking loaded documents."""

  CHAR_SPLITTER = "CharacterTextSplitter"
  RECURSIVE_CHAR_SPLITTER = "RecursiveTextSplitter"
  SEMANTIC_CHUNKER = "SemanticChunker"

class ParserType(Enum):
  """Output formats for the pipeline's final answer (JSON or plain string)."""

  JSON = "JsonOutputParser"
  STD = "StrOutputParser"

class FinalOutputModel(BaseModel):
  """Schema of the structured final result: a relevance score plus the answer text."""

  relevance_score: float = Field(
    description="The relevance score of the retrieved context to the question"
  )
  answer: str = Field(
    description="The final answer to the question"
  )



In [72]:
def create_sample_files(pdf_source="/content/sample_data/google-2023-environmental-report.pdf"):
  """Derive HTML, DOCX and JSON copies of a PDF's text, written next to the PDF.

  The text of every page is extracted and concatenated, then written to three
  sibling files that share the PDF's name but use the ``.html``, ``.docx`` and
  ``.json`` extensions.

  Args:
    pdf_source: Path of the PDF to read. Defaults to the Google 2023
      environmental report under ``/content/sample_data``.

  Returns:
    None. The function is called for its side effect of writing three files.
  """
  source_path = Path(pdf_source)
  html_path = source_path.with_suffix(".html")
  word_path = source_path.with_suffix(".docx")
  json_path = source_path.with_suffix(".json")

  # Read everything up front so the PDF handle is released before any output
  # file is written.
  with open(source_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    # `or ""` guards against a page for which no text could be extracted.
    pdf_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

  # html: wrap the raw text in a minimal document body
  soup = BeautifulSoup("<html><body></body></html>", "html.parser")
  soup.body.append(pdf_text)
  with open(html_path, "w", encoding='utf_8') as html_file:
    html_file.write(str(soup))

  # word: a single paragraph holding the whole text
  doc = docx.Document()
  doc.add_paragraph(pdf_text)
  doc.save(str(word_path))

  # json: {"text": ...}; explicit encoding keeps this consistent with the HTML file
  with open(json_path, "w", encoding='utf_8') as json_file:
    json.dump({"text": pdf_text}, json_file)


In [73]:
def format_docs(docs):
  """Concatenate the ``page_content`` of every document, separated by a blank line."""
  contents = [document.page_content for document in docs]
  return "\n\n".join(contents)

def extract_score(llm_output):
  """Parse a numeric relevance score out of an LLM reply.

  Args:
    llm_output: The model's reply. Normally a string, but numbers and ``None``
      are tolerated.

  Returns:
    The score as a float. A reply that is exactly a number is parsed as-is;
    otherwise the first number found in the text is used (so replies such as
    ``"Relevance Score: 4"`` or ``"4/5"`` yield ``4.0``). ``0.0`` is returned
    when no number can be found or the input is not text.
  """
  import re  # local import so this helper does not depend on the notebook's import cell

  if isinstance(llm_output, (int, float)):
    return float(llm_output)
  if not isinstance(llm_output, str):
    return 0.0

  text = llm_output.strip()
  try:
    return float(text)
  except ValueError:
    pass

  # The model was asked for a bare number but may wrap it in extra words;
  # fall back to the first numeric token in the reply.
  match = re.search(r"[-+]?\d+(?:\.\d+)?", text)
  return float(match.group()) if match else 0.0

def conditional_answer(x):
  """Return the generated answer, or a refusal when the relevance score is below 4.

  ``x`` is a mapping with the keys ``'relevance_score'`` (parsed through
  ``extract_score``) and ``'answer'``.
  """
  score = extract_score(x['relevance_score'])
  return "I have no idea" if score < 4 else x['answer']

def conditional_json_answer(x):
  """Return the JSON-formatted result, or a refusal when the relevance score is below 4.

  ``x`` is a mapping with the keys ``'relevance_score'`` (parsed through
  ``extract_score``) and ``'answer'``; the formatted result comes from
  ``format_json_output``.
  """
  score = extract_score(x['relevance_score'])
  return "I have no idea" if score < 4 else format_json_output(x)

def format_json_output(x):
  """Build the final structured result from a chain output mapping.

  Args:
    x: Mapping with the keys ``'relevance_score'`` (raw LLM text, parsed
      through ``extract_score``) and ``'answer'``.

  Returns:
    The result of running ``JsonOutputParser.parse_result`` on the
    JSON-serialised ``{"relevance_score": ..., "answer": ...}`` payload.
  """
  payload = {
    "relevance_score": extract_score(x['relevance_score']),
    "answer": x['answer'],
  }
  # The parser's schema field is `pydantic_object`; the previously used
  # `pydantic_model` keyword is not a field of JsonOutputParser.
  json_parser = JsonOutputParser(pydantic_object=FinalOutputModel)
  return json_parser.parse_result([Generation(text=json.dumps(payload))])

In [74]:
def get_vector(vector, documents, embedding, collection_name, description_name):
  """Create the vector store selected by ``vector`` and populate it with ``documents``.

  Args:
    vector: A ``VectorType`` member choosing the backend.
    documents: Documents to index.
    embedding: Embedding object handed to the backend.
    collection_name: Collection/class name (used by Chroma and Weaviate).
    description_name: Class description (used by Weaviate only).

  Returns:
    The vector store built by the matching ``create_*_vectorstore`` helper.

  Raises:
    ValueError: If ``vector`` is not a supported ``VectorType`` member.
  """
  if vector == VectorType.FAISS:
    return create_faiss_vectorstore(documents, embedding)
  if vector == VectorType.CHROMA:
    return create_chroma_vectorstore(documents, embedding, collection_name)
  if vector == VectorType.WEAVIATE:
    return create_weviate_vectorstore(documents, embedding, collection_name, description_name)
  # Fail loudly here instead of returning None and breaking later in the caller.
  raise ValueError(f"Unsupported vector store type: {vector!r}")

def get_retriever(vector, documents, embedding, collection_name, description_name, search_type, search_kwargs):
  """Build a vector store via ``get_vector`` and expose it as a retriever.

  ``search_type`` picks the retriever configuration:
  ``RetrieverType.MMR`` uses "mmr" search (``search_kwargs`` is not passed on),
  ``RetrieverType.SIMILARITY`` uses "similarity_score_threshold" with
  ``search_kwargs``, and any other value uses the store's default search type
  with ``search_kwargs``.
  """
  store = get_vector(vector, documents, embedding, collection_name, description_name)

  if search_type == RetrieverType.MMR:
    retriever_options = {"search_type": "mmr"}
  elif search_type == RetrieverType.SIMILARITY:
    retriever_options = {
      "search_type": "similarity_score_threshold",
      "search_kwargs": search_kwargs,
    }
  else:
    retriever_options = {"search_kwargs": search_kwargs}

  return store.as_retriever(**retriever_options)

def create_faiss_vectorstore(documents, embedding):
  """Build a FAISS vector store from ``documents`` using ``embedding``."""
  faiss_store = FAISS.from_documents(documents=documents, embedding=embedding)
  return faiss_store

def create_chroma_vectorstore(documents, embedding, collection_name):
  """Build a Chroma vector store named ``collection_name`` on a fresh ``chromadb.Client()``."""
  store_options = {
    "documents": documents,
    "embedding": embedding,
    "collection_name": collection_name,
    "client": chromadb.Client(),
  }
  return Chroma.from_documents(**store_options)


In [75]:
def create_weviate_vectorstore(documents,
                               embedding,
                               collection_name,
                               description_name):
  """Create an embedded-Weaviate class, load ``documents`` into it and return the store.

  Any existing class called ``collection_name`` is deleted first, then a class
  with the properties ``text``, ``doc_id`` and ``source`` is created and every
  document is inserted together with its embedding vector.

  Args:
    documents: Documents to index. Each must carry ``doc_id`` and ``source``
      entries in its metadata.
    embedding: Embedding object; used both by the returned store and to embed
      the documents being inserted.
    collection_name: Name of the Weaviate class / index.
    description_name: Human-readable description stored on the class.

  Returns:
    A ``Weaviate`` vector store bound to the populated class.

  Raises:
    KeyError: If a document's metadata lacks ``doc_id`` or ``source``.
  """
  weaviate_client = weaviate.Client(embedded_options=EmbeddedOptions())

  # Best-effort cleanup of a previous run's class. Only ordinary errors are
  # ignored so that KeyboardInterrupt / SystemExit still propagate.
  try:
    weaviate_client.schema.delete_class(collection_name)
  except Exception:
    pass

  structure = {
      "class": collection_name,
      "description": description_name,
      "properties": [
          {
              "name": "text",
              "dataType": ["text"],
              "description": "The text content of the document"
          },
          {
              "name": "doc_id",
              "dataType": ["string"],
              "description": "Document Id"
          },
          {
              "name": "source",
              "dataType": ["string"],
              "description": "Source of the document"
          }
      ]
  }

  weaviate_client.schema.create_class(structure)
  vector_store = Weaviate(
      client=weaviate_client,
      embedding=embedding,
      index_name=collection_name,
      text_key="text",
      attributes=["doc_id", "source"],
      by_text=False
  )

  # Embed all documents in one embed_documents() call (the document-side API,
  # which the FAISS/Chroma helpers rely on via from_documents) rather than
  # issuing one embed_query() call per document.
  docs_to_index = list(documents)
  texts = [doc.page_content for doc in docs_to_index]
  vectors = embedding.embed_documents(texts) if texts else []

  weaviate_client.batch.configure(batch_size=100)
  with weaviate_client.batch as batch:
    for doc, vector in tqdm(zip(docs_to_index, vectors),
                            total=len(docs_to_index),
                            desc="Processing documents"):
      batch.add_data_object(
          data_object={
              "text": doc.page_content,
              "doc_id": doc.metadata["doc_id"],
              "source": doc.metadata["source"]
          },
          class_name=collection_name,
          vector=vector
      )

  return vector_store

In [79]:
class RagPipeline:
  """Retrieval-augmented generation pipeline over a single source file.

  Usage is three steps: ``retrieve()`` builds a hybrid (dense + BM25)
  retriever from the source file, ``augment()`` wires that retriever into a
  chain that produces a relevance score and an answer, and ``generate()``
  runs the chain for a question and prints the outcome.
  """

  def __init__(self, source, vector_type, retriever_type, model_type, splitter_type, parser_type):
    """Configure the pipeline and its LLM, embeddings and prompts.

    API keys are read from Colab user secrets (``userdata``) and exported as
    environment variables.

    Args:
      source: Path of the file to index (pdf, html, docx or json).
      vector_type: ``VectorType`` member selecting the vector store.
      retriever_type: ``RetrieverType`` member selecting the dense search mode.
      model_type: ``ModelType`` member selecting the chat model.
      splitter_type: ``SplitterType`` member selecting the chunking strategy.
      parser_type: ``ParserType`` member selecting the final answer format.

    Raises:
      ValueError: If ``model_type`` is not a supported ``ModelType`` member.
    """
    openai_api_key = userdata.get('OPENAI_API_KEY')
    os.environ['OPENAI_API_KEY'] = openai_api_key
    os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')
    os.environ['TOGETHER_API_KEY'] = userdata.get('TOGETHER_API_KEY')

    openai.api_key = openai_api_key

    self.parser_type = parser_type
    self.splitter_type = splitter_type
    self.model_type = model_type
    self.retriever_type = retriever_type
    self.vector_type = vector_type
    self.source = source
    self.str_output_parser = StrOutputParser()
    self.gemini_embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

    if model_type == ModelType.OPENAI:
      self.llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    elif model_type == ModelType.LLAMA:
      self.llm = ChatTogether(model="meta-llama/Llama-3-70b-chat-hf", temperature=0)
    elif model_type == ModelType.MISTRAL:
      self.llm = ChatTogether(model="mistralai/Mixtral-8x22B-Instruct-v0.1", temperature=0)
    else:
      # Without this, self.llm would be left undefined and fail much later.
      raise ValueError(f"Unsupported model type: {model_type!r}")

    self.prompt = hub.pull('jclemens24/rag-prompt')

    self.relevance_prompt_template = PromptTemplate.from_template(
        """
          Given the following question and retrieved context, determine if the context is relevant to the question.
          Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
          Return ONLY the numeric score, without any additional text or explanation.

          Question: {question}
          Retrieved Context: {retrieved_context}

          Relevance Score:
        """
    )

    # Metadata key holding each chunk's id; the Weaviate helper expects "doc_id".
    self.id = "doc_id" if self.vector_type == VectorType.WEAVIATE else "id"

  def _load_documents(self):
    """Load ``self.source`` into a list of documents based on its file extension."""
    extension = Path(self.source).suffix[1:].lower()

    if extension == 'pdf':
      with open(self.source, "rb") as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        pdf_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
      return [Document(page_content=page) for page in pdf_text.split("\n\n")]
    if extension == 'html':
      return BSHTMLLoader(self.source).load()
    if extension == 'docx':
      return Docx2txtLoader(self.source).load()
    if extension == 'json':
      return JSONLoader(file_path=self.source, jq_schema='.text').load()

    # An unknown extension used to yield an empty corpus silently.
    raise ValueError(
        f"Unsupported source file type '{extension}' for {self.source!r}; "
        "expected one of: pdf, html, docx, json"
    )

  def _split_documents(self, docs):
    """Chunk ``docs`` with the splitter selected by ``self.splitter_type``."""
    if self.splitter_type == SplitterType.RECURSIVE_CHAR_SPLITTER:
      splitter = RecursiveCharacterTextSplitter(
          separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
          chunk_size=1000,
          chunk_overlap=200
      )
    elif self.splitter_type == SplitterType.CHAR_SPLITTER:
      # CharacterTextSplitter takes a single `separator` (not `separators`).
      splitter = CharacterTextSplitter(
          separator="\n",
          chunk_size=1000,
          chunk_overlap=200,
          is_separator_regex=False
      )
    else:
      splitter = SemanticChunker(self.gemini_embeddings, number_of_chunks=200)
    return splitter.split_documents(docs)

  def retrieve(self):
    """Load and chunk the source, then build the hybrid retriever.

    Returns:
      An ``EnsembleRetriever`` combining a dense vector-store retriever
      (weight 0.7) with a BM25 retriever (weight 0.3) over the same chunks.

    Raises:
      ValueError: If the source file's extension is not supported.
    """
    splits = self._split_documents(self._load_documents())

    # Two copies of every chunk so each result can be traced back to the
    # retriever ("dense" or "sparse") that produced it.
    dense_documents = [
        Document(page_content=doc.page_content, metadata={self.id: str(i), "source": "dense"})
        for i, doc in enumerate(splits)
    ]
    sparse_documents = [
        Document(page_content=doc.page_content, metadata={self.id: str(i), "source": "sparse"})
        for i, doc in enumerate(splits)
    ]

    # get_retriever() builds the vector store itself, so the store is created
    # exactly once here (a second build re-embeds and re-inserts every chunk).
    dense_retriever = get_retriever(self.vector_type,
                                    dense_documents,
                                    self.gemini_embeddings,
                                    'Google_Environment_report',
                                    'Google Environment report as of 2023',
                                    self.retriever_type,
                                    {"score_threshold": 0.5})

    sparse_retriever = BM25Retriever.from_documents(sparse_documents)
    return EnsembleRetriever(
        retrievers=[dense_retriever, sparse_retriever],
        weights=[0.7, 0.3]
    )

  def augment(self, retriever):
    """Wire ``retriever`` into the question-answering chain.

    The returned chain takes a question string and yields a mapping with
    ``context`` (retrieved documents), ``question`` and ``answer``; the latter
    is itself a mapping with ``relevance_score``, ``answer`` and
    ``final_answer``.
    """
    if self.parser_type == ParserType.STD:
      final_answer_fn = conditional_answer
    else:
      final_answer_fn = conditional_json_answer

    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | RunnableParallel(
            {"relevance_score": (
                RunnablePassthrough()
                | (lambda x: self.relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
                | self.llm
                | self.str_output_parser
            ), "answer": (
                RunnablePassthrough()
                | self.prompt
                | self.llm
                | self.str_output_parser
            )}
        )
        | RunnablePassthrough.assign(final_answer=final_answer_fn)
    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "question": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

    return rag_chain_with_source

  def generate(self, question, chain):
    """Run ``chain`` for ``question`` and print the score, answer and retrieved documents."""
    result = chain.invoke(question)
    print(result)
    retrieved_docs = result['context']

    print(f"Original Question: {question}\n")
    print(f"Relevance Score: {result['answer']['relevance_score']}\n")
    print(f"Final Answer:\n{result['answer']['final_answer']}\n\n")
    print(f"Final JSON Output:\n{result}\n\n")
    print("Retrieved Documents:")
    for i, doc in enumerate(retrieved_docs, start=1):
      # self.id already resolves to 'doc_id' for Weaviate and 'id' otherwise.
      print(f"Document {i}: Document ID: {doc.metadata[self.id]} source: {doc.metadata['source']}")
      print(f"Content:\n{doc.page_content}\n")


In [None]:
# Generate the HTML, DOCX and JSON sample files from the source PDF.
create_sample_files()

In [80]:
# Configure the pipeline: JSON source file, Chroma vector store,
# similarity-score-threshold retrieval, Llama chat model, semantic chunking
# and a JSON-formatted final answer.
rag = RagPipeline(source='/content/sample_data/google-2023-environmental-report.json',
                  vector_type=VectorType.CHROMA,
                  retriever_type=RetrieverType.SIMILARITY,
                  model_type=ModelType.LLAMA,
                  splitter_type=SplitterType.SEMANTIC_CHUNKER,
                  parser_type=ParserType.JSON)



In [81]:
# Build the hybrid retriever, wrap it in the RAG chain, then ask one question;
# generate() prints the relevance score, final answer and retrieved documents.
retriever = rag.retrieve()
augmentor = rag.augment(retriever)
rag.generate("What are Google's environmental initiatives?",augmentor)


{'context': [Document(metadata={'id': '117', 'source': 'dense'}, page_content='in four key ways: accelerating the transition to a net-zero carbon future, advancing water stewardship, building a circular economy, and protecting nature and biodiversity.Our \noperations\nGoogle uses energy, natural resources, and products \nand services to build our workplaces, data centers, and consumer hardware products. At the end of 2022, we had offices and data centers in roughly 200 cities  and nearly \n60 countries  around the world. We also had two retail \nstores, both in New York City.\nThe products and services that our customers and users \nrely on—like Gmail, Google Cloud, Search, and YouTube—are powered by our data centers and networking infrastructure. At the end of 2022, we had 28 Google-owned and -operated data center campuses across 24 data center locations\n\u200965 on four continents, as well as more \nthan 30 Google Cloud regions . In addition to our Google-\nowned and -operated data 