In [None]:
# Environment setup: upgrade pip, remove any preinstalled copies of the
# packages used below, then reinstall them (mostly at pinned versions).
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25 bs4 python-docx docx2txt jq

# Install compatible versions of langchain libraries
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install beautifulsoup4==4.12.3
%pip install python-dotenv==1.0.1
# NOTE(review): this is the only install that passes -q --user; confirm the
# user-site install is intentional.
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

# New installs for document loaders
%pip install bs4==0.0.2
%pip install python-docx==1.1.2
%pip install docx2txt==0.8
%pip install jq==1.8.0

# NOTE(review): langchain-google-genai and pypdf are not pinned, unlike the
# packages above, so the versions pip resolves may differ between runs.
%pip install langchain-google-genai
%pip install --upgrade langchain-together==0.2.0
%pip install --upgrade --quiet pypdf

In [None]:
import os
from enum import Enum

os.environ['USER_AGENT'] = 'RAGUserAgent'

import openai
import chromadb
import langchain
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_together import ChatTogether

# core
from langchain_core.prompts import PromptTemplate
from langchain_core.documents.base import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# community
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import MergedDataLoader

# retriever
from langchain.retrievers import EnsembleRetriever

# splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# google
from google.colab import userdata
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# other
import pypdf
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import docx
import json


#  loader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.document_loaders import PyPDFLoader


In [None]:
class SourceType(Enum):
  """File formats of the sample document, keyed by file extension."""

  # Each value is the bare extension (no leading dot).
  PDF = 'pdf'
  HTML = 'html'
  DOCX = 'docx'

In [None]:
def data_loader():
  """Load the sample environmental-report PDF as a list of Documents.

  The text of every page is extracted with PyPDF2, concatenated, and then
  split on blank lines; each resulting piece becomes one ``Document``.

  Returns:
    list[Document]: one Document per blank-line-separated piece of text.
  """
  based_path = '/content/sample_data/google-2023-environmental-report'

  # Extract all text while the file is open; the handle is released before
  # the Document objects are built.
  with open(f'{based_path}.pdf', "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    pdf_text = "".join(page.extract_text() for page in pdf_reader.pages)

  return [Document(page_content=chunk) for chunk in pdf_text.split("\n\n")]

In [None]:
def format_docs(docs):
  """Concatenate the ``page_content`` of each document, separated by a blank line."""
  contents = [document.page_content for document in docs]
  return "\n\n".join(contents)

def extract_score(llm_output):
  """Parse a numeric relevance score out of raw LLM text.

  Args:
    llm_output: Text returned by the model, ideally just a number ("4").

  Returns:
    The score as a float, or 0 when no finite number can be recovered
    (non-string input, no digits, or a "nan"/"inf" reply).
  """
  # Local imports: this helper is the only user of these stdlib modules.
  import math
  import re

  if not isinstance(llm_output, str):
    # e.g. None -- treat as "no score" instead of raising AttributeError.
    return 0

  text = llm_output.strip()
  try:
    score = float(text)
  except ValueError:
    # The model may wrap the number ("Relevance Score: 4", "4/5");
    # fall back to the first numeric token in the reply.
    match = re.search(r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)', text)
    if match is None:
      return 0
    score = float(match.group())

  # float() accepts "nan" and "inf". NaN compares False against any
  # threshold, so it would slip past a `score < limit` gate; reject it here.
  return score if math.isfinite(score) else 0

def conditional_answer(x):
  """Gate the generated answer on the parsed relevance score.

  Args:
    x: Mapping with a 'relevance_score' entry (raw LLM text) and an
      'answer' entry.

  Returns:
    "I have no idea" when the parsed score is below 4, otherwise x['answer'].
  """
  score = extract_score(x['relevance_score'])
  return "I have no idea" if score < 4 else x['answer']

In [None]:
def create_test_files(source):
  """Write HTML, DOCX and JSON copies of a PDF's text next to the sample report.

  The text of every page in ``source`` is extracted and concatenated, then
  written to ``<based_path>.html``, ``<based_path>.docx`` and
  ``<based_path>.json`` (the JSON file stores it under the ``text`` key).

  Args:
    source: Path of the PDF file to read.
  """
  based_path = '/content/sample_data/google-2023-environmental-report'

  # Extract the text first so the PDF handle is closed before any output
  # file is written.
  with open(source,'rb') as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    pdf_text = "".join(page.extract_text() for page in pdf_reader.pages)

  # create html -- the skeleton must open with <html> (not <htm>) so the
  # written file has a proper root element.
  soup = BeautifulSoup("<html><body></body></html>", 'html.parser')
  soup.body.append(pdf_text)
  with open(f'{based_path}.html', 'w', encoding='utf-8') as html_file:
    html_file.write(str(soup))

  # create doc -- the whole text goes into a single paragraph
  doc = docx.Document()
  doc.add_paragraph(pdf_text)
  doc.save(f'{based_path}.docx')

  # create json
  with open(f'{based_path}.json', 'w', encoding='utf-8') as json_file:
    json.dump({'text': pdf_text}, json_file, ensure_ascii=False, indent=4)




In [None]:
class RagPipeline:
  """Hybrid-retrieval RAG pipeline over a single PDF.

  ``retriever`` builds an ensemble of a dense (Chroma + Gemini embeddings)
  and a sparse (BM25) retriever, ``augmenter`` wraps a retriever in a chain
  that produces an answer together with an LLM-judged relevance score, and
  ``generator`` runs a chain on a question and prints the result.
  """

  def __init__(self, source_path):
    """Configure API keys, models and prompts, then write the derived test files.

    Args:
      source_path: Path of the PDF that ``retriever`` indexes.
    """
    # Expose the secrets stored in Colab as environment variables.
    for key_name in ('OPENAI_API_KEY', 'TOGETHER_API_KEY', 'GOOGLE_API_KEY'):
      os.environ[key_name] = userdata.get(key_name)

    openai.api_key = os.environ['OPENAI_API_KEY']

    self.source_path = source_path
    self.llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
    self.str_output_parser = StrOutputParser()
    self.gemini_embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
    self.prompt = hub.pull('jclemens24/rag-prompt')

    # Prompt asking the LLM to grade how relevant the retrieved context is.
    self.relevance_prompt_template = PromptTemplate.from_template(
        """
          Given the following question and retrieved context, determine if the context is relevant to the question.
          Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
          Return ONLY the numeric score, without any additional text or explanation.

          Question: {question}
          Retrieved Context: {retrieved_context}

          Relevance Score:
        """
    )

    # Side effect: writes HTML/DOCX/JSON copies of the PDF text to disk.
    create_test_files(self.source_path)

  def _load_documents(self):
    """Read the source PDF and return its text as Documents split on blank lines."""
    with open(self.source_path, "rb") as pdf_file:
      pdf_reader = PdfReader(pdf_file)
      pdf_text = "".join(page.extract_text() for page in pdf_reader.pages)
    return [Document(page_content=part) for part in pdf_text.split("\n\n")]

  def retriever(self):
    """Build the dense + sparse ensemble retriever over the source PDF.

    Returns:
      EnsembleRetriever combining a Chroma retriever (weight 0.7, k=3) with
      a BM25 retriever (weight 0.3).
    """
    docs = self._load_documents()

    character_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=200
    )
    splits = character_splitter.split_documents(docs)

    # Same chunks for both retrievers; the "source" tag records which
    # retriever a returned document came from.
    dense_documents = [
        Document(page_content=doc.page_content, metadata={"id": str(i), "source": "dense"})
        for i, doc in enumerate(splits)
    ]
    sparse_documents = [
        Document(page_content=doc.page_content, metadata={"id": str(i), "source": "sparse"})
        for i, doc in enumerate(splits)
    ]

    # Deterministic ids make re-running this method update the existing
    # entries instead of appending duplicate chunks to the collection
    # (without ids every call stores a fresh copy of each chunk).
    dense_ids = [f"{self.source_path}:{doc.metadata['id']}" for doc in dense_documents]

    vector_store = Chroma.from_documents(
        documents=dense_documents,
        embedding=self.gemini_embeddings,
        ids=dense_ids,
        collection_name="google_initiative",
        client=chromadb.Client()
    )

    dense_retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    sparse_retriever = BM25Retriever.from_documents(sparse_documents)

    return EnsembleRetriever(
        retrievers=[dense_retriever, sparse_retriever],
        weights=[0.7, 0.3]
    )

  def augmenter(self, retriever):
    """Wrap ``retriever`` in a chain that answers a question and scores relevance.

    Args:
      retriever: Runnable that maps a question to a list of documents.

    Returns:
      A runnable whose output is a dict with ``context`` (retrieved
      documents), ``question`` and ``answer``; ``answer`` is itself a dict
      with ``relevance_score``, ``answer`` and ``final_answer``.
    """
    def build_relevance_prompt(inputs):
      # Render the grading prompt from the question and the formatted context.
      return self.relevance_prompt_template.format(
          question=inputs['question'],
          retrieved_context=inputs['context']
      )

    relevance_chain = (
        RunnablePassthrough()
        | build_relevance_prompt
        | self.llm
        | self.str_output_parser
    )
    answer_chain = (
        RunnablePassthrough()
        | self.prompt
        | self.llm
        | self.str_output_parser
    )

    rag_chain_from_docs = (
        # Replace the document list with one formatted string for the prompts.
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        # Grade the context and generate the answer from the same inputs.
        | RunnableParallel({"relevance_score": relevance_chain, "answer": answer_chain})
        # Keep the raw answer and add the relevance-gated one.
        | RunnablePassthrough.assign(final_answer=conditional_answer)
    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "question": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

    return rag_chain_with_source

  def generator(self,question,chain):
    """Invoke ``chain`` with ``question`` and print the raw result.

    Args:
      question: Input passed to ``chain.invoke``.
      chain: Any object exposing ``invoke``; for a chain built by
        ``augmenter`` the printed result is the dict described there.
    """
    result = chain.invoke(question)
    print(result)







In [None]:
# Build the pipeline for the sample PDF. The constructor reads the API keys
# from Colab secrets, pulls the RAG prompt from the hub, and writes
# HTML/DOCX/JSON copies of the PDF text to disk.
rag = RagPipeline('/content/sample_data/google-2023-environmental-report.pdf')

In [None]:
# Wrap the ensemble retriever in the answer + relevance-score chain.
# Passing the bare retriever to `generator` would only print the retrieved
# documents and never call the LLM.
chainer = rag.augmenter(rag.retriever())

In [None]:
# Run the chain on a sample question; `generator` prints the raw result.
rag.generator("What are Google's environmental initiatives?",chainer)