# Run the following if you are on Colab

- Change the runtime type to GPU

In [None]:
!git clone https://github.com/scbxtraining/scbx-rag.git

In [None]:
import os
os.chdir('/content/scbx-rag')

In [None]:
!pip install -r requirements.txt

# RAG

In [None]:
import os
import pandas as pd
from langchain_openai import AzureChatOpenAI
from langchain_community.utils.math import cosine_similarity
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

In [None]:
# Azure OpenAI credentials and deployment settings used by the chat model.
# NOTE(review): these values look like placeholders — replace them with the
# real key, endpoint, deployment name and API version before running.
os.environ["AZURE_OPENAI_API_KEY"] = "OPENAI_KEY"
os.environ["AZURE_OPENAI_ENDPOINT"] = "OPENAI_ENDPOINT"
deployment_name="DEPLOYMENT_NAME"
api_version="API_VERSION"

## Indexing

1. Load: First we need to load our data. This is done with Document Loaders.
2. Split: Text splitters break large Documents into smaller chunks. This is useful both for indexing data and for passing it in to a model, since large chunks are harder to search over and won't fit in a model's finite context window.
3. Store: We need somewhere to store and index our splits, so that they can later be searched over. This is often done using a VectorStore and Embeddings model.

[ref](https://python.langchain.com/docs/tutorials/rag/)

![title](./imgs/index.png)

### Load

In [None]:
# Load every PDF under `path` and tag each loaded document with metadata
# parsed from the file name, which is expected to look like
# "<document name>_<date>.pdf".
path = './inputs/'
docs = []

# os.listdir returns entries in arbitrary order; sorting keeps the resulting
# document order (and therefore the indexes inspected below) reproducible.
files = sorted(x for x in os.listdir(path) if x.lower().endswith('.pdf'))

for file in files:
    # os.path.join avoids the doubled slash that f"{path}/{file}" produced,
    # because `path` already ends with a separator.
    loader = PyMuPDFLoader(os.path.join(path, file))
    doc = loader.load()

    # Strip only the final extension so dots inside the name are preserved,
    # and tolerate names without an underscore instead of raising IndexError.
    name_parts = os.path.splitext(file)[0].split('_')
    additional_metadata = {
        "last_modified_date": name_parts[1] if len(name_parts) > 1 else "unknown",
        "document_name": name_parts[0],
    }
    # A descriptive loop name is used instead of `_`, which IPython reserves
    # for the last displayed result.
    for loaded_doc in doc:
        loaded_doc.metadata.update(additional_metadata)

    # Extend in place rather than rebuilding the list on every iteration.
    docs.extend(doc)
len(docs)

### Split

In [None]:
# Chunking: split the loaded documents into overlapping chunks
CHUNK_SIZE = 4000  # upper bound on chunk length handed to the splitter
CHUNK_OVERLAP = 200  # amount shared between consecutive chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True
)
# `texts` is the list of chunks that gets embedded and stored further down.
texts = text_splitter.split_documents(docs)
print(f"splitted texts with length: {len(texts)}")

In [None]:
texts[50]

In [None]:
texts[50].page_content

In [None]:
texts[50].metadata

In [None]:
texts[4].metadata

In [None]:
texts[5].metadata

In [None]:
texts[4].page_content

In [None]:
texts[5].page_content

### Store to Vector DB

In [None]:
## The device is picked automatically: GPU when CUDA is available, CPU
## otherwise (e.g. on a laptop). Override `device` by hand if needed.

import torch
# `langchain.embeddings` is the deprecated import location; the class lives in
# langchain_community, which this notebook already depends on.
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
device = "cuda" if torch.cuda.is_available() else "cpu"
model_kwargs = {"device": device}
# Request normalized embeddings from the encoder.
encode_kwargs = {"normalize_embeddings": True}

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
vectorstore = Chroma.from_documents(documents=texts[:], embedding=embeddings)

## Retrieval and generation

1. Retrieve: Given a user input, relevant splits are retrieved from storage using a Retriever.
2. Generate: A ChatModel / LLM produces an answer using a prompt that includes the question and the retrieved data.

[Ref](https://python.langchain.com/docs/tutorials/rag/)

![title](./imgs/retrieval.png)

In [None]:
question = "what is SCBX's 2025 Sustainability Targets"

In [None]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [None]:
retrieved_docs = retriever.invoke(question)

In [None]:
retrieved_docs[0]

In [None]:
retrieved_docs[0].page_content

In [None]:
PROMPT_TEMPLATE = """
    Use the following context (delimited by <ctx></ctx>) to answer the question. 
    Use the context to provide the answer only. 
    ------
    <ctx>
    {context}
    </ctx>
    ------
    {question}
    Answer:

"""

custom_rag_prompt = PromptTemplate.from_template(template=PROMPT_TEMPLATE)

llm = AzureChatOpenAI(
    azure_deployment=deployment_name,
    api_version=api_version, 
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Replace with ChatOpenAI if you have access to OpenAI API
# llm = ChatOpenAI(model_name='gpt-4o', temperature=0, streaming=True, api_key=os.environ["OPENAI_API_KEY"])

In [None]:
"""
We’ll use the LCEL Runnable protocol to define the chain, allowing us to

pipe together components and functions in a transparent way
automatically trace our chain in LangSmith
get streaming, async, and batched calling out of the box.
"""

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)
    
chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | custom_rag_prompt
        | llm
        | StrOutputParser()
)

In [None]:
questions = [
    "What is the reduction in scope 1 and scope 2 emissions that SCB achieved in 2023?",
    "What share of SCBX's total revenue was powered by AI in 2023?",
    "What is SCBX's 2025 financial support target for 'Net Zero financed emissions' for scope 3 emissions?",
    "Can you summarize what SCBX is doing to improve financial and digital literacy?",
    "What has been Thailand's share of economic loss from extreme climate events between 2000 to 2019?",
    "What are SCBX's scope 1 and 2 emissions for year 2023 and how much reduction have we seen from the year before?",
    "What is SCBX's scope 3 emissions baseline, which year was it measured in, and what are the top 3 sectors that fall under this category?"
]

result = []

In [None]:
# Start from an empty list on every run: re-executing this cell would
# otherwise append duplicate answers, leaving `result` longer than
# `questions` and breaking the DataFrame built from both lists.
result = []
for question in questions:
    answer = chain.invoke(question)
    print(answer)
    result.append(answer)

In [None]:
print(chain.invoke("what is SCBX's 2025 Sustainability Targets"))

In [None]:
result_df = pd.DataFrame({
    "question": questions,
    "answers": result
})

In [None]:
result_df

## Return Sources

In [None]:
from typing import List
from typing_extensions import Annotated, TypedDict
from langchain_core.runnables import RunnableParallel

In [None]:
PROMPT_TEMPLATE = """
    Use the following context (delimited by <ctx></ctx>) to answer the question. 
    Use the context to provide the answer only. 
    ------
    <ctx>
    {context}
    </ctx>
    ------
    {question}
    Answer:

"""

custom_rag_prompt = PromptTemplate.from_template(template=PROMPT_TEMPLATE)

In [None]:
def format_docs(docs):
    """Return the documents' text as one string, one blank line between each."""
    separator = "\n\n"
    return separator.join([doc.page_content for doc in docs])


rag_chain_from_docs = (
    {
        "question": lambda x: x["question"],  # input query
        "context": lambda x: format_docs(x["context"]),  # context
    }
    | custom_rag_prompt  
    | llm  
    | StrOutputParser()  
)

retrieve_docs = (lambda x: x["question"]) | retriever

# Below, we chain `.assign` calls. This takes a dict and successively
# adds keys-- "context" and "answer"-- where the value for each key
# is determined by a Runnable. The Runnable operates on all existing
# keys in the dict.
chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)

chain.invoke({"question": "What is SCBX's financed emissions baseline, which year was it measured in?"})

# Filter by Metadata

In [None]:
def init_RAG_retrieval(input_pdf_path='./inputs/'):
    """Index every PDF in a folder and return the vector store and a retriever.

    Each PDF is loaded with PyMuPDFLoader, tagged with metadata parsed from
    its file name (expected form: "<document name>_<date>.pdf"), split into
    overlapping chunks and embedded into a Chroma vector store.

    Relies on the module-level `embeddings` model being defined before the
    call.

    Args:
        input_pdf_path: Directory that contains the PDF files to index.

    Returns:
        A `(vectorstore, retriever)` tuple; `retriever` is created with
        search_type="similarity" and k=5.
    """
    CHUNK_SIZE = 4000
    CHUNK_OVERLAP = 200

    docs = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    pdf_files = sorted(
        name for name in os.listdir(input_pdf_path)
        if name.lower().endswith('.pdf')
    )

    for file in pdf_files:
        # os.path.join avoids a doubled slash when the directory already ends
        # with a separator (as the default './inputs/' does).
        loader = PyMuPDFLoader(os.path.join(input_pdf_path, file))
        loaded = loader.load()

        # Strip only the final extension so dots inside the name survive, and
        # tolerate names without an underscore instead of raising IndexError.
        name_parts = os.path.splitext(file)[0].split('_')
        additional_metadata = {
            "last_modified_date": name_parts[1] if len(name_parts) > 1 else "unknown",
            "document_name": name_parts[0],
        }
        for loaded_doc in loaded:
            loaded_doc.metadata.update(additional_metadata)

        # Extend in place rather than rebuilding the list on every iteration.
        docs.extend(loaded)

    # Chunking: split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True
    )
    texts = text_splitter.split_documents(docs)
    print(f"splitted texts with length: {len(texts)}")

    ## to replace with OpenAIEmbeddings if you have access to OpenAI API
    # embeddings = OpenAIEmbeddings(
    #     model='text-embedding-ada-002',
    #     deployment='text-embedding-ada-002',
    # )

    # NOTE(review): no collection_name is passed, so this presumably writes to
    # Chroma's default collection — confirm it does not share documents with
    # another in-memory store created earlier in the same session.
    vectorstore = Chroma.from_documents(documents=texts, embedding=embeddings)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    return vectorstore, retriever

In [None]:
def create_custom_rag(retriever):
    """Assemble a question-answering chain on top of ``retriever``.

    The chain feeds the incoming question to ``retriever``, joins the
    retrieved documents into one context string, fills the prompt template,
    calls the module-level ``llm`` and parses the reply into a string.

    Args:
        retriever: Runnable that maps a question to a list of documents.

    Returns:
        A runnable chain that is invoked with the question text.
    """
    PROMPT_TEMPLATE = """
        Use the following context (delimited by <ctx></ctx>) to answer the question. 
        Use the context to provide the answer only. 
        ------
        <ctx>
        {context}
        </ctx>
        ------
        {question}
        Answer:

    """

    def format_docs(docs):
        """Concatenate the documents' page contents, separated by blank lines."""
        contents = [doc.page_content for doc in docs]
        return "\n\n".join(contents)

    prompt = PromptTemplate.from_template(template=PROMPT_TEMPLATE)

    # The question is passed through unchanged while the context is built
    # from whatever the retriever returns for it.
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    return chain_inputs | prompt | llm | StrOutputParser()


rag_vectorstore, rag_retrieval = init_RAG_retrieval(input_pdf_path = './inputs/multiple_pdfs/')
rag_chain = create_custom_rag(rag_retrieval)

In [None]:
# Baseline retrieval: no metadata filter is passed to the retriever here.
query = "What is SCBX's financed emissions baseline, which year was it measured in?"
# NOTE(review): `relevant_documents` is assigned but not used in this cell.
relevant_documents = ['SCBX_Sustainability Report 2023']

# search_kwargs carries only "k" (no "filter" key)
search_kwargs = {"k": 5}
retriever = rag_vectorstore.as_retriever(search_type="similarity", search_kwargs=search_kwargs)

# run the query through the retriever
retrieved_docs = retriever.invoke(query)

# show the first returned document
retrieved_docs[0]

In [None]:
query = "What is SCBX's financed emissions baseline, which year was it measured in?"
relevant_documents = ['SCBX Sustainability Report']

# filter for relevant documents
search_kwargs = {"k": 5, "filter": {'document_name': {'$in': relevant_documents}}}
retriever = rag_vectorstore.as_retriever(search_type="similarity", search_kwargs=search_kwargs)

# retrieve relevant documents
retrieved_docs = retriever.invoke(query)

In [None]:
retrieved_docs[0]

## Leverage LLM Classification to build the router chain

In [None]:
from langchain.chains.router.multi_prompt_prompt import MULTI_PROMPT_ROUTER_TEMPLATE
from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_openai import ChatOpenAI

In [None]:
question_classification = [
  {
    "description": "SCBX sustainability, emissions baseline",
    "relevant_documents": [ "SCBX Sustainability Report" ]
  },
  {
    "description": "questions about Thailand economy, company overview for SCBX, key insights, and 2024 outlook",
    "relevant_documents" : [ "SCBX SET Thailand Focus" ],
  }
]

In [None]:
# One route description per classification entry, in the same order, so an
# index into `prompt_templates` is also an index into `question_classification`.
prompt_templates = [item["description"] for item in question_classification]

In [None]:
prompt_embeddings = embeddings.embed_documents(prompt_templates)

def prompt_router(input):
    """Return the index of the route description most similar to the query.

    The query is embedded with the module-level `embeddings` model and
    compared by cosine similarity against the precomputed `prompt_embeddings`.

    Args:
        input: Mapping with a "query" key holding the question text. The
            parameter name shadows the builtin but is kept so existing
            callers keep working.

    Returns:
        The position of the best-matching entry in `prompt_templates`, as a
        plain `int`.
    """
    query_embedding = embeddings.embed_query(input["query"])
    similarity = cosine_similarity([query_embedding], prompt_embeddings)[0]
    # Compute the argmax once and convert it to a built-in int so it can be
    # used directly as a list index.
    best_index = int(similarity.argmax())
    print(f"The most similar prompt is {best_index}")
    return best_index

In [None]:
print(prompt_router({"query": "What is SCBX's financed emissions baseline, which year was it measured in?"}))

In [None]:
print(prompt_router({"query": "what is the executive summary about Thailand's economy"}))

In [None]:
def router_rag(query):
    """Answer ``query`` using only the documents of its best-matching route.

    Args:
        query: The question text.

    Returns:
        The answer produced by the RAG chain built for the selected route.
    """
    # Pick the classification entry whose description is closest to the query.
    route = question_classification[prompt_router({"query": query})]
    relevant_documents = route.get("relevant_documents")

    # Restrict the similarity search to chunks whose document_name is listed
    # for the selected route.
    filtered_retriever = rag_vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": 5,
            "filter": {'document_name': {'$in': relevant_documents}},
        },
    )

    # Build the RAG chain on top of the filtered retriever and run it.
    return create_custom_rag(filtered_retriever).invoke(query)

In [None]:
router_rag(query = "What is SCBX's financed emissions baseline, which year was it measured in?")

In [None]:
router_rag(query = "what is the executive summary about Thailand's economy?")