# RAG

In [1]:
import os
import pandas as pd
from langchain_openai import AzureChatOpenAI
from langchain_community.utils.math import cosine_similarity
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

In [2]:
load_dotenv(dotenv_path='../.env')

False

## Indexing

1. Load: First we need to load our data. This is done with Document Loaders.
2. Split: Text splitters break large Documents into smaller chunks. This is useful both for indexing data and for passing it in to a model, since large chunks are harder to search over and won't fit in a model's finite context window.
3. Store: We need somewhere to store and index our splits, so that they can later be searched over. This is often done using a VectorStore and Embeddings model.

[ref https://python.langchain.com/docs/tutorials/rag/]

![title](./imgs/index.png)

### Load

In [3]:
path = './inputs/'
docs = []

files = os.listdir(path)
files = [x for x in files if x.endswith('.pdf')]

for file in files:
    loader = PyMuPDFLoader(f"{path}/{file}")
    doc = loader.load()
    for _ in doc:
        additional_metadata = {
                                "last_modified_date": file.split('.')[0].split('_')[1],
                                "document_name": file.split('.')[0].split('_')[0],
                            }
        _.metadata.update(additional_metadata)

    docs = docs + doc
len(docs)

146

### Split

In [4]:
# Chucking: Split the text into chunks
CHUNK_SIZE = 4000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True
)
texts = text_splitter.split_documents(docs)
print(f"splitted texts with length: {len(texts)}")

splitted texts with length: 155


In [5]:
texts[50]

Document(metadata={'source': './inputs//SCBX-SR2023_ENG.pdf', 'file_path': './inputs//SCBX-SR2023_ENG.pdf', 'page': 47, 'total_pages': 146, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'producer': 'Adobe PDF Library 17.0', 'creationDate': "D:20240828014004+07'00'", 'modDate': "D:20240828014217+07'00'", 'trapped': '', 'last_modified_date': 'ENG', 'document_name': 'SCBX-SR2023', 'start_index': 0}, page_content='SCBX BOARD OF DIRECTORS\nTechnology Committee\nIT Committee\nSCBX\nTech\n Strategy\nCommittee\nCloud\nStrategy \nCommittee\nData \nStrategy \nCommittee\nCyber \nStrategy \nCommittee\nCloud \nCOE\nData\nCOE\nCyber\nCOE\nAudit Committee\nAudit Function\nAudit Committee\nAudit Function\nRisk Oversight Committee\nTechnology Risk\nFunction\nCompliance\nFunction\nRisk Management \nCommittee\nSUBSIDIARIES\nRisk Oversight \nCommittee\nAudit Function\nTechnology \nRisk Function\nBusiness Function\nIT & Cyber \n

In [6]:
texts[50].page_content

'SCBX BOARD OF DIRECTORS\nTechnology Committee\nIT Committee\nSCBX\nTech\n Strategy\nCommittee\nCloud\nStrategy \nCommittee\nData \nStrategy \nCommittee\nCyber \nStrategy \nCommittee\nCloud \nCOE\nData\nCOE\nCyber\nCOE\nAudit Committee\nAudit Function\nAudit Committee\nAudit Function\nRisk Oversight Committee\nTechnology Risk\nFunction\nCompliance\nFunction\nRisk Management \nCommittee\nSUBSIDIARIES\nRisk Oversight \nCommittee\nAudit Function\nTechnology \nRisk Function\nBusiness Function\nIT & Cyber \nSecurity Function\nTechnology\nAudit\nRisk Management\nSUBSIDIARY BOARD OF DIRECTORS\nTechnology\nAudit\nRisk Management\nCYBERSECURITY AND DATA SECURITY GOVERNANCE\n48\n2023 SUSTAINABILITY REPORT \nOVERVIEW\nSPECIAL\nREPORT\nPLANTING\nDIGITAL \nSEEDS\nENRICHING \nLIVES\nGROWING\nECONOMIC \nRESILIENCE\nREACHING\nCLIMATE\nNEUTRALITY\nFOUNDATION\nFACTBOOK'

In [7]:
texts[50].metadata

{'source': './inputs//SCBX-SR2023_ENG.pdf',
 'file_path': './inputs//SCBX-SR2023_ENG.pdf',
 'page': 47,
 'total_pages': 146,
 'format': 'PDF 1.7',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'Adobe InDesign 19.5 (Macintosh)',
 'producer': 'Adobe PDF Library 17.0',
 'creationDate': "D:20240828014004+07'00'",
 'modDate': "D:20240828014217+07'00'",
 'trapped': '',
 'last_modified_date': 'ENG',
 'document_name': 'SCBX-SR2023',
 'start_index': 0}

In [38]:
texts[4].metadata

{'source': './inputs//SCBX-SR2023_ENG.pdf',
 'file_path': './inputs//SCBX-SR2023_ENG.pdf',
 'page': 4,
 'total_pages': 146,
 'format': 'PDF 1.7',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'Adobe InDesign 19.5 (Macintosh)',
 'producer': 'Adobe PDF Library 17.0',
 'creationDate': "D:20240828014004+07'00'",
 'modDate': "D:20240828014217+07'00'",
 'trapped': '',
 'last_modified_date': 'ENG',
 'document_name': 'SCBX-SR2023',
 'start_index': 0}

In [41]:
texts[5].metadata

{'source': './inputs//SCBX-SR2023_ENG.pdf',
 'file_path': './inputs//SCBX-SR2023_ENG.pdf',
 'page': 4,
 'total_pages': 146,
 'format': 'PDF 1.7',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'Adobe InDesign 19.5 (Macintosh)',
 'producer': 'Adobe PDF Library 17.0',
 'creationDate': "D:20240828014004+07'00'",
 'modDate': "D:20240828014217+07'00'",
 'trapped': '',
 'last_modified_date': 'ENG',
 'document_name': 'SCBX-SR2023',
 'start_index': 3820}

In [39]:
texts[4].page_content

'Becoming an AI-First and Customer-Centric Organization to build an inclusive and sustainable \nfinancial ecosystem, enabling all stakeholders to accererlate their climate actions.\nSCBX Group remains steadfast in championing our sustainability mission, ‘Opportunities for \nEveryone, Possibilities Everyday,’ guided by the strategic framework comprising 4 pillars: \n‘Planting Digital Seeds, Enriching Lives, Growing Economic Resilience, and Achieving Climate \nNeutrality.’ Our framework adheres to the 10 principles of the United Nations Global Compact \nand upholds the United Nations Sustainable Development Goals (SDGs) and the Paris \nAgreement.\nThroughout 2023, SCBX Group has pursued its sustainability mission under the core pillar of \n‘Planting Digital Seeds’, this endeavor entails laying  the organizational groundwork with a \nprimary focus on building critical technology and data capabilities.  We set up the Cloud Center \nof Excellence (CoE) and the Cyber CoE alongside the endeav

In [40]:
texts[5].page_content

'way towards becoming ‘The Most Admired Regional Financial Technology Group.’\n05\n2023 SUSTAINABILITY REPORT \nOVERVIEW\nSPECIAL\nREPORT\nPLANTING\nDIGITAL \nSEEDS\nENRICHING \nLIVES\nGROWING\nECONOMIC \nRESILIENCE\nREACHING\nCLIMATE\nNEUTRALITY\nFOUNDATION\nFACTBOOK'

### Store to Vector DB

In [8]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
vectorstore = Chroma.from_documents(documents=texts[:], embedding=embeddings)

## Retrieval and generation

1. Retrieve: Given a user input, relevant splits are retrieved from storage using a Retriever.
2. Generate: A ChatModel / LLM produces an answer using a prompt that includes the question and the retrieved data

Ref - https://python.langchain.com/docs/tutorials/rag/]

![title](./imgs/retrieval.png)

In [10]:
question = "what is SCBX's 2025 Sustainability Targets"

In [11]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [12]:
retrieved_docs = retriever.invoke(question)

In [13]:
retrieved_docs[0]

Document(metadata={'author': '', 'creationDate': "D:20240828014004+07'00'", 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'document_name': 'SCBX-SR2023', 'file_path': './inputs//SCBX-SR2023_ENG.pdf', 'format': 'PDF 1.7', 'keywords': '', 'last_modified_date': 'ENG', 'modDate': "D:20240828014217+07'00'", 'page': 13, 'producer': 'Adobe PDF Library 17.0', 'source': './inputs//SCBX-SR2023_ENG.pdf', 'start_index': 2, 'subject': '', 'title': '', 'total_pages': 146, 'trapped': ''}, page_content='2025 Sustainability Targets\nIn 2023, SCBX Group has set three-year sustainability targets (2023–2025)  \nto put its sustainability strategy in action. \n4 Pillars\nStrategic Goals\n2025 Target\n2023 Performance\nSDGs Target\nPLANTING\nDIGITAL SEEDS\nDigital Solutions and \nSecurity\nInvest Baht 14,000 million in digital initiatives \nand innovations and achieve zero \ntolerance for information security breaches.\nBaht 5,773 million invested  and \ncontinulously strive for zero tolerance for \ninformat

In [14]:
retrieved_docs[0].page_content

'2025 Sustainability Targets\nIn 2023, SCBX Group has set three-year sustainability targets (2023–2025)  \nto put its sustainability strategy in action. \n4 Pillars\nStrategic Goals\n2025 Target\n2023 Performance\nSDGs Target\nPLANTING\nDIGITAL SEEDS\nDigital Solutions and \nSecurity\nInvest Baht 14,000 million in digital initiatives \nand innovations and achieve zero \ntolerance for information security breaches.\nBaht 5,773 million invested  and \ncontinulously strive for zero tolerance for \ninformation security breaches.\nDigital Skills of the Future\nFoster 200,000 employees and other \npeople to develop and use digital skills      \nin their careers and businesses.\n123,696 employees and other people \ndeveloped AI, analytical thinking, and \ndigital skills.\nENRICHING\nLIVES\nFinancial and Digital \nInclusion\nSupport 4,000,000 underserved people to \ngain access to personal credit through \nnew, digital products.\n1,627,149 underserved people gained \naccess to financial produc

In [23]:
PROMPT_TEMPLATE = """
    Use the following context (delimited by <ctx></ctx>) to answer the question. 
    Use the context to provide the answer only. 
    ------
    <ctx>
    {context}
    </ctx>
    ------
    {question}
    Answer:

"""

custom_rag_prompt = PromptTemplate.from_template(template=PROMPT_TEMPLATE)

llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"), 
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Replace with ChatOpenAI if you have access to OpenAI API
# llm = ChatOpenAI(model_name='gpt-4o', temperature=0, streaming=True, api_key=os.environ["OPENAI_API_KEY"])

In [24]:
"""
We’ll use the LCEL Runnable protocol to define the chain, allowing us to

pipe together components and functions in a transparent way
automatically trace our chain in LangSmith
get streaming, async, and batched calling out of the box.
"""

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
    
chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | custom_rag_prompt
        | llm
        | StrOutputParser()
)

In [25]:
questions = [
    "What is the reduction in scope 1 and scope 2 emissions that SCB achieved in 2023?",
    "What share of SCBX's total revenue was powered by AI in 2023?",
    "What is SCBX's 2025 financial support target for 'Net Zero financed emissions' for scope 3 emissions?",
    "Can you summarize what SCBX is doing to improve financial and digital literacy?",
    "What has been Thailand's share of economic loss from extreme climate events between 2000 to 2019?",
    "What are SCBX's scope 1 and 2 emissions for year 2023 and how much reduction have we seen from the year before?",
    "What is SCBX's scope 3 emissions baseline, which year was it measured in, and what are the top 3 sectors that fall under this category?"
]

result = []

In [26]:
for question in questions:
    answer = chain.invoke(question)
    print(answer)
    result.append(answer)

SCB achieved a 7% reduction in scope 1 and scope 2 emissions in 2023.
20% of SCBX's total revenue was powered by AI in 2023.
Baht 100,000 million.
SCBX Group is committed to improving financial and digital literacy by delivering affordable and accessible financial solutions, focusing on empowering underserved groups, and leveraging technology to enhance financial inclusion. They collaborate with leading partners to broaden financial service channels, promote job and income opportunities, and nurture financial literacy and discipline. Additionally, SCBX aims to cultivate digital skills among employees and society, partnering with world-class AI and technology firms to enhance skills in AI, data analytics, critical thinking, and digital technology. Their 2025 targets include broadening financial access to 4 million underserved customers and nurturing digital skills for 200,000 employees and people in society.
Thailand's share of economic loss from extreme climate events between 2000 to 2

In [27]:
print(chain.invoke("what is SCBX's 2025 Sustainability Targets"))

SCBX's 2025 Sustainability Targets are:

1. **Digital Solutions and Security**: Invest Baht 14,000 million in digital initiatives and innovations and achieve zero tolerance for information security breaches.
2. **Digital Skills of the Future**: Foster 200,000 employees and other people to develop and use digital skills in their careers and businesses.
3. **Financial and Digital Inclusion**: Support 4,000,000 underserved people to gain access to personal credit through new, digital products.
4. **SME-Startup-Social (3S) Empowerment**: Empower 2,000,000 individuals and 60,000 small businesses to grow and reach their full potential via Baht 5,800 million of investment to create positive impacts.
5. **Funding for Low-Carbon Transition and SDGs**: Provide Baht 100,000 million of funding to clients engaging in low-carbon transition and promoting SDGs.
6. **Net Zero in Own Operations**: Achieve net zero in own operations (Scope 1 and 2) by 2030.
7. **Net Zero in Financed Emissions**: Achieve 

In [28]:
result_df = pd.DataFrame({
    "question": questions,
    "answers": result
})

In [29]:
result_df

Unnamed: 0,question,answers
0,What is the reduction in scope 1 and scope 2 e...,SCB achieved a 7% reduction in scope 1 and sco...
1,What share of SCBX's total revenue was powered...,20% of SCBX's total revenue was powered by AI ...
2,What is SCBX's 2025 financial support target f...,"Baht 100,000 million."
3,Can you summarize what SCBX is doing to improv...,SCBX Group is committed to improving financial...
4,What has been Thailand's share of economic los...,Thailand's share of economic loss from extreme...
5,What are SCBX's scope 1 and 2 emissions for ye...,SCBX's scope 1 and 2 emissions for the year 20...
6,"What is SCBX's scope 3 emissions baseline, whi...",SCBX's scope 3 emissions baseline is 7.3 milli...


## Return Sources

In [30]:
from typing import List
from typing_extensions import Annotated, TypedDict
from langchain_core.runnables import RunnableParallel

In [50]:
PROMPT_TEMPLATE = """
    Use the following context (delimited by <ctx></ctx>) to answer the question. 
    Use the context to provide the answer only. 
    ------
    <ctx>
    {context}
    </ctx>
    ------
    {question}
    Answer:

"""

custom_rag_prompt = PromptTemplate.from_template(template=PROMPT_TEMPLATE)

In [None]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain_from_docs = (
    {
        "question": lambda x: x["question"],  # input query
        "context": lambda x: format_docs(x["context"]),  # context
    }
    | custom_rag_prompt  
    | llm  
    | StrOutputParser()  
)

retrieve_docs = (lambda x: x["question"]) | retriever

# Below, we chain `.assign` calls. This takes a dict and successively
# adds keys-- "context" and "answer"-- where the value for each key
# is determined by a Runnable. The Runnable operates on all existing
# keys in the dict.
chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)

chain.invoke({"question": "What is SCBX's financed emissions baseline, which year was it measured in?"})

{'question': "What is SCBX's financed emissions baseline, which year was it measured in?",
 'context': [Document(metadata={'author': '', 'creationDate': "D:20240828014004+07'00'", 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'document_name': 'SCBX-SR2023', 'file_path': './inputs//SCBX-SR2023_ENG.pdf', 'format': 'PDF 1.7', 'keywords': '', 'last_modified_date': 'ENG', 'modDate': "D:20240828014217+07'00'", 'page': 96, 'producer': 'Adobe PDF Library 17.0', 'source': './inputs//SCBX-SR2023_ENG.pdf', 'start_index': 0, 'subject': '', 'title': '', 'total_pages': 146, 'trapped': ''}, page_content='2021  Financed Emissions (scope 3, category 15)1 by Sector, MtCO2\n0.4\nRetail\n% Financed\nEmissions\n100%\n8%\n10%\n82%\n56%\n2%\n1%\n13%\n2%\n2%\n1%\n5%\nSCB\nLending \nPortfolio\nMortgages\nCorporate\nPortfolio\nPower\nEnergy\n(Fossil Fuel)\nHospitality & \nReal Estate\nMetal\nIndustrial\nChemicals &\nRaw Materials\nAgricultural\nProducts & \nCommodities\nFood & \nBeverage\nOthers2\nFinanced\nEmi

# Filter by Meta Data

In [55]:
def init_RAG_retrieval(input_pdf_path = './inputs/'):
    docs = []
    files = os.listdir(input_pdf_path)
    files = [x for x in files if x.endswith('.pdf')]
    
    for file in files:
        loader = PyMuPDFLoader(f"{input_pdf_path}/{file}")
        doc = loader.load()
        for _ in doc:
            additional_metadata = {
                                    "last_modified_date": file.split('.')[0].split('_')[1],
                                    "document_name": file.split('.')[0].split('_')[0],
                                }
            _.metadata.update(additional_metadata)
    
        docs = docs + doc
    
    # Chucking: Split the text into chunks
    CHUNK_SIZE = 4000
    CHUNK_OVERLAP = 200
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True
    )
    texts = text_splitter.split_documents(docs)
    print(f"splitted texts with length: {len(texts)}")
    
    ## to replace with OpenAIEmbeddings if you have access to OpenAI API
    # embeddings = OpenAIEmbeddings(
    #     model='text-embedding-ada-002',
    #     deployment='text-embedding-ada-002',
    # )
    
    vectorstore = Chroma.from_documents(documents=texts[:], embedding=embeddings)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    return vectorstore, retriever

In [66]:
def create_custom_rag(retriever):
    PROMPT_TEMPLATE = """
        Use the following context (delimited by <ctx></ctx>) to answer the question. 
        Use the context to provide the answer only. 
        ------
        <ctx>
        {context}
        </ctx>
        ------
        {question}
        Answer:

    """

    custom_rag_prompt = PromptTemplate.from_template(template=PROMPT_TEMPLATE)
    
    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)
    
    rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | custom_rag_prompt
            | llm
            | StrOutputParser()
    )
    return rag_chain


rag_vectorstore, rag_retrieval = init_RAG_retrieval(input_pdf_path = './inputs/multiple_pdfs/')
rag_chain = create_custom_rag(rag_retrieval)

splitted texts with length: 211


In [67]:
query = "What is SCBX's financed emissions baseline, which year was it measured in?"
relevant_documents = ['SCBX_Sustainability Report 2023']

# filter for relevant documents
search_kwargs = {"k": 5}
retriever = rag_vectorstore.as_retriever(search_type="similarity", search_kwargs=search_kwargs)

# retrieve relevant documents
retrieved_docs = retriever.invoke(query)

retrieved_docs[0]

Document(metadata={'author': '', 'creationDate': "D:20240828014004+07'00'", 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'document_name': 'SCBX-SR2023', 'file_path': './inputs//SCBX-SR2023_ENG.pdf', 'format': 'PDF 1.7', 'keywords': '', 'last_modified_date': 'ENG', 'modDate': "D:20240828014217+07'00'", 'page': 96, 'producer': 'Adobe PDF Library 17.0', 'source': './inputs//SCBX-SR2023_ENG.pdf', 'start_index': 0, 'subject': '', 'title': '', 'total_pages': 146, 'trapped': ''}, page_content='2021  Financed Emissions (scope 3, category 15)1 by Sector, MtCO2\n0.4\nRetail\n% Financed\nEmissions\n100%\n8%\n10%\n82%\n56%\n2%\n1%\n13%\n2%\n2%\n1%\n5%\nSCB\nLending \nPortfolio\nMortgages\nCorporate\nPortfolio\nPower\nEnergy\n(Fossil Fuel)\nHospitality & \nReal Estate\nMetal\nIndustrial\nChemicals &\nRaw Materials\nAgricultural\nProducts & \nCommodities\nFood & \nBeverage\nOthers2\nFinanced\nEmissions\n(MtCO2)\nCorp\nSME\n7.3\n0.6\n6.0\n0.7\n0.70\n1.0\n0.1\n0.1\n0.1\n0.1\n0.1\n4.1\n0.6\nSME\n6.0\n

In [68]:
query = "What is SCBX's financed emissions baseline, which year was it measured in?"
relevant_documents = ['SCBX Sustainability Report']

# filter for relevant documents
search_kwargs = {"k": 5, "filter": {'document_name': {'$in': relevant_documents}}}
retriever = rag_vectorstore.as_retriever(search_type="similarity", search_kwargs=search_kwargs)

# retrieve relevant documents
retrieved_docs = retriever.invoke(query)

In [69]:
retrieved_docs[0]

Document(metadata={'author': '', 'creationDate': "D:20240828014004+07'00'", 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'document_name': 'SCBX Sustainability Report', 'file_path': './inputs/multiple_pdfs//SCBX Sustainability Report_2023.pdf', 'format': 'PDF 1.7', 'keywords': '', 'last_modified_date': '2023', 'modDate': "D:20240828014217+07'00'", 'page': 96, 'producer': 'Adobe PDF Library 17.0', 'source': './inputs/multiple_pdfs//SCBX Sustainability Report_2023.pdf', 'start_index': 0, 'subject': '', 'title': '', 'total_pages': 146, 'trapped': ''}, page_content='2021  Financed Emissions (scope 3, category 15)1 by Sector, MtCO2\n0.4\nRetail\n% Financed\nEmissions\n100%\n8%\n10%\n82%\n56%\n2%\n1%\n13%\n2%\n2%\n1%\n5%\nSCB\nLending \nPortfolio\nMortgages\nCorporate\nPortfolio\nPower\nEnergy\n(Fossil Fuel)\nHospitality & \nReal Estate\nMetal\nIndustrial\nChemicals &\nRaw Materials\nAgricultural\nProducts & \nCommodities\nFood & \nBeverage\nOthers2\nFinanced\nEmissions\n(MtCO2)\nCorp\nSME\n

## Leverage LLM Classification to build the router chain

In [70]:
from langchain.chains.router.multi_prompt_prompt import MULTI_PROMPT_ROUTER_TEMPLATE
from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_openai import ChatOpenAI

In [71]:
question_classification = [
  {
    "description": "SCBX sustainability, emissions baseline",
    "relevant_documents": [ "SCBX Sustainability Report" ]
  },
  {
    "description": "questions about Thailand economy, company overview for SCBX, key insights, and 2024 outlook",
    "relevant_documents" : [ "SCBX SET Thailand Focus" ],
  }
]

In [72]:
prompt_templates = []

for item in question_classification:
    prompt_template = item["description"]
    prompt_templates.append(prompt_template)

In [73]:
prompt_embeddings = embeddings.embed_documents(prompt_templates)

def prompt_router(input):
    query_embedding = embeddings.embed_query(input["query"])
    similarity = cosine_similarity([query_embedding], prompt_embeddings)[0]
    most_similar = prompt_templates[similarity.argmax()]
    print(f"The most similar prompt is {similarity.argmax()}")
    return similarity.argmax()

In [74]:
print(prompt_router({"query": "What is SCBX's financed emissions baseline, which year was it measured in?"}))

The most similar prompt is 0
0


In [75]:
print(prompt_router({"query": "what is the executive summary about Thailand's economy"}))

The most similar prompt is 1
1


In [76]:
def router_rag(query):
    router_idx = prompt_router({"query": query})
    relevant_documents = question_classification[router_idx].get("relevant_documents")
    
    # filter for relevant documents
    search_kwargs = {"k": 5, "filter": {'document_name': {'$in': relevant_documents}}}
    retriever = rag_vectorstore.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
    
    # retrieve relevant documents
    rag_chain = create_custom_rag(retriever)
    
    answer = rag_chain.invoke(query)
    return answer

In [77]:
router_rag(query = "What is SCBX's financed emissions baseline, which year was it measured in?")

The most similar prompt is 0


"SCBX's financed emissions baseline was 7.3 million tonnes of carbon dioxide equivalent, measured in 2021."

In [78]:
router_rag(query = "what is the executive summary about Thailand's economy?")

The most similar prompt is 1


"Thailand’s economy will slowly rebound to 2.5% in 2024, backed by the service sector alongside a robust rebound in foreign tourist arrivals. However, growth in 2024 will be limited for merchandise exports, while the manufacturing sector and private investment will recover slowly. Despite an expedited disbursement of public investment following an over-six-month delay in the 2024 budget bill, it will not fully offset the previous sharp contraction. In 2025, Thailand's economy will expand moderately by 2.6% (revised down from 2.9%), partly supported by private investment rebound and government disbursement returning to normal levels. However, private consumption will face higher downward pressure from those fragile households. Policy uncertainty remains an additional downside risk. SCB EIC anticipates two MPC rate cuts to 2.25% in late 2024 and 2% in early 2025. Thailand will face heightened economic risks, with tighter financial conditions pressuring growth while households and busines

# Exercise
- Add a tag to meta data as last_modified_date
- Retrieve the most recent information using last_modified_date

In [79]:
def filter_for_max_date_document(relevant_documents):
    """
        filter for the document with the most recent last_modified_date 
    """
    return most_recent_relevant_document