# Run the following if you are on Colab

- Change the runtime type to GPU

In [None]:
!git clone https://github.com/scbxtraining/scbx-rag.git

In [None]:
import os
# Switch the working directory to the cloned repository so that the relative
# paths used in later cells (requirements.txt, ./inputs/) resolve.
# NOTE(review): '/content' is presumably the Colab default working directory;
# adjust this path when running elsewhere.
os.chdir('/content/scbx-rag')

In [None]:
!pip install -r requirements.txt

# RAGAs

In [None]:
import os

In [None]:
# Azure OpenAI connection settings, exported as environment variables.
# NOTE(review): the values look like placeholders — presumably they must be
# replaced with a real key and endpoint before the later cells can run.
os.environ.update(
    {
        "AZURE_OPENAI_API_KEY": "OPENAI_KEY",
        "AZURE_OPENAI_ENDPOINT": "OPENAI_ENDPOINT",
    }
)

# Deployment name and API version handed to AzureChatOpenAI further down.
deployment_name, api_version = "DEPLOYMENT_NAME", "API_VERSION"

### What is RAGAs

RAGAs (Retrieval-Augmented Generation Assessment) is a framework that provides you with the necessary ingredients to help you evaluate your RAG pipeline on a component level.

### Evaluation Data
To evaluate the RAG pipeline, RAGAs expects the following information:

1. question: The user query that is the input of the RAG pipeline. The input.

2. answer: The generated answer from the RAG pipeline. The output.

3. contexts: The contexts retrieved from the external knowledge source used to answer the question.

4. ground_truth: The ground truth answer to the question. This is the only human-annotated information.

### Evaluation Metrics

RAGAs provides you with a few metrics to evaluate a RAG pipeline component-wise as well as end-to-end.

1. Context precision: measures the signal-to-noise ratio of the retrieved context. This metric is computed using the question and the contexts.

2. Context recall: measures if all the relevant information required to answer the question was retrieved. This metric is computed based on the ground_truth (this is the only metric in the framework that relies on human-annotated ground truth labels) and the contexts.

3. Faithfulness: measures the factual accuracy of the generated answer. The number of correct statements from the given contexts is divided by the total number of statements in the generated answer. This metric uses the question, contexts and the answer.

4. Answer relevancy: measures how relevant the generated answer is to the question. This metric is computed using the question and the answer. For example, the answer “France is in western Europe.” to the question “Where is France and what is its capital?” would achieve a low answer relevancy because it only answers half of the question.

![title](./imgs/ragas.png)

## Setting up the RAG Pipeline

In [None]:
import os
from langchain_openai import ChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain_community.utils.math import cosine_similarity
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [None]:
# Imported here because this cell is the only place that needs to probe for a
# GPU; torch is already required by the Hugging Face embedding model.
import torch

# Chat model used by the RAG chain. The deployment name and API version come
# from the configuration cell; the key and endpoint are read from the
# AZURE_OPENAI_* environment variables set there.
llm = AzureChatOpenAI(
    azure_deployment=deployment_name,
    api_version=api_version,
    temperature=0,
    max_tokens=2000,
    max_retries=2,
)

# Embedding model used to index and query the document chunks.
model_name = "BAAI/bge-base-en-v1.5"
# Prefer the GPU, but fall back to CPU so the notebook still runs on a
# runtime without CUDA instead of failing while loading the model.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)


In [None]:
def init_RAG_retrieval(input_pdf_path = './inputs/'):
    """Build a similarity retriever over every PDF found in a directory.

    Each PDF is loaded with ``PyMuPDFLoader`` and every loaded document is
    tagged with metadata parsed from the file name. The documents are then
    split into overlapping chunks and indexed in a Chroma vector store using
    the module-level ``embeddings`` object.

    File names are expected to look like ``<document_name>_<date>.pdf``: the
    text before the first ``.`` is split on ``_``; the first piece becomes
    ``document_name`` and the second ``last_modified_date``. A name without
    an underscore gets an empty ``last_modified_date`` instead of raising
    ``IndexError``.

    Args:
        input_pdf_path: Directory scanned (non-recursively) for files whose
            names end in ``.pdf``.

    Returns:
        A retriever over the vector store configured for similarity search
        with ``k=5``.
    """
    CHUNK_SIZE = 4000
    CHUNK_OVERLAP = 200

    # os.listdir gives no ordering guarantee; sort so that ingestion order is
    # reproducible between runs.
    pdf_files = sorted(x for x in os.listdir(input_pdf_path) if x.endswith('.pdf'))

    docs = []
    for file in pdf_files:
        name_parts = file.split('.')[0].split('_')
        additional_metadata = {
            # Empty string rather than None when the name carries no date part.
            "last_modified_date": name_parts[1] if len(name_parts) > 1 else "",
            "document_name": name_parts[0],
        }

        # os.path.join avoids the doubled separator that plain string
        # formatting produced for a directory ending in '/'.
        loaded = PyMuPDFLoader(os.path.join(input_pdf_path, file)).load()
        for document in loaded:
            document.metadata.update(additional_metadata)

        docs.extend(loaded)

    # Chunking: split the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True
    )
    texts = text_splitter.split_documents(docs)
    print(f"splitted texts with length: {len(texts)}")

    vectorstore = Chroma.from_documents(documents=texts, embedding=embeddings)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    return retriever

In [None]:
def create_custom_rag(retriever):
    """Assemble the retrieval-augmented question-answering chain.

    The chain sends its input to ``retriever`` and joins the ``page_content``
    of the returned documents into the prompt's ``{context}`` slot, while the
    unchanged input fills ``{question}``. The filled prompt goes to the
    module-level ``llm`` and the reply is parsed with ``StrOutputParser``.

    Args:
        retriever: Retriever that supplies the context documents.

    Returns:
        The composed runnable chain.
    """
    def format_docs(docs):
        # Retrieved documents are concatenated with a blank line between them.
        return "\n\n".join([doc.page_content for doc in docs])

    template_text = """
        Use the following context (delimited by <ctx></ctx>) to answer the question. 
        Use the context to provide the answer only. 
        ------
        <ctx>
        {context}
        </ctx>
        ------
        {question}
        Answer:

    """
    custom_rag_prompt = PromptTemplate.from_template(template=template_text)

    # The same input feeds both branches: one retrieves and formats context,
    # the other forwards the question untouched.
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }

    return chain_inputs | custom_rag_prompt | llm | StrOutputParser()

In [None]:
# Index the PDFs under ./inputs/ and wire the resulting retriever into the
# RAG chain used for the rest of the notebook.
rag_retrieval = init_RAG_retrieval(input_pdf_path="./inputs/")
rag_chain = create_custom_rag(retriever=rag_retrieval)

## Preparing the Evaluation Data

In [None]:
# Evaluation questions; ground_truth below holds the reference answer for the
# question at the same index.
questions = [
    "What is the reduction in scope 1 and scope 2 emissions that SCB achieved in 2023?", 
    "What share of SCBX's total revenue was powered by AI in 2023?", 
    "What is SCBX's 2025 financial support target for  'Net Zero financed emissions' for scope 3 emissions?", 
    "Can you summarize what SCBX is doing to improve Financial and digital literacy", 
    "What has been Thailland's share of economic loss from extreme climate events between 2000 to 2019?", 
    "What are SCBX's scope 1 and 2 emissions for year 2023 and how much reduction have we seen from the year before?", 
    "What is sCBX's scope 3 emissions baseline, which year was it measured in?"
]


ground_truth = [
    "SCB achieved a 7% reduction in scope 1 and scope 2 emissions in 2023.", 
    "To check where to get the answer", 
    "Baht 200,000 million", 
    "SCBX Group is committed to nurturing digital skills and promoting technological digital literacy in society to propel long-term economic growth, broaden employment opportunities, and uplift people's quality of life. They leverage technology to enhance financial solutions, drive innovation, and accelerate financial inclusion across all user groups. Additionally, SCBX collaborates with leading partners to broaden financial service channels, promote job and income opportunities, and nurture financial literacy and discipline.", 
    "Thailand was confronted with 146 extreme weather events, resulting in an economic loss of around USD 7.7 billion between 2000 to 2019.", 
    "SCBX's scope 1 and 2 emissions for the year 2023 amounted to 65,384 tonnes of carbon dioxide equivalent, achieving a targeted 7% reduction compared to 2022.", 
    "SCBX's scope 3 emissions baseline was measured in 2021, and the top 3 sectors that fall under this category are the power sector, energy (fossil fuel), and hospitality & real estate."
]

# The two lists are paired by position, so a length mismatch would silently
# score answers against the wrong reference.
if len(questions) != len(ground_truth):
    raise ValueError(
        f"questions ({len(questions)}) and ground_truth ({len(ground_truth)}) "
        "must have the same length"
    )

answers = []
contexts = []

# Inference: record the generated answer and the retrieved chunk texts for
# every question.
for query in questions:
    answers.append(rag_chain.invoke(query))
    # invoke() is the Runnable entry point that supersedes the deprecated
    # get_relevant_documents().
    contexts.append([doc.page_content for doc in rag_retrieval.invoke(query)])

# To dict
dataset = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truth
}

# NOTE(review): ragas' evaluate() presumably expects a Hugging Face
# datasets.Dataset rather than a plain dict, i.e. Dataset.from_dict(dataset).
# That import is not present in this notebook — confirm against the installed
# ragas version and convert before evaluating.

# Evaluating the Results

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

In [None]:
# Score the collected samples with the four RAGAs metrics imported above.
result = evaluate(
    dataset=dataset,
    metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
)

# Convert the scores to a DataFrame for inspection in the next cell.
df = result.to_pandas()

In [None]:
# Display the RAGAs evaluation results as the cell output.
df

1. context_precision (signal-to-noise ratio of the retrieved context): While the LLM judges all of the context as relevant for the last question, it also judges that most of the retrieved context for the second question is irrelevant. Depending on this metric, you could experiment with different numbers of retrieved contexts to reduce the noise.
2. context_recall (if all the relevant information required to answer the question was retrieved): The LLMs evaluate that the retrieved contexts contain the relevant information required to answer the questions correctly.
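3. faithfulness (factual accuracy of the generated answer): While the LLM judges that the first and last questions are answered correctly, the answer to the second question, which wrongly states that the president did not mention Intel’s CEO, is judged with a faithfulness of 0.5. (This illustration refers to a different sample dataset, not the SCBX questions above; compare against the scores in `df` for this notebook's results.)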
4. answer_relevancy (how relevant is the generated answer to the question): All of the generated answers are judged as fairly relevant to the questions.

## QA Evaluation Chain

In [None]:
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
import pandas as pd

In [None]:
# Inputs for the QA evaluation chain, aligned by position:
#   examples    - each question together with its ground-truth answer
#   predictions - the answer the RAG chain generated for that question
examples = []
predictions = []

for question, truth, prediction in zip(questions, ground_truth, answers):
    examples.append(
        {
            "question": question,
            # Ground truth (the loop variable is `truth`; the previous
            # `real_answer` was never defined and raised NameError).
            "answer": truth
        }
    )
    predictions.append({'response': prediction})

In [None]:
# Grading prompt for the QA evaluation: the model is shown the question
# ({query}), the reference answer ({answer}) and the predicted answer
# ({result}) and is asked for a 0-10 similarity grade.
_PROMPT_TEMPLATE = """You are an expert professor specialized in grading students' answers to questions.
    You are grading the following question:
    {query}
    Here is the real answer:
    {answer}
    You are grading the following predicted answer:
    {result}
    What grade do you give from 0 to 10, where 0 is the lowest (very low similarity) and 10 is the highest (very high similarity)?
"""

PROMPT = PromptTemplate(
    template=_PROMPT_TEMPLATE,
    input_variables=["query", "answer", "result"],
)

In [None]:
# QAEvalChain is not imported by any of the visible import cells, so bring it
# into scope here; without this the cell fails with NameError.
from langchain.evaluation.qa import QAEvalChain

# NOTE(review): this rebinds the module-level `llm` (previously the
# AzureChatOpenAI instance). OpenAI() presumably needs OPENAI_API_KEY, while
# the notebook only sets the AZURE_OPENAI_* variables — confirm which model
# is meant to act as the grader.
llm = OpenAI(temperature=0)
evalchain = QAEvalChain.from_llm(llm=llm, prompt=PROMPT)

# The *_key arguments name the dictionary keys used in `examples`
# ("question", "answer") and in `predictions` ("response").
eval_result = evalchain.evaluate(
    examples,
    predictions,
    question_key="question",
    answer_key="answer",
    prediction_key="response",
)

In [None]:
# Display the output of the QA evaluation chain as the cell output.
eval_result