# Golden Evaluations 🌟 with Bedrock Knowledge Bases
This notebook takes the `golden-dataset.csv` as an input and uses the Ragas evaluation framework to calculate some standard metrics that help us quantify the performance of our RAG system. You can read more about these metrics here: https://docs.ragas.io/en/stable/concepts/metrics/index.html#

The `golden-dataset.csv` should have two columns, `question` and `ground_truth`

| question                | ground_truth   |
|------------------------:|---------------:|
| What is 2 + 2?          |              4 |
| Capital of France?      |          Paris |
| Water boils at ___ °C.  |            100 |
| Who wrote Hamlet?       |    Shakespeare |

In [None]:
import pandas as pd
import boto3
import pprint
from botocore.client import Config
from datetime import datetime
import os
import yaml
import pandas as pd
from datetime import datetime

# Display helpers: widen text columns so long answers/contexts are readable.
pd.set_option("display.max_colwidth", 1000)
pp = pprint.PrettyPrinter(indent=2)

# Bedrock model identifiers and the region used for the model ARN / chat endpoint.
region_id = "us-east-1"
haiku = "anthropic.claude-3-haiku-20240307-v1:0"
sonnet = "anthropic.claude-3-sonnet-20240229-v1:0"

# AWS clients. Only the agent-runtime client gets the long timeouts / no-retry config.
# NOTE(review): both clients use the default AWS region while `region_id` is
# hard-coded above — confirm the two agree in your environment.
bedrock_config = Config(
    connect_timeout=120,
    read_timeout=120,
    retries={'max_attempts': 0},
)
bedrock_client = boto3.client('bedrock-runtime')
bedrock_agent_client = boto3.client("bedrock-agent-runtime", config=bedrock_config)

# Region resolved from the local AWS configuration.
boto3_session = boto3.session.Session()
region_name = boto3_session.region_name

In [None]:
# Sample (question, ground truth) pairs used to bootstrap the golden dataset.
golden_examples = [
    ("What is 2 + 2?", "4"),
    ("Capital of France?", "Paris"),
    ("Water boils at ___ °C.", "100"),
    ("Who wrote Hamlet?", "Shakespeare"),
]

# Column-oriented view of the same pairs.
data = {
    "question": [question for question, _ in golden_examples],
    "ground_truth": [truth for _, truth in golden_examples],
}

# Build the frame and persist it (without the index) as the golden dataset CSV.
df = pd.DataFrame(data)
csv_file_path = 'golden-dataset.csv'
df.to_csv(csv_file_path, index=False)

In [None]:
# Install the Ragas evaluation framework used by the cells below.
# NOTE(review): the version is not pinned, while later cells import private ragas
# modules (e.g. `ragas.metrics._faithfulness`) — consider pinning a known-good version.
%pip install ragas --quiet

In [None]:
from ragas.metrics import (
    context_precision,
    faithfulness,
    context_recall,
)
from ragas.metrics.critique import harmfulness

# Ragas metrics handed to `evaluate` for every experiment run.
metrics = [faithfulness, context_recall, context_precision, harmfulness]

In [None]:
from langchain_community.chat_models import BedrockChat
from langchain_community.embeddings import BedrockEmbeddings
import boto3

# Settings for the LLM that Ragas uses as the evaluator.
config = {
    "region_name": region_id,
    "model_id": sonnet,  # Recommended: use the most capable available model for evaluations
    "model_kwargs": {"temperature": 0.4},
}

# Pass the region explicitly so the request-signing region always matches the
# region-specific endpoint URL instead of depending on the default AWS region.
bedrock_model = BedrockChat(
    region_name=config["region_name"],
    endpoint_url=f"https://bedrock-runtime.{config['region_name']}.amazonaws.com",
    model_id=config["model_id"],
    model_kwargs=config["model_kwargs"],
)

# init the embeddings in the same region as the evaluator model
bedrock_embeddings = BedrockEmbeddings(region_name=config["region_name"])

In [None]:
# EXPERIMENT INPUTS
# Note that some input variables, like chunking strategy and data preprocessing, are not captured here;
# they should be cross-referenced using the date and KB ID.

# Model used to build the foundation-model ARN for answer generation.
kb_model_id = haiku
kb_id = "" # 👈 Replace me with your KB ID
# Search type sent as `overrideSearchType`.
kb_search_type = "HYBRID" # 'HYBRID' | 'SEMANTIC'
# Number of results sent as `numberOfResults`.
kb_num_results = 3
# Optional retrieval metadata filter (None = no filter); example shape below.
kb_filter = None
# kb_filter = {
#     'startsWith': {
#         'key': 'title',
#         'value': ''
#     }
# }

# Prompt template passed to `retrieve_and_generate` as `textPromptTemplate`.
# It contains the `$search_results$` and `$output_format_instructions$` placeholders
# (presumably substituted by the Bedrock service — confirm against the API docs).
kb_prompt_template = """
You are a question answering agent. I will provide you with a set of search results. The user will provide you with a question. Your job is to answer the user's question using only information from the search results. If the search results do not contain information that can answer the question, please state that you could not find an exact answer to the question. Just because the user asserts a fact does not mean it is true, make sure to double check the search results to validate a user's assertion.
                            
Here are the search results in numbered order:
$search_results$

$output_format_instructions$
"""

In [None]:
def retrieveAndGenerate(row):
    """Ask the Bedrock Knowledge Base one golden-dataset question.

    Calls `retrieve_and_generate` on the notebook-level `bedrock_agent_client`
    using the experiment inputs defined at notebook level (`region_id`,
    `kb_model_id`, `kb_id`, `kb_prompt_template`, `kb_num_results`,
    `kb_search_type`, `kb_filter`).

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a 'question' entry.

    Returns:
        An `(answer, contexts)` tuple: the generated answer text and the list of
        retrieved reference texts from the response citations. A 2-tuple is
        always returned (`("", [])` for an empty response) so that
        `DataFrame.apply(..., result_type='expand')` yields two columns per row.
    """
    model_arn = f'arn:aws:bedrock:{region_id}::foundation-model/{kb_model_id}'

    vector_search_config = {
        'numberOfResults': kb_num_results,
        'overrideSearchType': kb_search_type,
    }
    # Only include 'filter' when one is configured: passing None fails boto3
    # parameter validation. To filter per row (e.g. a date range read from
    # `row`), build the filter dict here instead of using `kb_filter`.
    if kb_filter is not None:
        vector_search_config['filter'] = kb_filter

    response = bedrock_agent_client.retrieve_and_generate(
        input={
            'text': row['question']
        },
        retrieveAndGenerateConfiguration={
            'type': 'KNOWLEDGE_BASE',
            'knowledgeBaseConfiguration': {
                'knowledgeBaseId': kb_id,
                'modelArn': model_arn,
                'generationConfiguration': {
                    'promptTemplate': {
                        'textPromptTemplate': kb_prompt_template
                    }
                },
                'retrievalConfiguration': {
                    'vectorSearchConfiguration': vector_search_config
                },
            }
        }
    )

    if not response:
        return "", []

    answer = response['output']['text']

    # Flatten the text of every retrieved reference across all citations;
    # tolerate citations without references and references without text.
    contexts = []
    for citation in response.get("citations") or []:
        for reference in citation.get("retrievedReferences") or []:
            text = (reference.get("content") or {}).get("text")
            if text is not None:
                contexts.append(text)

    return answer, contexts

In [None]:
# Load the golden dataset and fail fast if the required columns are missing,
# rather than hitting an obscure error later in retrieval or evaluation.
golden_dataset_path = "golden-dataset.csv"
golden_df = pd.read_csv(golden_dataset_path)

missing_columns = {"question", "ground_truth"} - set(golden_df.columns)
if missing_columns:
    raise ValueError(
        f"{golden_dataset_path} is missing required column(s): {sorted(missing_columns)}"
    )

In [None]:
golden_df

In [None]:
# Query the knowledge base once per golden question; each row expands into two columns.
rag_outputs = golden_df.apply(retrieveAndGenerate, axis=1, result_type='expand')

# Attach the generated answer and retrieved contexts to a copy of the golden dataset.
experiment_df = golden_df.copy()
experiment_df[['answer', 'contexts']] = rag_outputs

In [None]:
experiment_df

In [None]:
from datasets import Dataset, DatasetDict, Features, Value, Sequence

def create_dataset(data):
    """Wrap evaluation records in a `DatasetDict` with a single 'eval' split.

    Args:
        data: Column-oriented records handed directly to `Dataset.from_dict`.

    Returns:
        A `DatasetDict` whose 'eval' entry is the `Dataset` built from `data`.
    """
    return DatasetDict({'eval': Dataset.from_dict(data)})

experiment_dataset = create_dataset(experiment_df)

In [None]:
from ragas import evaluate
import nest_asyncio  # CHECK NOTES

# NOTES: nest_asyncio is only needed when running inside a Jupyter notebook;
# otherwise comment out or remove this call.
nest_asyncio.apply()

# Full evaluation split.
# Use experiment_dataset["eval"].select(range(8)) instead to take a subset.
evaluation_split = experiment_dataset["eval"]

result = evaluate(
    evaluation_split,
    metrics=metrics,
    llm=bedrock_model,
    embeddings=bedrock_embeddings,
)

result

In [None]:
result.to_pandas()

In [None]:
# Evaluation result converted to a DataFrame for export.
completed_experiment_df = result.to_pandas()

# Timestamped file name for this run's CSV export.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
filename = "dataset_eval_" + current_time + ".csv"

# Function to create folder and save configuration
def create_folder_and_save_config(config_params):
    """Create the next unused `output_<N>` folder and dump the config into it.

    Folders are probed relative to the current working directory, starting at
    `output_1`. The configuration is written with `yaml.dump` to
    `config_<N>.yml` inside the new folder.

    Args:
        config_params: Object to serialize as the experiment configuration.

    Returns:
        Tuple `(new_folder, config_filename)` with the created folder name and
        the path of the YAML file written inside it.
    """
    prefix = "output_"

    # Probe output_1, output_2, ... until a name that does not exist yet is found.
    run_number = 1
    while True:
        new_folder = prefix + str(run_number)
        if not os.path.exists(new_folder):
            break
        run_number += 1

    os.makedirs(new_folder)

    # The config file carries the same serial number as its folder.
    config_filename = os.path.join(new_folder, f"config_{run_number}.yml")
    with open(config_filename, 'w') as config_file:
        yaml.dump(config_params, config_file)

    return new_folder, config_filename

# Record the experiment inputs together with the aggregate metric scores.
metric_names = ('faithfulness', 'context_recall', 'context_precision', 'harmfulness')

config_params = {
    'kb_model_id': kb_model_id,
    'kb_id': kb_id,
    'kb_prompt_template': kb_prompt_template,
    'kb_search_type': kb_search_type,
    'kb_num_results': kb_num_results,
    'kb_filter': kb_filter,
    'datetime': current_time,
}
# Scores are stored as strings, one entry per metric.
config_params.update({name: str(result[name]) for name in metric_names})

# Write the config into a fresh output folder, then the per-question results next to it.
new_folder, config_file = create_folder_and_save_config(config_params)
completed_experiment_df.to_csv(f"{new_folder}/{filename}", index=False)
print(f"Configuration saved in {config_file}")

## Visualize Results

Check out `PlotEvalResults.ipynb` to visualize the results of the experiments.

## Bonus - Let's have a look at the LLM prompts used under the hood to create these metrics
These could be fine-tuned for your use case, but we suggest starting with the generic prompts to keep things simple at first.

#### Faithfulness metric prompts
There are two prompts used in generating faithfulness metric: `long_form_answer_prompt` and `nli_statements_message`.
1. `long_form_answer_prompt` will create one or more statements from each sentence in the given answer, based on the given question and answer.
2. `nli_statements_message` will consider the given context and following statements, then determine whether they are supported by the information present in the context


In [None]:
# Faithfulness long_form_answer_prompt
from ragas.metrics._faithfulness import LONG_FORM_ANSWER_PROMPT, NLI_STATEMENTS_MESSAGE

print(LONG_FORM_ANSWER_PROMPT)

In [None]:
print(NLI_STATEMENTS_MESSAGE)

#### Answer Relevancy metric prompt
There is one prompt used in generating answer relevancy metric: `question_generation_prompt`.
1. `question_generation_prompt` will generate a question for the given answer.


In [None]:
# Answer Relevancy Question Generation Prompt
from ragas.metrics._answer_relevance import QUESTION_GEN

print(QUESTION_GEN)

#### Context Precision metric prompt
There is one prompt used in generating context precision metric: `context_precision_prompt`.
1. `context_precision_prompt` will extract relevant sentences from the provided context that can potentially help answer the following question.


In [None]:
# Context Precision Prompt
from ragas.metrics._context_precision import CONTEXT_PRECISION

print(CONTEXT_PRECISION)

#### Context Recall metric prompt
There is one prompt used in generating context recall metric: `context_recall_prompt`.
1. `context_recall_prompt` will classify if the sentence can be attributed to the given context or not.

In [None]:
# Context Recall Prompt
from ragas.metrics._context_recall import CONTEXT_RECALL_RA

print(CONTEXT_RECALL_RA)

In [None]:
from langchain.prompts import HumanMessagePromptTemplate

#### Customize Faithfulness metric prompts
There are two prompts used in generating the faithfulness metric: `long_form_answer_prompt` and `nli_statements_message`.

1. `long_form_answer_prompt` will create one or more statements from each sentence in the given answer, based on the given question and answer.
2. `nli_statements_message` will consider the given context and following statements, then determine whether they are supported by the information present in the context.


In [None]:
# Faithfulness long_form_answer_prompt
from ragas.metrics._faithfulness import LONG_FORM_ANSWER_PROMPT, Faithfulness

# Keep a reference to the stock prompt.
original_long_form_answer_prompt = LONG_FORM_ANSWER_PROMPT

# Replacement template for turning an answer into individual statements.
long_form_answer_template = """\
Given a question and answer, create one or more statements from each sentence in the given answer.
question:{question}
answer: {answer}
statements:\n"""  # noqa: E501
new_long_form_answer_prompt = HumanMessagePromptTemplate.from_template(
    long_form_answer_template
)

# New Faithfulness instance carrying the customized prompt; display it to confirm.
faithfulness_update = Faithfulness()
faithfulness_update.long_form_answer_prompt = new_long_form_answer_prompt
faithfulness_update.long_form_answer_prompt

In [None]:
# Faithfulness nli_statements_message
from ragas.metrics._faithfulness import NLI_STATEMENTS_MESSAGE

# Keep a reference to the stock NLI prompt.
original_nli_statements_message = NLI_STATEMENTS_MESSAGE

# Replacement template for checking statements against the retrieved context.
nli_statements_template = """
Prompt: Natural language inference
Consider the given context and following statements, then determine whether they are supported by the information present in the context.Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Provide a final verdict for each statement in order at the end in the given format. Do not deviate from the specified format.
context:\n{context}
statements:\n{statements}
Answer:
"""  # noqa: E501
new_nli_statements_message = HumanMessagePromptTemplate.from_template(
    nli_statements_template
)

# Attach the customized prompt to the existing instance; display it to confirm.
faithfulness_update.nli_statements_message = new_nli_statements_message
faithfulness_update.nli_statements_message