# Set up some ENV vars and configs

In [7]:
import sys
!{sys.executable} -m pip install ragas
!{sys.executable} -m pip install boto3
!{sys.executable} -m pip install ipywidgets



In [8]:
import os
from getpass import getpass

import langchain
import nest_asyncio

# Keep LangChain's debug output off; set this to True to see every prompt/response.
langchain.debug = False

# Read the key without echoing it, so it does not end up in the saved cell output.
os.environ["OPENAI_API_KEY"] = getpass("Open AI API Key")

# Opt out of RAGAS usage tracking. The variable name must match exactly: the
# previous key had a trailing space ("RAGAS_DO_NOT_TRACK "), which sets a
# different variable and leaves the opt-out unset.
os.environ["RAGAS_DO_NOT_TRACK"] = "true"

# NOTE: Only needed when running inside a Jupyter notebook; otherwise comment out
# or remove this call.
nest_asyncio.apply()

# Pull down a predefined question-answer dataset

In [9]:
from datasets import load_dataset

# Load the `english_v2` configuration of the `explodinggradients/amnesty_qa`
# dataset, print its structure, and show the `eval` split as a DataFrame.
amnesty_qa = load_dataset(
    path="explodinggradients/amnesty_qa",
    name="english_v2",
    trust_remote_code=True,
)
print(amnesty_qa)
amnesty_qa["eval"].to_pandas()

DatasetDict({
    eval: Dataset({
        features: ['question', 'ground_truth', 'answer', 'contexts'],
        num_rows: 20
    })
})


Unnamed: 0,question,ground_truth,answer,contexts
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ..."
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[- Fossil fuel companies, whether state or pri..."
2,Which private companies in the Americas are th...,The largest private companies in the Americas ...,"According to the Carbon Majors database, the l...",[The private companies responsible for the mos...
3,What action did Amnesty International urge its...,Amnesty International urged its supporters to ...,Amnesty International urged its supporters to ...,[Amnesty International called on its vast netw...
4,What are the recommendations made by Amnesty I...,The recommendations made by Amnesty Internatio...,Amnesty International made several recommendat...,[Amnesty International recommends that the Spe...
5,Who are the target audience of the two books c...,The target audience of the two books created b...,The target audience of the two books created b...,[Amnesty International has therefore created t...
6,Which right guarantees access to comprehensive...,The right that guarantees access to comprehens...,The right that guarantees access to comprehens...,[26. The Act raises serious questions about it...
7,Who has the right to be fully informed about h...,The victims of gross human rights violations a...,Everyone has the right to be fully informed ab...,[- The victims of gross human rights violation...
8,When can individuals be found guilty under Art...,Individuals can be found guilty under Article ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...
9,When does the prosecution consider statements ...,The prosecution considers statements contrary ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...


## Example of pulling in our own local dataset

I am unsure why `load_dataset` labels this as "train" data, and I could not find a good way to rename the split during data load.  It does not affect functionality as far as I can tell.

In [10]:
# Load our own question/answer records from the local `./data` path, print the
# resulting structure, and show the (auto-named) `train` split as a DataFrame.
dataset = load_dataset(path="./data")
print(dataset)
dataset["train"].to_pandas()

Generating train split: 2 examples [00:00, 738.11 examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'ground_truth', 'answer', 'contexts'],
        num_rows: 2
    })
})





Unnamed: 0,question,ground_truth,answer,contexts
0,What is my name?,Your name is Grey,Your name is Grey.,[My name is Grey]
1,What is my name?,Your name is Grey,Your name is Matt.,[My name is Grey]


# Run using the default OpenAI embeddings and LLM

Note: we are currently only targeting the `answer_correctness` metric.  There are a few others in the documentation, but when we start looking at using Bedrock instead of OpenAI, we start running into issues.  `answer_correctness` is a nice hybrid metric that can capture a rough approximation of how your solution is doing.

In [11]:
from ragas import evaluate
from ragas.metrics import (
    # context_precision,
    # answer_relevancy,
    # faithfulness,
    # context_recall,
    answer_correctness
)

In [12]:
# Score the Amnesty QA eval split on `answer_correctness` only. No llm/embeddings
# are passed, so RAGAS uses its defaults (OpenAI, per this section's heading).
results_openai = evaluate(amnesty_qa["eval"], metrics=[answer_correctness])

Evaluating: 100%|██████████| 20/20 [00:32<00:00,  1.60s/it]


In [13]:
# Using OpenAI: print the aggregate score, then show the per-record scores
# as a DataFrame (the cell's last expression is what gets displayed).
print(results_openai)
results_openai.to_pandas()

{'answer_correctness': 0.4450}


Unnamed: 0,question,ground_truth,answer,contexts,answer_correctness
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ...",0.305036
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[- Fossil fuel companies, whether state or pri...",0.297955
2,Which private companies in the Americas are th...,The largest private companies in the Americas ...,"According to the Carbon Majors database, the l...",[The private companies responsible for the mos...,0.585971
3,What action did Amnesty International urge its...,Amnesty International urged its supporters to ...,Amnesty International urged its supporters to ...,[Amnesty International called on its vast netw...,0.53173
4,What are the recommendations made by Amnesty I...,The recommendations made by Amnesty Internatio...,Amnesty International made several recommendat...,[Amnesty International recommends that the Spe...,0.32354
5,Who are the target audience of the two books c...,The target audience of the two books created b...,The target audience of the two books created b...,[Amnesty International has therefore created t...,0.746764
6,Which right guarantees access to comprehensive...,The right that guarantees access to comprehens...,The right that guarantees access to comprehens...,[26. The Act raises serious questions about it...,0.748469
7,Who has the right to be fully informed about h...,The victims of gross human rights violations a...,Everyone has the right to be fully informed ab...,[- The victims of gross human rights violation...,0.466448
8,When can individuals be found guilty under Art...,Individuals can be found guilty under Article ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...,0.227963
9,When does the prosecution consider statements ...,The prosecution considers statements contrary ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...,0.537945


In [14]:
# Same default (OpenAI) evaluation, this time on the two-record local dataset.
results_openai_custom_dataset = evaluate(dataset["train"], metrics=[answer_correctness])

Evaluating: 100%|██████████| 2/2 [00:20<00:00, 10.08s/it]


In [15]:
# Using OpenAI on the local dataset: print the aggregate score, then show the
# per-record scores as a DataFrame.
print(results_openai_custom_dataset)
results_openai_custom_dataset.to_pandas()

{'answer_correctness': 0.6063}


Unnamed: 0,question,ground_truth,answer,contexts,answer_correctness
0,What is my name?,Your name is Grey,Your name is Grey.,[My name is Grey],0.9966
1,What is my name?,Your name is Grey,Your name is Matt.,[My name is Grey],0.216028


# Running with Bedrock instead of OpenAI

We can run with Bedrock, but I have found the prompts being used for the different metrics do not perform well out of the box against many of the Bedrock models.  I have tried against Llama-2, ClaudeV2.1, and Claude Instant.  I have found the best results with Claude Instant, both before and after some of the changes we will be making.

We must first set up our connection to Bedrock using boto3.  In this case, I am using an SSO session I have named "dev".

In [16]:
import boto3

from langchain_community.chat_models import BedrockChat
from langchain_community.embeddings import BedrockEmbeddings

# Make the "dev" profile the default boto3 session, then create one Bedrock
# runtime client that both the embeddings model and the chat model share.
boto3.setup_default_session(profile_name="dev")
BEDROCK_CLIENT = boto3.client(service_name="bedrock-runtime", region_name="us-east-1")

embeddings = BedrockEmbeddings(
    client=BEDROCK_CLIENT,
    model_id="amazon.titan-embed-text-v1",
)

# Other models that were tried (swap model_id and model_kwargs together):
#   model_id="anthropic.claude-v2:1"    with the Claude model_kwargs used below
#   model_id="meta.llama2-13b-chat-v1"  with model_kwargs={"temperature": 0.1, "max_gen_len": 1000}
llm = BedrockChat(
    client=BEDROCK_CLIENT,
    model_id="anthropic.claude-instant-v1",
    model_kwargs={"temperature": 0.1, "max_tokens_to_sample": 1000},  # Claude-style kwargs
)

### If you would like to see it not perform well, you may uncomment the following and run it BEFORE you run any of the cells below.  Some of the cells below are destructive, and will change the way this cell behaves if you run it after them.

Running "poorly" can mean a few different things.  In many of the cases I tried, it meant the LLM gave back responses that were not in JSON format, and the follow-up prompts RAGAS used to try to get it to format them correctly were unsuccessful.  When this happens, that record in the dataset ends up with `NaN` instead of a score.

In the case of Claude Instant, the "badness" is much less pronounced. It USUALLY returns valid JSON with the original prompt, but not always.  Also, it sometimes responds to one of the few-shot examples instead of the final, passed-in record that we are targeting.  This leads to incorrect scores.  This is hard to see in the full dataset, but easier to see in the custom dataset, which should have one exact match for an answer and one incorrect answer, resulting in a score of around 0.5.

Other models, like ClaudeV2 and Llama2, end up returning mostly `NaN` records, or fail to complete entirely, unless we use the workarounds below.

In [17]:
# Evaluate the Amnesty QA eval split with the Bedrock LLM and embeddings,
# still using whatever prompt `answer_correctness` currently carries.
results_bedrock_bad = evaluate(
    amnesty_qa["eval"], metrics=[answer_correctness], llm=llm, embeddings=embeddings
)

Evaluating: 100%|██████████| 20/20 [00:28<00:00,  1.43s/it]


In [18]:
# Using Claude Instant (before the prompt override): print the aggregate score,
# then show the per-record scores as a DataFrame.
print(results_bedrock_bad)
results_bedrock_bad.to_pandas()

{'answer_correctness': 0.3721}


Unnamed: 0,question,ground_truth,answer,contexts,answer_correctness
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ...",0.235862
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[- Fossil fuel companies, whether state or pri...",0.335132
2,Which private companies in the Americas are th...,The largest private companies in the Americas ...,"According to the Carbon Majors database, the l...",[The private companies responsible for the mos...,0.5452
3,What action did Amnesty International urge its...,Amnesty International urged its supporters to ...,Amnesty International urged its supporters to ...,[Amnesty International called on its vast netw...,0.571804
4,What are the recommendations made by Amnesty I...,The recommendations made by Amnesty Internatio...,Amnesty International made several recommendat...,[Amnesty International recommends that the Spe...,0.184053
5,Who are the target audience of the two books c...,The target audience of the two books created b...,The target audience of the two books created b...,[Amnesty International has therefore created t...,0.413059
6,Which right guarantees access to comprehensive...,The right that guarantees access to comprehens...,The right that guarantees access to comprehens...,[26. The Act raises serious questions about it...,0.411088
7,Who has the right to be fully informed about h...,The victims of gross human rights violations a...,Everyone has the right to be fully informed ab...,[- The victims of gross human rights violation...,0.444759
8,When can individuals be found guilty under Art...,Individuals can be found guilty under Article ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...,0.356503
9,When does the prosecution consider statements ...,The prosecution considers statements contrary ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...,0.382838


In [19]:
# Same Bedrock evaluation as the previous run, on the two-record local dataset.
results_bedrock_bad_custom_dataset = evaluate(
    dataset["train"], metrics=[answer_correctness], llm=llm, embeddings=embeddings
)

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 2/2 [00:04<00:00,  2.05s/it]


In [20]:
# Using Claude Instant (before the prompt override) on the local dataset: print
# the aggregate score, then show the per-record scores as a DataFrame.
print(results_bedrock_bad_custom_dataset)
results_bedrock_bad_custom_dataset.to_pandas()

{'answer_correctness': 0.3553}


Unnamed: 0,question,ground_truth,answer,contexts,answer_correctness
0,What is my name?,Your name is Grey,Your name is Grey.,[My name is Grey],0.408958
1,What is my name?,Your name is Grey,Your name is Matt.,[My name is Grey],0.301643


# Making Bedrock eval better

To get around this bad behavior, we need to use better prompts.  In this case, watching the logs, it looks like the `answer_correctness` metric has one main prompt that asks for a response and expects it in JSON. It gives few-shot examples that return JSON, but does not actually give the instruction to respond in that format, so we need to make that a little more clear.

[Here](https://docs.ragas.io/en/stable/howtos/applications/custom_prompts.html) are some instructions for overriding those prompts.  Unfortunately, the way this is set up, this is a destructive action that will mutate the `answer_correctness` metric to always use this prompt until your Python kernel restarts.

In [21]:
# Save off the original instruction string so we have access to it later if we want it.
# The value is hard-coded instead of being read from the metric (commented line below):
# a later cell replaces `answer_correctness.correctness_prompt`, so reading it after that
# cell has run would presumably return the overridden instruction, not the original.
# original_correctness_prompt_instruction = answer_correctness.correctness_prompt.instruction
original_correctness_prompt_instruction = "Extract following from given question and ground truth"

In [22]:
from ragas.llms.prompt import Prompt
from ragas.metrics import answer_correctness

# Start from the prompt currently attached to the metric: we reuse its few-shot
# examples and input/output settings, and only replace the instruction text.
base_prompt = answer_correctness.correctness_prompt
example_count = len(base_prompt.examples)

# The example count is interpolated in BOTH sentences so the instruction cannot
# contradict itself if the number of bundled examples is ever different from two.
new_instruction = f"""{original_correctness_prompt_instruction}.
Generate "Extracted Statements" similar to the {example_count} examples I am about to give you.
Respond only in JSON format, just like the "Extracted Statements" in these {example_count} examples.
Do not say anything before or after the JSON output, or you will be penalized.
Only generate the data for the last question and ground truth provided."""

bedrock_correctness_prompt = Prompt(
    name="bedrock_correctness_prompt",
    instruction=new_instruction,
    examples=base_prompt.examples,
    input_keys=base_prompt.input_keys,
    output_key=base_prompt.output_key,
    output_type=base_prompt.output_type,
)

### Check what our new prompt looks like, then override the default prompt on the `answer_correctness` metric.  This is that destructive action I was talking about earlier.

In [23]:
# Show the prompt currently attached to the metric. On the first run this is the
# stock prompt; if this cell is re-run, "OLD PROMPT" will show the already
# overridden prompt, because the assignment below has replaced it.
print("OLD PROMPT:")
print(answer_correctness.correctness_prompt.to_string())

# Destructive step: replace the prompt on the shared `answer_correctness` object,
# so every later evaluate() call that uses this metric sees the new prompt.
answer_correctness.correctness_prompt = bedrock_correctness_prompt

# Show the prompt again to confirm the override took effect.
print("NEW PROMPT:")
print(answer_correctness.correctness_prompt.to_string())

OLD PROMPT:
Extract following from given question and ground truth

question: "What powers the sun and what is its primary function?"
answer: "The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system."
ground_truth: "The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents."
Extracted statements: [{{"statements that are present in both the answer and the ground truth": ["The sun's primary function is to provide light"], "statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"], "relevant statements found in 

### Now let's try the Bedrock eval again and see how it does.

In [24]:
# Re-run the Bedrock evaluation on the Amnesty QA eval split. The arguments match
# the earlier Bedrock run; what differs is the prompt now attached to the metric.
results_bedrock = evaluate(
    amnesty_qa["eval"], metrics=[answer_correctness], llm=llm, embeddings=embeddings
)

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 20/20 [00:18<00:00,  1.05it/s]


In [25]:
# Using Claude Instant (after the prompt override): print the aggregate score,
# then show the per-record scores as a DataFrame.
print(results_bedrock)
results_bedrock.to_pandas()

{'answer_correctness': 0.3516}


Unnamed: 0,question,ground_truth,answer,contexts,answer_correctness
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ...",0.412333
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[- Fossil fuel companies, whether state or pri...",0.329203
2,Which private companies in the Americas are th...,The largest private companies in the Americas ...,"According to the Carbon Majors database, the l...",[The private companies responsible for the mos...,0.454541
3,What action did Amnesty International urge its...,Amnesty International urged its supporters to ...,Amnesty International urged its supporters to ...,[Amnesty International called on its vast netw...,0.196804
4,What are the recommendations made by Amnesty I...,The recommendations made by Amnesty Internatio...,Amnesty International made several recommendat...,[Amnesty International recommends that the Spe...,0.184053
5,Who are the target audience of the two books c...,The target audience of the two books created b...,The target audience of the two books created b...,[Amnesty International has therefore created t...,0.621393
6,Which right guarantees access to comprehensive...,The right that guarantees access to comprehens...,The right that guarantees access to comprehens...,[26. The Act raises serious questions about it...,0.619421
7,Who has the right to be fully informed about h...,The victims of gross human rights violations a...,Everyone has the right to be fully informed ab...,[- The victims of gross human rights violation...,0.444759
8,When can individuals be found guilty under Art...,Individuals can be found guilty under Article ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...,0.189836
9,When does the prosecution consider statements ...,The prosecution considers statements contrary ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...,0.216172


In [26]:
# Re-run the Bedrock evaluation on the two-record local dataset with the
# prompt that is now attached to the metric.
results_bedrock_custom_dataset = evaluate(
    dataset["train"], metrics=[answer_correctness], llm=llm, embeddings=embeddings
)

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 2/2 [00:01<00:00,  1.79it/s]


In [27]:
# Using Claude Instant (after the prompt override) on the local dataset: print
# the aggregate score, then show the per-record scores as a DataFrame.
print(results_bedrock_custom_dataset)
results_bedrock_custom_dataset.to_pandas()

{'answer_correctness': 0.5636}


Unnamed: 0,question,ground_truth,answer,contexts,answer_correctness
0,What is my name?,Your name is Grey,Your name is Grey.,[My name is Grey],0.992291
1,What is my name?,Your name is Grey,Your name is Matt.,[My name is Grey],0.134976


# Comparing the results

In [28]:
def results_string(dataset):
    """Format one evaluation result as ``<score>  (Invalid Records: <k>/<n>)``.

    Args:
        dataset: An evaluation result (as returned by ``evaluate``) that supports
            ``["answer_correctness"]``, ``.to_pandas()`` and ``.dataset``.

    Returns:
        A string with the aggregate ``answer_correctness`` score, followed by how
        many per-record scores are null out of the number of evaluated records.
    """
    overall_score = dataset["answer_correctness"]
    per_record_scores = dataset.to_pandas()["answer_correctness"]
    invalid_count = per_record_scores.isnull().sum()
    record_count = len(dataset.dataset)
    return f"{overall_score}  (Invalid Records: {invalid_count}/{record_count})"

# Print the aggregate score and invalid-record count for every run, grouped by
# dataset, with a blank line between the two groups.
report_sections = (
    ("Premade Dataset:", (
        ("OpenAI", results_openai),
        ("Bad Bedrock", results_bedrock_bad),
        ("Hopefully Better Bedrock", results_bedrock),
    )),
    ("Custom Dataset (Should be around .5 ideally):", (
        ("OpenAI", results_openai_custom_dataset),
        ("Bad Bedrock", results_bedrock_bad_custom_dataset),
        ("Hopefully Better Bedrock", results_bedrock_custom_dataset),
    )),
)
for section_index, (heading, runs) in enumerate(report_sections):
    if section_index:
        print("")
    print(heading)
    for label, result in runs:
        print(f"{label}: {results_string(result)}")

Premade Dataset:
OpenAI: 0.44496384216791485  (Invalid Records: 0/20)
Bad Bedrock: 0.3720638482151994  (Invalid Records: 0/20)
Hopefully Better Bedrock: 0.35157994592822805  (Invalid Records: 0/20)

Custom Dataset (Should be around .5 ideally):
OpenAI: 0.6063143621778305  (Invalid Records: 0/2)
Bad Bedrock: 0.35530007032095545  (Invalid Records: 0/2)
Hopefully Better Bedrock: 0.5636334036542888  (Invalid Records: 0/2)
