In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the clients below — confirm.
load_dotenv()

True

In [2]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings


# Judge model handed to ragas.evaluate() for the LLM-based metrics, wrapped for Ragas.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
# OpenAI embeddings (library default model) wrapped for Ragas.
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Evaluate Generated Answers from Retrieval-Augmented Generation (RAG) for Question Answering with Gen AI Evaluation Service SDK

In [3]:
from vertexai.evaluation import (
    EvalTask,
    MetricPromptTemplateExamples,
    PointwiseMetric,
    EvalResult,
)

### Helper functions

In [4]:
# General
import pandas as pd
import plotly.graph_objects as go
from IPython.display import HTML, Markdown, display


def display_eval_report(eval_result, metrics=None):
    """Render one evaluation result as a title, a summary table and a per-row table.

    Args:
        eval_result: A ``(title, summary_metrics, report_df)`` triple, where
            ``summary_metrics`` is a mapping of metric name to score and
            ``report_df`` is a DataFrame of per-row results.
        metrics: Optional list of substrings. When non-empty, both tables are
            reduced to the columns whose name contains at least one of them.
    """
    title, summary_metrics, report_df = eval_result
    # One-row frame: metric names become the columns.
    metrics_df = pd.DataFrame.from_dict(summary_metrics, orient="index").T

    if metrics:

        def _is_selected(column):
            return any(wanted in column for wanted in metrics)

        metrics_df = metrics_df.filter(
            items=[column for column in metrics_df.columns if _is_selected(column)]
        )
        report_df = report_df.filter(
            items=[column for column in report_df.columns if _is_selected(column)]
        )

    # Each section is a Markdown heading optionally followed by a table.
    sections = (
        (f"## {title}", None),
        ("### Summary Metrics", metrics_df),
        ("### Report Metrics", report_df),
    )
    for heading, frame in sections:
        display(Markdown(heading))
        if frame is not None:
            display(frame)


def display_explanations(df, metrics=None, n=1):
    """Render a random sample of evaluation rows as HTML, one block per column.

    Args:
        df: DataFrame of per-row evaluation results.
        metrics: Optional list of substrings. When non-empty, only the
            input/response columns listed below plus the columns whose name
            contains one of the substrings are shown.
        n: Number of rows to sample. Values larger than ``len(df)`` are
            capped so that small tables do not raise.
    """
    from html import escape

    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"

    # DataFrame.sample raises when asked for more rows than exist (sampling
    # is without replacement), so never request more than the table holds.
    sample_size = n if n is None else min(n, len(df))
    df = df.sample(n=sample_size)

    if metrics:
        # Context columns kept next to the selected metrics. Both the
        # Vertex-style names and the Ragas names used in this notebook are
        # listed; DataFrame.filter ignores names that are absent.
        context_columns = [
            "instruction",
            "context",
            "user_input",
            "retrieved_contexts",
            "reference",
            "completed_prompt",
            "response",
        ]
        metric_columns = [
            column
            for column in df.columns
            if any(selected_metric in column for selected_metric in metrics)
        ]
        # dict.fromkeys de-duplicates while preserving order, so a metric
        # substring that also matches a context column is not shown twice.
        df = df.filter(items=list(dict.fromkeys(context_columns + metric_columns)))

    for _, row in df.iterrows():
        for col in df.columns:
            # Escape names and values: responses are free text and must not
            # be interpreted as markup.
            heading = escape(str(col))
            value = escape(str(row[col]))
            display(HTML(f"<h2>{heading}:</h2> <div style='{style}'>{value}</div>"))
        display(HTML("<hr>"))


def plot_radar_plot(eval_results, max_score=5, metrics=None):
    """Draw one radar (polar) trace per evaluation result and show the figure.

    Args:
        eval_results: Iterable of ``(title, summary_metrics, report_df)``
            triples; only the title and the summary mapping are used.
        max_score: Upper bound of the radial axis (the lower bound is 0).
        metrics: Optional list of substrings; when non-empty only summary
            metrics whose name contains one of them are plotted.
    """

    def _wanted(metric_name):
        return any(needle in metric_name for needle in metrics)

    fig = go.Figure()

    for title, summary_metrics, _ in eval_results:
        if metrics:
            summary_metrics = {
                name: score
                for name, score in summary_metrics.items()
                if _wanted(name)
            }

        trace = go.Scatterpolar(
            r=list(summary_metrics.values()),
            theta=list(summary_metrics.keys()),
            fill="toself",
            name=title,
        )
        fig.add_trace(trace)

    radial_axis = dict(visible=True, range=[0, max_score])
    fig.update_layout(polar=dict(radialaxis=radial_axis), showlegend=True)

    fig.show()


def plot_bar_plot(eval_results, metrics=None):
    """Draw a grouped bar chart with one bar series per evaluation result.

    Args:
        eval_results: Iterable of ``(title, summary_metrics, report_df)``
            triples; only the title and the summary mapping are used.
        metrics: Optional list of substrings; when non-empty only summary
            metrics whose name contains one of them are plotted.
    """
    bars = []

    for title, summary_metrics, _ in eval_results:
        if metrics:
            summary_metrics = {
                name: score
                for name, score in summary_metrics.items()
                if any(selected_metric in name for selected_metric in metrics)
            }

        bars.append(
            go.Bar(
                x=list(summary_metrics.keys()),
                y=list(summary_metrics.values()),
                name=title,
            )
        )

    # Build the figure exactly once, from the collected bar traces.
    fig = go.Figure(data=bars)

    # Place the series side by side instead of stacking them.
    fig.update_layout(barmode="group")
    fig.show()

### Reference-Free (without Golden Answer)

In Ragas, the metrics that do not require a reference answer are:
- ResponseRelevancy/ AnswerRelevancy
- Faithfulness  

(Whether the three metrics below require a reference depends on how they are defined.)
- AspectCritic
- SimpleCriteriaScore
- RubricsScore

### Prepare your dataset

In [5]:
# Questions put to both RAG systems. `user_inputs` and `questions` are two
# names bound to the same list object.
user_inputs = questions = [
    "Which part of the brain does short-term memory seem to rely on?",
    "What provided the Roman senate with exuberance?",
    "What area did the Hasan-jalalians command?",
]

# One retrieved passage per question, index-aligned with `user_inputs`.
retrieved_contexts = [
    "Short-term memory is supported by transient patterns of neuronal communication, dependent on regions of the frontal lobe (especially dorsolateral prefrontal cortex) and the parietal lobe. Long-term memory, on the other hand, is maintained by more stable and permanent changes in neural connections widely spread throughout the brain. The hippocampus is essential (for learning new information) to the consolidation of information from short-term to long-term memory, although it does not seem to store information itself. Without the hippocampus, new memories are unable to be stored into long-term memory, as learned from patient Henry Molaison after removal of both his hippocampi, and there will be a very short attention span. Furthermore, it may be involved in changing neural connections for a period of three months or more after the initial learning.",
    "In 62 BC, Pompey returned victorious from Asia. The Senate, elated by its successes against Catiline, refused to ratify the arrangements that Pompey had made. Pompey, in effect, became powerless. Thus, when Julius Caesar returned from a governorship in Spain in 61 BC, he found it easy to make an arrangement with Pompey. Caesar and Pompey, along with Crassus, established a private agreement, now known as the First Triumvirate. Under the agreement, Pompey's arrangements would be ratified. Caesar would be elected consul in 59 BC, and would then serve as governor of Gaul for five years. Crassus was promised a future consulship.",
    "The Seljuk Empire soon started to collapse. In the early 12th century, Armenian princes of the Zakarid noble family drove out the Seljuk Turks and established a semi-independent Armenian principality in Northern and Eastern Armenia, known as Zakarid Armenia, which lasted under the patronage of the Georgian Kingdom. The noble family of Orbelians shared control with the Zakarids in various parts of the country, especially in Syunik and Vayots Dzor, while the Armenian family of Hasan-Jalalians controlled provinces of Artsakh and Utik as the Kingdom of Artsakh.",
]

# Answers from RAG system A, index-aligned with the questions; `responses_a`
# and `generated_answers_by_rag_a` are the same list object.
responses_a = generated_answers_by_rag_a = [
    "frontal lobe and the parietal lobe",
    "The Roman Senate was filled with exuberance due to successes against Catiline.",
    "The Hasan-Jalalians commanded the area of Syunik and Vayots Dzor.",
]

# Answers from RAG system B, index-aligned with the questions; `responses_b`
# and `generated_answers_by_rag_b` are the same list object.
responses_b = generated_answers_by_rag_b = [
    "Occipital lobe",
    "The Roman Senate was subdued because they had food poisoning.",
    "The Galactic Empire commanded the state of Utah.",
]

In [6]:
from ragas.dataset_schema import SingleTurnSample, EvaluationDataset

# Derive the sample count from the data instead of hard-coding it, and fail
# fast if the index-aligned lists have drifted out of sync.
n = len(user_inputs)
assert len(retrieved_contexts) == len(responses_a) == len(responses_b) == n, (
    "user_inputs, retrieved_contexts, responses_a and responses_b must have the same length"
)


def _reference_free_samples(responses):
    """Build one reference-free SingleTurnSample per question for the given responses."""
    return [
        SingleTurnSample(
            user_input=user_input,
            # retrieved_contexts takes a list of strings; each question has
            # a single passage here, so wrap it in a one-element list.
            retrieved_contexts=[context],
            response=response,
        )
        for user_input, context, response in zip(
            user_inputs, retrieved_contexts, responses
        )
    ]


samples_a = _reference_free_samples(responses_a)
samples_b = _reference_free_samples(responses_b)

ragas_eval_dataset_a = EvaluationDataset(samples=samples_a)
ragas_eval_dataset_b = EvaluationDataset(samples=samples_b)

In [7]:
# Preview the reference-free dataset for RAG A as a DataFrame.
ragas_eval_dataset_a.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,response
0,Which part of the brain does short-term memory...,[Short-term memory is supported by transient p...,frontal lobe and the parietal lobe
1,What provided the Roman senate with exuberance?,"[In 62 BC, Pompey returned victorious from Asi...",The Roman Senate was filled with exuberance du...
2,What area did the Hasan-jalalians command?,[The Seljuk Empire soon started to collapse. I...,The Hasan-Jalalians commanded the area of Syun...


### Select and create metrics

#### Explore predefined metrics

- ResponseRelevancy/ AnswerRelevancy
- Faithfulness

In [8]:
from ragas.metrics import ResponseRelevancy, AnswerRelevancy, Faithfulness

# Predefined Ragas metrics used without a reference answer in this section.
# Both are created with default settings.
response_relevance = ResponseRelevancy()
faithfulness = Faithfulness()

#### Create custom metrics
- AspectCritic
- RubricsScore
- SimpleCriteriaScore

In [9]:
from ragas.metrics import AspectCritic, RubricsScore, SimpleCriteriaScore

# Definition text handed to the AspectCritic judge.
maliciousness_definition = (
    "Is the response intended to harm, deceive, or exploit users?"
)

# AspectCritic metric named "maliciousness"; the recorded result tables show
# it as a 0/1 column.
aspect_critic = AspectCritic(
    name="maliciousness",
    definition=maliciousness_definition,
)

# Adapted from Google's helpfulness prompt template: one description per
# score level, from 1 (worst) to 5 (best).
helpfulness_rubrics = {
    "score1_description": "Response is useless/irrelevant, contains inaccurate/deceptive/misleading information, and/or contains harmful/offensive content. The user would feel not at all satisfied with the content in the response.",
    "score2_description": "Response is minimally relevant to the instruction and may provide some vaguely useful information, but it lacks clarity and detail. It might contain minor inaccuracies. The user would feel only slightly satisfied with the content in the response.",
    "score3_description": "Response is relevant to the instruction and provides some useful content, but could be more relevant, well-defined, comprehensive, and/or detailed. The user would feel somewhat satisfied with the content in the response.",
    "score4_description": "Response is very relevant to the instruction, providing clearly defined information that addresses the instruction's core needs.  It may include additional insights that go slightly beyond the immediate instruction.  The user would feel quite satisfied with the content in the response.",
    "score5_description": "Response is useful and very comprehensive with well-defined key details to address the needs in the instruction and usually beyond what explicitly asked. The user would feel very satisfied with the content in the response.",
}

# Rubric-based metric named "helpfulness" that scores against the levels above.
rubrics_score = RubricsScore(name="helpfulness", rubrics=helpfulness_rubrics)

# Free-form criterion metric named "question_answering".
# NOTE(review): the recorded referenced run contains a score of 10 for this
# metric, i.e. outside the 0-5 range requested in the definition.
simple_criteria = SimpleCriteriaScore(
    name="question_answering",
    definition="Score 0 to 5 based on how well the response answers the user input",
)

### Run evaluation with your dataset

In [10]:
from ragas import evaluate

# Metrics that can be computed without a reference (golden) answer.
ragas_metrics = [
    aspect_critic,
    faithfulness,
    response_relevance,
    rubrics_score,
    simple_criteria,
]

# Pass the embeddings wrapper explicitly so that embedding-based metrics
# (response relevancy here) use the model configured at the top of the
# notebook instead of whatever Ragas would fall back to by default.
ragas_result_rag_a = evaluate(
    dataset=ragas_eval_dataset_a,
    metrics=ragas_metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)

ragas_result_rag_b = evaluate(
    dataset=ragas_eval_dataset_b,
    metrics=ragas_metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

### Display evaluation results

In [11]:
# Wrap each Ragas result in an EvalResult container: the summary dict goes
# into summary_metrics and the per-row DataFrame into metrics_table.
result_rag_a, result_rag_b = (
    EvalResult(
        summary_metrics=ragas_result._repr_dict,
        metrics_table=ragas_result.to_pandas(),
    )
    for ragas_result in (ragas_result_rag_a, ragas_result_rag_b)
)

#### View summary results

In [12]:
# Summary and per-row tables for RAG A (reference-free metrics).
display_eval_report(
    ("Model A Eval Result", result_rag_a.summary_metrics, result_rag_a.metrics_table)
)

## Model A Eval Result

### Summary Metrics

Unnamed: 0,maliciousness,faithfulness,answer_relevancy,helpfulness,question_answering
0,0.0,0.666667,0.921447,4.0,5.0


### Report Metrics

Unnamed: 0,user_input,retrieved_contexts,response,maliciousness,faithfulness,answer_relevancy,helpfulness,question_answering
0,Which part of the brain does short-term memory...,[Short-term memory is supported by transient p...,frontal lobe and the parietal lobe,0,1.0,0.820435,4,5
1,What provided the Roman senate with exuberance?,"[In 62 BC, Pompey returned victorious from Asi...",The Roman Senate was filled with exuberance du...,0,1.0,0.954082,4,5
2,What area did the Hasan-jalalians command?,[The Seljuk Empire soon started to collapse. I...,The Hasan-Jalalians commanded the area of Syun...,0,0.0,0.989822,4,5


In [13]:
# Summary and per-row tables for RAG B (reference-free metrics).
display_eval_report(
    eval_result=(
        "Model B Eval Result",
        result_rag_b.summary_metrics,
        result_rag_b.metrics_table,
    )
)

## Model B Eval Result

### Summary Metrics

Unnamed: 0,maliciousness,faithfulness,answer_relevancy,helpfulness,question_answering
0,0.0,0.0,0.844451,1.0,1.0


### Report Metrics

Unnamed: 0,user_input,retrieved_contexts,response,maliciousness,faithfulness,answer_relevancy,helpfulness,question_answering
0,Which part of the brain does short-term memory...,[Short-term memory is supported by transient p...,Occipital lobe,0,0.0,0.853086,1,0
1,What provided the Roman senate with exuberance?,"[In 62 BC, Pompey returned victorious from Asi...",The Roman Senate was subdued because they had ...,0,0.0,0.890192,1,2
2,What area did the Hasan-jalalians command?,[The Seljuk Empire soon started to collapse. I...,The Galactic Empire commanded the state of Utah.,0,0.0,0.790075,1,1


#### Visualize evaluation results

In [14]:
# (title, summary_metrics, metrics_table) triples in the shape expected by
# the plotting helpers.
eval_results = [
    ("Model A", result_rag_a.summary_metrics, result_rag_a.metrics_table),
    ("Model B", result_rag_b.summary_metrics, result_rag_b.metrics_table),
]

In [34]:
# Radar chart of every reference-free summary metric (metrics=None applies no
# filter); max_score=5 fixes the radial axis to [0, 5].
plot_radar_plot(eval_results, max_score=5, metrics=None)

In [16]:
# Grouped bar chart of the same summary metrics, one bar series per model.
plot_bar_plot(eval_results, metrics=None)

In [17]:
# Show two randomly sampled Model A rows with all of their columns.
display_explanations(result_rag_a.metrics_table, n=2)

In [18]:
# Show one randomly sampled Model B row, keeping only the helper's base
# columns plus any column whose name contains "faithfulness".
display_explanations(result_rag_b.metrics_table, metrics=["faithfulness"])

# Bring-Your-Own-Response Evaluation for RAG: Referenced (with Golden Answer)

### Prepare your dataset

In [19]:
# Questions for the referenced evaluation. This rebinds `questions` to a new
# list with the same contents as in the reference-free section.
questions = [
    "Which part of the brain does short-term memory seem to rely on?",
    "What provided the Roman senate with exuberance?",
    "What area did the Hasan-jalalians command?",
]

# One retrieved passage per question, index-aligned with `questions`.
retrieved_contexts = [
    "Short-term memory is supported by transient patterns of neuronal communication, dependent on regions of the frontal lobe (especially dorsolateral prefrontal cortex) and the parietal lobe. Long-term memory, on the other hand, is maintained by more stable and permanent changes in neural connections widely spread throughout the brain. The hippocampus is essential (for learning new information) to the consolidation of information from short-term to long-term memory, although it does not seem to store information itself. Without the hippocampus, new memories are unable to be stored into long-term memory, as learned from patient Henry Molaison after removal of both his hippocampi, and there will be a very short attention span. Furthermore, it may be involved in changing neural connections for a period of three months or more after the initial learning.",
    "In 62 BC, Pompey returned victorious from Asia. The Senate, elated by its successes against Catiline, refused to ratify the arrangements that Pompey had made. Pompey, in effect, became powerless. Thus, when Julius Caesar returned from a governorship in Spain in 61 BC, he found it easy to make an arrangement with Pompey. Caesar and Pompey, along with Crassus, established a private agreement, now known as the First Triumvirate. Under the agreement, Pompey's arrangements would be ratified. Caesar would be elected consul in 59 BC, and would then serve as governor of Gaul for five years. Crassus was promised a future consulship.",
    "The Seljuk Empire soon started to collapse. In the early 12th century, Armenian princes of the Zakarid noble family drove out the Seljuk Turks and established a semi-independent Armenian principality in Northern and Eastern Armenia, known as Zakarid Armenia, which lasted under the patronage of the Georgian Kingdom. The noble family of Orbelians shared control with the Zakarids in various parts of the country, especially in Syunik and Vayots Dzor, while the Armenian family of Hasan-Jalalians controlled provinces of Artsakh and Utik as the Kingdom of Artsakh.",
]

# Answers produced by RAG system A, index-aligned with `questions`.
generated_answers_by_rag_a = [
    "frontal lobe and the parietal lobe",
    "The Roman Senate was filled with exuberance due to successes against Catiline.",
    "The Hasan-Jalalians commanded the area of Syunik and Vayots Dzor.",
]

# Answers produced by RAG system B, index-aligned with `questions`.
generated_answers_by_rag_b = [
    "Occipital lobe",
    "The Roman Senate was subdued because they had food poisoning.",
    "The Galactic Empire commanded the state of Utah.",
]

# Golden (reference) answers, index-aligned with `questions`.
golden_answers = [
    "frontal lobe and the parietal lobe",
    "Due to successes against Catiline.",
    "The Hasan-Jalalians commanded the area of Artsakh and Utik.",
]

In [20]:
def _build_referenced_frame(responses):
    """Assemble the prompt/response/reference table for one RAG system."""
    prompts = []
    for question, item in zip(questions, retrieved_contexts):
        # Each prompt embeds the question followed by its retrieved passage.
        prompts.append("Answer the question: " + question + " Context: " + item)
    return pd.DataFrame(
        {
            "prompt": prompts,
            "response": responses,
            "reference": golden_answers,
        }
    )


# The two frames share prompts and references; only the responses differ.
referenced_eval_dataset_rag_a = _build_referenced_frame(generated_answers_by_rag_a)
referenced_eval_dataset_rag_b = _build_referenced_frame(generated_answers_by_rag_b)

In [21]:
from ragas.dataset_schema import SingleTurnSample, EvaluationDataset

# Build the samples from the lists defined in this section's own dataset cell
# (questions / generated_answers_by_rag_* / golden_answers), so that editing
# that cell actually changes what is evaluated. The sample count is derived
# from the data, and mismatched list lengths fail fast.
n = len(questions)
assert (
    len(retrieved_contexts)
    == len(generated_answers_by_rag_a)
    == len(generated_answers_by_rag_b)
    == len(golden_answers)
    == n
), "questions, contexts, generated answers and golden answers must have the same length"


def _referenced_samples(responses):
    """Build one SingleTurnSample with a reference answer per question."""
    return [
        SingleTurnSample(
            user_input=question,
            # retrieved_contexts takes a list of strings; each question has
            # a single passage here, so wrap it in a one-element list.
            retrieved_contexts=[context],
            response=response,
            reference=reference,
        )
        for question, context, response, reference in zip(
            questions, retrieved_contexts, responses, golden_answers
        )
    ]


samples_a = _referenced_samples(generated_answers_by_rag_a)
samples_b = _referenced_samples(generated_answers_by_rag_b)

ragas_referenced_eval_dataset_rag_a = EvaluationDataset(samples=samples_a)
ragas_referenced_eval_dataset_rag_b = EvaluationDataset(samples=samples_b)

#### Explore predefined metrics in Ragas

- Answer Correctness
- Answer Relevancy
- Semantic Similarity
- Context Precision
- Context Recall
- Context Utilization
- Noise Sensitivity
- Factual Correctness
- NonLLM String Similarity

In [22]:
from ragas.metrics import AnswerCorrectness, AnswerRelevancy, SemanticSimilarity, ContextPrecision, ContextRecall, ContextUtilization, NoiseSensitivity, FactualCorrectness

# Predefined Ragas metrics for the referenced evaluation, all created with
# default settings.
answer_correctness = AnswerCorrectness()
# NOTE(review): the recorded result tables contain a single
# `answer_relevancy` column even though this metric and `response_relevance`
# are both evaluated — they appear to share a name; confirm.
answer_relevancy = AnswerRelevancy()
semantic_similarity = SemanticSimilarity()
context_precision = ContextPrecision()
context_recall = ContextRecall()
context_utilization = ContextUtilization()
noise_sensitivity = NoiseSensitivity()
factual_correctness = FactualCorrectness()


#### Computational metrics in Ragas

In [23]:
from ragas.metrics import NonLLMStringSimilarity, BleuScore, RougeScore, StringPresence, ExactMatch

# Computational Ragas metrics (listed under "Computational metrics" above),
# created with default settings.
non_llm_string_similarity = NonLLMStringSimilarity()
bleu_score = BleuScore()
rouge_score = RougeScore()
string_present = StringPresence()
exact_match = ExactMatch()

#### Create custom metrics in Ragas
- AspectCritic
- RubricsScore
- SimpleCriteriaScore

In [24]:
from ragas.metrics import AspectCritic, SimpleCriteriaScore, RubricsScore

# The custom metrics were already defined in the reference-free section above
# and are reused here unchanged:
#   aspect_critic, rubrics_score, simple_criteria

### Run evaluation with your dataset

In [25]:
from ragas import evaluate

ragas_metrics = [
    # Predefined metrics.
    answer_correctness,
    answer_relevancy,
    semantic_similarity,
    context_precision,
    context_recall,
    context_utilization,
    noise_sensitivity,
    factual_correctness,
    # Computational metrics.
    non_llm_string_similarity,
    bleu_score,
    rouge_score,
    string_present,
    exact_match,
    # Metrics carried over from the reference-free section.
    # `response_relevance` is intentionally not repeated here: it reports
    # under the same `answer_relevancy` column as `answer_relevancy` above,
    # so listing both only repeats the same evaluation work.
    aspect_critic,
    faithfulness,
    rubrics_score,
    simple_criteria,
]

# Pass the embeddings wrapper explicitly so that embedding-based metrics use
# the model configured at the top of the notebook instead of whatever Ragas
# would fall back to by default.
ragas_referenced_result_rag_a = evaluate(
    dataset=ragas_referenced_eval_dataset_rag_a,
    metrics=ragas_metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)

ragas_referenced_result_rag_b = evaluate(
    dataset=ragas_referenced_eval_dataset_rag_b,
    metrics=ragas_metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)

Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]

### Display evaluation results

In [26]:
# Wrap each referenced Ragas result in an EvalResult container: the summary
# dict goes into summary_metrics and the per-row DataFrame into metrics_table.
referenced_result_rag_a, referenced_result_rag_b = (
    EvalResult(
        summary_metrics=ragas_result._repr_dict,
        metrics_table=ragas_result.to_pandas(),
    )
    for ragas_result in (ragas_referenced_result_rag_a, ragas_referenced_result_rag_b)
)

#### View summary results

In [27]:
# Summary and per-row tables for RAG A (referenced metrics).
display_eval_report(
    (
        "Model A Eval Result",
        referenced_result_rag_a.summary_metrics,
        referenced_result_rag_a.metrics_table,
    )
)

## Model A Eval Result

### Summary Metrics

Unnamed: 0,answer_correctness,answer_relevancy,semantic_similarity,context_precision,context_recall,context_utilization,noise_sensitivity_relevant(mode=relevant),factual_correctness(mode=f1),non_llm_string_similarity,bleu_score,rouge_score(mode=fmeasure),string_present,exact_match,maliciousness,faithfulness,helpfulness,question_answering
0,0.739917,0.921587,0.959666,1.0,1.0,0.666667,0.666667,0.333333,0.725641,0.595116,0.78338,0.333333,0.333333,0.0,0.666667,3.333333,5.333333


### Report Metrics

Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_correctness,answer_relevancy,semantic_similarity,context_precision,context_recall,context_utilization,...,factual_correctness(mode=f1),non_llm_string_similarity,bleu_score,rouge_score(mode=fmeasure),string_present,exact_match,maliciousness,faithfulness,helpfulness,question_answering
0,Which part of the brain does short-term memory...,[Short-term memory is supported by transient p...,frontal lobe and the parietal lobe,frontal lobe and the parietal lobe,1.0,0.820856,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0,1.0,4,10
1,What provided the Roman senate with exuberance?,"[In 62 BC, Pompey returned victorious from Asi...",The Roman Senate was filled with exuberance du...,Due to successes against Catiline.,0.983436,0.954082,0.933743,1.0,1.0,1.0,...,0.0,0.423077,0.289178,0.588235,0.0,0.0,0,1.0,4,5
2,What area did the Hasan-jalalians command?,[The Seljuk Empire soon started to collapse. I...,The Hasan-Jalalians commanded the area of Syun...,The Hasan-Jalalians commanded the area of Arts...,0.236314,0.989822,0.945256,1.0,1.0,0.0,...,0.0,0.753846,0.496168,0.761905,0.0,0.0,0,0.0,2,1


In [28]:
# Summary and per-row tables for RAG B (referenced metrics).
display_eval_report(
    eval_result=(
        "Model B Eval Result",
        referenced_result_rag_b.summary_metrics,
        referenced_result_rag_b.metrics_table,
    )
)

## Model B Eval Result

### Summary Metrics

Unnamed: 0,answer_correctness,answer_relevancy,semantic_similarity,context_precision,context_recall,context_utilization,noise_sensitivity_relevant(mode=relevant),factual_correctness(mode=f1),non_llm_string_similarity,bleu_score,rouge_score(mode=fmeasure),string_present,exact_match,maliciousness,faithfulness,helpfulness,question_answering
0,0.306119,0.84819,0.824476,1.0,1.0,0.0,0.0,0.0,0.354777,0.048257,0.231481,0.0,0.0,0.0,0.0,1.0,1.333333


### Report Metrics

Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_correctness,answer_relevancy,semantic_similarity,context_precision,context_recall,context_utilization,...,factual_correctness(mode=f1),non_llm_string_similarity,bleu_score,rouge_score(mode=fmeasure),string_present,exact_match,maliciousness,faithfulness,helpfulness,question_answering
0,Which part of the brain does short-term memory...,[Short-term memory is supported by transient p...,Occipital lobe,frontal lobe and the parietal lobe,0.209988,0.853086,0.840012,1.0,1.0,0.0,...,0.0,0.294118,0.0,0.25,0.0,0.0,0,0.0,1,1
1,What provided the Roman senate with exuberance?,"[In 62 BC, Pompey returned victorious from Asi...",The Roman Senate was subdued because they had ...,Due to successes against Catiline.,0.504828,0.890192,0.819246,1.0,1.0,0.0,...,0.0,0.278689,0.037478,0.0,0.0,0.0,0,0.0,1,2
2,What area did the Hasan-jalalians command?,[The Seljuk Empire soon started to collapse. I...,The Galactic Empire commanded the state of Utah.,The Hasan-Jalalians commanded the area of Arts...,0.203543,0.801293,0.814171,1.0,1.0,0.0,...,0.0,0.491525,0.107293,0.444444,0.0,0.0,0,0.0,1,1


#### Visualize evaluation results

In [29]:
# (title, summary_metrics, metrics_table) triples in the shape expected by
# the plotting helpers.
referenced_eval_results = [
    (
        "Model A",
        referenced_result_rag_a.summary_metrics,
        referenced_result_rag_a.metrics_table,
    ),
    (
        "Model B",
        referenced_result_rag_b.summary_metrics,
        referenced_result_rag_b.metrics_table,
    ),
]

In [30]:
# Radar chart of all referenced summary metrics for both models; the radial
# axis is fixed to [0, max_score].
# NOTE(review): Model A's recorded question_answering mean (5.33) lies
# outside this range.
plot_radar_plot(
    referenced_eval_results,
    max_score=5,
)

The cell below shows the problem with the `SimpleCriteriaScore` metric, for which I have raised a PR.

In [38]:
# Print the text of the metric's single-turn prompt. In the recorded output
# the few-shot examples return "claims" lists, while the stated schema asks
# for "reason" and "score".
print(simple_criteria.get_prompts()['single_turn_simple_criteria_prompt'].to_string())

Evaluate the input based on the criteria defined.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{'properties': {'reason': {'description': 'Reason for the scoring', 'title': 'Reason', 'type': 'string'}, 'score': {'description': 'The score for the submission', 'title': 'Score', 'type': 'integer'}}, 'required': ['reason', 'score'], 'title': 'SimpleCriteriaOutput', 'type': 'object'}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {
    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
}
Output: {
    "claims": [
        "Charles Babbage was a mathematician and philosopher."
    ]
}

Example 2
Input: {
    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
}
Output: {
    "claims": [
  

In [31]:
# Grouped bar chart of all referenced summary metrics, one bar series per model.
plot_bar_plot(
    referenced_eval_results,
)

#### View detailed explanation for an individual instance

In [32]:
# Show two randomly sampled Model A rows from the referenced run, all columns.
display_explanations(referenced_result_rag_a.metrics_table, n=2)

In [33]:
# An empty `metrics` list is falsy, so the helper applies no column filter;
# one randomly sampled row is shown (n defaults to 1).
display_explanations(
    referenced_result_rag_a.metrics_table, metrics=[]
)