In [1]:
# Tutorial for setting up a small RAG system using Faiss 
# and evaluating it using the Gemini Flash 1.5 LLM and the DeepEval library
# I use the Google Gemini API (free tier, local API key), but DeepEval is compatible with several LLMs
# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# DeepEval: https://docs.confident-ai.com/docs/guides-rag-evaluation

# Testing out this notebook with the latest version of DeepEval: v2.2.7
# Thankfully, not much had changes for the updates in DeepEval to break my code;
# Instead, the only additional parsing needed was on the output returned from an evaluation

# DeepEval v1.1.6 was fairly compatible with Google Gemini by creating a new LLM class that inherited from DeepEvalBaseLLM
# and adding methods that called Gemini's generation functions; it was a similar setup for the Embeddings, inheriting from DeepEvalBaseEmbeddingModel
# The only trick is that the LLM output needs to be in JSON format
# I used the pydantic and instructor libraries for this; the following gives good examples of how to use them
# Tutorial on using custom LLMs with DeepEval: https://docs.confident-ai.com/docs/guides-using-custom-llms

# Metrics available in DeepEval:
# - Contextual Precision: Evaluates whether the reranker in your retriever ranks more relevant nodes in your retrieval context higher than irrelevant ones.
# - Contextual Recall: Evaluates whether the embedding model in your retriever is able to accurately capture and retrieve relevant information based on the context of the input.
# - Contextual Relevance: Evaluates whether the text chunk size and top-K of your retriever is able to retrieve information without much irrelevancies.
# - Answer Relevancy: Evaluates whether the prompt template in your generator is able to instruct your LLM to output relevant and helpful outputs based on the retrieval_context.
# - Faithfulness: Evaluates whether the LLM used in your generator can output information that does not hallucinate AND contradict any factual information presented in the retrieval_context.
# - Other metrics are available for non-RAG systems; custom metrics can also be created (I did not test this)

In [19]:
# Set up environment

In [1]:
import os
from IPython.display import display, Markdown
import pandas as pd
from typing import List
from pydantic import BaseModel, ConfigDict # for JSON output from DeepEval
import instructor # for JSON output from DeepEval

# Replace these two Google Gemini imports with imports for your LLM
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# also need pip install llama-index-embeddings-langchain, added the below
from llama_index.embeddings.langchain import LangchainEmbedding

from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss

import deepeval
from deepeval.models import DeepEvalBaseLLM, DeepEvalBaseEmbeddingModel
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.synthesizer import Synthesizer
from deepeval import evaluate
from deepeval.evaluate import TestResult, print_test_result
from deepeval.metrics import (
    AnswerRelevancyMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric
)
#from deepeval.metrics.ragas import (
#    RagasMetric,
#    RAGASAnswerRelevancyMetric,
#    RAGASFaithfulnessMetric, 
#    RAGASContextualRecallMetric,
#    RAGASContextualPrecisionMetric,
#    RAGASContextualRelevancyMetric
#) 



In [2]:
# Environmental variable to opt out of DeepEval tracking telemetry data
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"

In [3]:
deepeval.telemetry_opt_out()

True

In [4]:
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [5]:
# set up local API key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [6]:
# Establish RAG pipeline with Gemini

In [7]:
# # Create a Faiss vector store for RAG
# # If you already have an index created, skip a few coding cells to the LLM / embeddings setup

# # Example of creating a small vector store
# # Using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online, edited to include a title with the date of the speech
# # Example from 2024:
# # https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/

# # load and parse files
# sotu = []
# newfiles = ["./Speeches/titleedits/state_of_the_union_042921.txt", "./Speeches/titleedits/state_of_the_union_030122.txt", "./Speeches/titleedits/state_of_the_union_020723.txt", "./Speeches/titleedits/state_of_the_union_030724.txt"]
# for i in newfiles:
#     with open(i) as file:
#         for line in file:
#             nl = line.rstrip()
#             if nl != '':
#                 sotu.append(nl)

# # convert into Document format
# documents = [Document(text=line) for line in sotu]

In [9]:
# # Example of a loaded Document line
# documents[-1]

Document(id_='833ea164-b547-46ec-8854-cefdc83fbb10', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [10]:
# # Set up the faiss index
# d = 768 # dimensions of the input vector of the embedding model that we're going to use; in this case, the google embedding model
# faiss_index = faiss.IndexFlatL2(d)
# print(faiss_index.is_trained) # double check that the training worked

True


In [6]:
# Set up the llm, embeddings, and Settings for Faiss 
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") # Replace with your LLM
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # Replace with your embeddings model
Settings.embed_model = doc_embeddings # used for LlamaIndex FaissVectorStore
Settings.llm = llm # used for LlamaIndex FaissVectorStore

In [13]:
# # Uncomment for when you need to re-embed and vectorize documents

# vector_store = FaissVectorStore(faiss_index=faiss_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context, show_progress=True
# )

# # Save index to disk
# index.storage_context.persist()

# # Save/remember index id for loading next time
# index.index_id

In [7]:
# After you have a saved index, load that index for RAG answer generation:

# load index from disk
vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
# My local index id '3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5' uses the 4 speeches including a title that includes the date it was given
# My local index id '95634851-570e-454e-983f-6634eeb72aee' contains 3200 documents from the rag_mini_wikipedia dataset
index = load_index_from_storage(storage_context=storage_context, index_id='95634851-570e-454e-983f-6634eeb72aee')

In [13]:
# # Optional- if you'd like to query your index
# # Set up query and chat engines with the index
# query_engine = index.as_query_engine(similarity_top_k=10)
# chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [None]:
# # Example query and response
# query = "In detail, what has the President done to improve the economy over the four years of his speeches?"
# response = query_engine.query(query) 
# print(response.response)

In [None]:
# # Get ranked scores for top k RAG source nodes
# for node in response.source_nodes:
#     print(f"{node.get_score()} -> {node.text}")

In [17]:
# # Example of using the chat engine with our index
# query = "You are an expert speech analyst and specialize in analyzing Presidential State of the Union speeches. Could you please analyze the speeches and generate 2 questions and answers from each speech, providing the document filename of each speech that relates to each question?"
# response = chat_engine.chat(query) 
# print(response.response)

In [None]:
# # Optional: View chat history
# chat_engine.chat_history

In [12]:
# Code for DeepEval RAG evaluation library to work with Gemini and our local RAG setup

# In DeepEval v1.1.6 and in v2.2.7, an example Input/Output/Context is a 'LLMTestCase'
# You can evaluate LLMTestCases individually or in a large batch with the evaluate function
# Metric scores are produced as well as an LLM generated explanation for a given score 

# DeepEval can also synthetically generate data from documents - see Synthesizer and generate_goldens_from_docs, below. 
# Of the 3 libraries I tested, this function produced the most human-realistic queries.
# DeepEval also offers other functionality besides RAG evaluation, including "red teaming LLM applications for security vulnerabilities"

# https://docs.confident-ai.com/docs/guides-rag-evaluation

In [6]:
# DeepEval requires a json response. In practice, this has led to malformed json returned from the llm, even with as simple of a schema as this, 
# but this and the LLM class can likely be refined to improve responses
class Response(BaseModel):
    response: str

In [7]:
# Non Open-AI usage requires a custom LLM class for using DeepEval
# Tutorial with example code on using custom LLMs with DeepEval: https://docs.confident-ai.com/docs/guides-using-custom-llms
class CustomGeminiFlash(DeepEvalBaseLLM):
    def __init__(self):
        self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")
        model_config  = ConfigDict(protected_namespaces=())

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel: 
        client = self.load_model()
        instructor_client = instructor.from_gemini(
            client=client,
            mode=instructor.Mode.GEMINI_JSON,
        )
        resp = instructor_client.messages.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            response_model=schema,
        )
        return resp

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)

    def get_model_name(self):
        return "Gemini 1.5 Flash"

In [8]:
# Similarly, a custom embedding model class is required for non Open-AI embeddings
# Tutorial on using custom embeddings with DeepEval: https://docs.confident-ai.com/docs/guides-using-custom-embedding-models 
class CustomGeminiEmbeddingModel(DeepEvalBaseEmbeddingModel):
    def __init__(self):
        model_config  = ConfigDict(protected_namespaces=())

    def load_model(self):
        return GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004"
        )

    def embed_text(self, text: str) -> List[float]:
        embedding_model = self.load_model()
        return embedding_model.embed_query(text)

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        embedding_model = self.load_model()
        return embedding_model.embed_documents(texts)

    async def a_embed_text(self, text: str) -> List[float]:
        embedding_model = self.load_model()
        return await embedding_model.aembed_query(text)

    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
        embedding_model = self.load_model()
        return await embedding_model.aembed_documents(texts)

    def get_model_name(self):
        "Custom Gemini Embeddings"

In [9]:
# Create custom llm and embeddings
custom_geminiflash = CustomGeminiFlash()
custom_geminiembeddings = CustomGeminiEmbeddingModel()

In [None]:
# Generate a synthetic dataset of "Goldens" (AKA a dataset with 'input', 'context', 'source_file' columns -- not 'Retrieval_Context') with DeepEval
dataset = EvaluationDataset()
synthesizer = Synthesizer(model=custom_geminiflash, embedder=custom_geminiembeddings)
dataset.generate_goldens_from_docs(
    synthesizer=synthesizer,
    document_paths=['Speeches/titleedits/state_of_the_union_042921.txt', 'Speeches/titleedits/state_of_the_union_030122.txt', 
                    'Speeches/titleedits/state_of_the_union_020723.txt', 'Speeches/titleedits/state_of_the_union_030724.txt'],
    max_goldens_per_document=3, # maximum number of questions to generate per document
    include_expected_output=True
)

dataset.save_as(file_type="csv", directory=".")

In [13]:
# Example of evaluating one example/test case

# Parameters in a DeepEval LLMTestCase:
# Input: Question/query for the LLM
# Actual Output: Answer returned from the LLM
# Expected Output: The ideal output for the input/question
# Retrieval Context (optional): LLM's actual retrieval results from the RAG system
# Context (optional): Additional ground truth context besides RAG

contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash)
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash)

test_case = LLMTestCase(
    input="What measures does the speaker propose to lower prescription drug costs in America?",
    actual_output="The speaker proposes giving Medicare the power to negotiate lower prescription drug prices, capping prescription drug costs at $2,000 a year for everyone, and allowing Medicare to negotiate lower prices for 500 drugs over the next decade.s",
    expected_output="The speaker proposes that Medicare should be given the power to negotiate lower drug prescription prices. They argue that this would save hundreds of billions of dollars and lower prescription drug costs for everyone. The speaker also states that the money saved could be used to strengthen the Affordable Care Act and expand Medicare coverage benefits without costing taxpayers an additional penny.",
    retrieval_context=['Let’s do what we’ve always talked about for all the years I was down here in this — in this body — in Congress.  Let’s give Medicare the power to save hundreds of billions of dollars by negotiating lower drug prescription prices.  (Applause.)', 'In fact, we pay the highest prescription drug prices of anywhere in the world right here in America — nearly three times — for the same drug, nearly three times what other countries pay.  We have to change that, and we can.', 'And we’re finally giving Medicare the power to negotiate drug prices. Bringing down prescription drug costs doesn’t just save seniors money.', 'For years people have talked about it but I finally got it done and gave Medicare the power to negotiate lower prices for prescription drugs just like the VA does for our veterans.', 'And, by the way, that won’t just — that won’t just help people on Medicare; it will lower prescription drug costs for everyone.', 'Now I want to cap prescription drug costs at $2,000 a year for everyone!', 'We know how to do this.  The last President had that as an objective.  We all know how outrageously expensive drugs are in America.', 'Make no mistake, if you try to do anything to raise the cost of prescription drugs, I will veto it.', 'Now it’s time to go further and give Medicare the power to negotiate lower prices for 500 drugs over the next decade.', 'It will cut the federal deficit, saving tax payers hundreds of billions of dollars on the prescription drugs the government buys for Medicare.']
)

In [23]:
# For RAG systems, DeepEval recommends the following Retrieval and Generation metrics:
# Retrieval metrics:
contextual_precision.measure(test_case)
print("Contextual Precision Score: ", contextual_precision.score)
print("Contextual Precision Reason: ", contextual_precision.reason)

contextual_recall.measure(test_case)
print("Contextual Recall Score: ", contextual_recall.score)
print("Contextual Recall Reason: ", contextual_recall.reason)

contextual_relevancy.measure(test_case)
print("Contextual Relevancy Score: ", contextual_relevancy.score)
print("Contextual Relevancy Reason: ", contextual_relevancy.reason)

Output()

Output()

Contextual Precision Score:  0.9095238095238096
Contextual Precision Reason:  The score is 0.91 because the first five nodes are relevant and directly address the speaker's proposal to lower drug prices through Medicare negotiation.  However, the sixth node, focusing on capping prescription drug costs,  is a separate proposal and doesn't directly align with the initial proposal, making it a relevant node ranked lower. The seventh and eighth nodes don't mention specific measures, making them less relevant compared to the first five nodes that directly discuss the speaker's proposed measure.


Output()

Contextual Recall Score:  1.0
Contextual Recall Reason:  The score is 1.00 because the speaker proposes that Medicare should be given the power to negotiate lower drug prescription prices, and the node(s) in retrieval context) support this with phrases like 'Let’s give Medicare the power to save hundreds of billions of dollars by negotiating lower drug prescription prices.'.


Contextual Relevancy Score:  0.5
Contextual Relevancy Reason:  The score is 0.50 because the reasons for irrelevancy indicate that the context doesn't specifically outline measures to lower prescription drug costs, but it does discuss the speaker's position and impact on costs. "The context discusses the high prescription drug prices in America and suggests changing them, but it doesn't mention any specific measures the speaker proposes to lower costs." and "The context only states the speaker's position on raising the cost of prescription drugs, not any measures to lower it." highlight this, suggesting a partial relevance as the context doesn't ignore the topic entirely.


In [29]:
# Generation metrics:
answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash)
                                 
answer_relevancy.measure(test_case)
print("Answer Relevancy Score: ", answer_relevancy.score)
print("Answer Relevancy Reason: ", answer_relevancy.reason)

faithfulness.measure(test_case)
print("Faithfulness Score: ", faithfulness.score)
print("Faithfulness Reason: ", faithfulness.reason)

Output()

Output()

Answer Relevancy Score:  1.0
Answer Relevancy Reason:  The score is 1.00 because the input asks for measures to lower prescription drug costs and the provided JSON correctly represents that information. Great job!


Faithfulness Score:  1.0
Faithfulness Reason:  The score is 1.00 because there are no contradictions, this is great!


In [10]:
# Example of measuring metrics for multiple test cases / a full dataset

# Load manually curated dataset
evaldataset = EvaluationDataset()
evaldataset.add_test_cases_from_csv_file(
    file_path="datasets/rag_mini_wikipedia_complete_chat_ares.csv",
    input_col_name="question",
    actual_output_col_name="answer",
    expected_output_col_name="ground_truth",
#    context_col_name="context",
#    context_col_delimiter= ",",
    retrieval_context_col_name="contexts", # Context that the LLM produced when it answered the Input
    retrieval_context_col_delimiter= ","
#    additional_metadata_col_name="source_file"
)

In [11]:
# Retriever metrics:
contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash, include_reason=False) # include_reason=False is an option
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash) # Note: this was the only metric that would not finish execution for the manually curated dataset (too many 429 errors)

# Generation metrics:
answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash, include_reason=False)

In [12]:
len(evaldataset.test_cases)

903

In [13]:
all_1results = []

In [22]:
#pd.DataFrame(contextprecision_results).to_clipboard(index=False,header=False)
len(all_1results)

0

In [50]:
# 2 Options for Metrics Evaluation: 

# 1) Iterating through test cases seems to work better than bulk evaluation with evaluate,
# as errors encountered with evaluate(...) cause no results to be returned
# Looping at least saves partial results until an error occurs
# I encountered this sometimes with contextual_relevancy and contextual_precision on the test dataset (429 errors or Invalid JSON errors),
# yet typically I got results if I iterated through individual test_cases
# For future: https://github.com/confident-ai/deepeval/issues/964 may assist with incorrect json errors like what was being returned

# Example for evaluating one test case at a time
# contextprecision_results = []
for i in range(0, 1): # testing first, typical:  # range(0, len(evaldataset.test_cases)
    print(i)
    eval_1metric = evaluate(test_cases=[evaldataset.test_cases[i]], metrics=[contextual_precision], throttle_value=10) #throttle_value is for rate limiting, in seconds between queries; 10 for rate limits on gemini 1.5, 20 for gemini 2.0 exp
    all_1results.append(eval_1metric.test_results[0]) # edit for newer (2.2.7) version of DeepEval

# 2) Evaluate through test_cases in bulk
# In testing, at least faithfulness, contextual_precision metrics worked this way with a small, manually curated dataset

# Throttle_value is for rate limiting- in seconds between queries
# test_precision = evaluate(test_cases=evaldataset.test_cases, metrics=[contextual_precision], throttle_value=90)

250


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.03s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.625, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What positions Ford played in the school football team?
  - actual output: Gerald Ford played center and linebacker for the University of Michigan football team.

  - expected output: Center and linebacker
  - context: []
  - retrieval context: ["['Ford attended Grand Rapids South High School and was a star athlete and captain of his football team. In 1930", " he was selected to the All-City team of the Grand Rapids City League. He also attracted the attention of college recruiters.'", " 'At Michigan", ' Ford became a member of the Delta Kappa Epsilon fraternity and washed dishes at his fraternity house to earn money for college expenses. Following his graduation in 1935 with a degree in political science and economics', ' he turned down contract offers from the Detroit Lions and Green Bay Packers of the N




251


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.00s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What did Ford say about his biological father?
  - actual output: Gerald Ford described his biological father as a "carefree, well-to-do man".  Later in life, he also described his biological father as abusive.

  - expected output: He was abusive and had a history of hitting his mother.
  - context: []
  - retrieval context: ["['Ford was not aware of his biological parentage until he was 17", ' when his parents told him about the circumstances of his birth. That same year his biological father', ' whom he described as a "carefree', ' well-to-do man"', ' approached Ford while he was waiting tables in a Grand Rapids restaurant. The two "maintained a sporadic contact" until Leslie King', " Sr.\\'s death", ' Associated Press. Nebraska - Born', ' Ford Left State As Infant. The New York Times (December 27', ' 200




252


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.01s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.6428571428571428, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Who did Ford nominate for Vice President?
  - actual output: After becoming President, Gerald Ford nominated Nelson Rockefeller to be Vice President.

  - expected output: Bob Dole
  - context: []
  - retrieval context: ['["Gerald Rudolph Ford', ' Jr. (July 14', ' 1913   December 26', ' 2006) was the thirty-eighth President of the United States', ' serving from 1974 to 1977', ' and the fortieth Vice President of the United States serving from 1973 to 1974. He was the first person appointed to the vice presidency under the terms of the 25th Amendment', " and became President upon Richard Nixon's resignation on August 9", ' 1974."', ' "Ford reluctantly agreed to run for office in 1976', " but first he had to counter a challenge for the Republican party nomination. Then-former Governor of Califor




253


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.01s/test case]



Metrics Summary

  - ❌ Contextual Precision (score: 0.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Was Ford active about Vietnamese affairs?
  - actual output: While Ford oversaw the final withdrawal of US personnel from Vietnam, the text indicates he did not actively intervene in Vietnamese affairs after the initial withdrawal of troops in 1973.

  - expected output: No
  - context: []
  - retrieval context: ["['As president", ' Ford signed the Helsinki Accords', ' marking a move toward dÃ©tente in the Cold War', ' even as South Vietnam', ' a former ally', ' was invaded and conquered by North Vietnam. Ford did not intervene in Vietamese affairs', ' but did help extract friends of the U.S. Domestically', ' the economy suffered from inflation and a recession under President Ford. One of his more controversial decisions was granting a presidential pardon to President Richard Nixon for his role in the Waterg




254


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ❌ Contextual Precision (score: 0.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: hard
  - actual output: That's a very general word!  To help me understand what you mean by "hard," could you please provide some context or clarify your question?  For example, are you referring to something that is:

* **Difficult?**  (e.g., "This problem is hard.")
* **Solid or firm?** (e.g., "The ground is hard.")
* **Severe or harsh?** (e.g., "Hard times.")
* **Difficult to understand?** (e.g., "That was a hard lecture.")
* **Working diligently?** (e.g., "He works hard.")

More information will help me give you a more relevant and helpful response.

  - expected output: hard
  - context: []
  - retrieval context: ["['â\\x80\\x9cRunning the â\\x80\\x98Machineâ\\x80\\x99â\\x80\\x9d'", " 'This facial expression shows fear.'", " '* As a result", " musth behaviour is at least partly due to the elephant being




255


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Had Ford's wife married before?
  - actual output: Yes, Betty Ford had been previously married to William G. Warren before marrying Gerald Ford.

  - expected output: Yes
  - context: []
  - retrieval context: ["['On October 15", ' 1948', ' at Grace Episcopal Church in Grand Rapids', ' Ford married Elizabeth Bloomer Warren', " a department store fashion consultant. Warren had been a John Robert Powers fashion model and a dancer in the auxiliary troupe of the Martha Graham Dance Company. She had previously been married to and divorced from William G. Warren.'", " 'The Fords on their wedding day", ' October 15', " 1948'", " 'The Fords had four children:'", " '*On November 12", ' 2006', ' Ford became the longest-lived President', ' surpassing Ronald Reagan.    At the time of his death', " he had outlived Presid




256


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.6428571428571428, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Is Ford related with the assassination of John F. Kennedy?
  - actual output: Gerald Ford's involvement with the assassination of John F. Kennedy stems from his role on the Warren Commission.  He was tasked with writing a biography of Lee Harvey Oswald and some believe he altered the report to support the single-bullet theory.  However, there's no evidence suggesting he was directly involved in the assassination itself.  His connection is solely related to his work on the commission's investigation.

  - expected output: Yes
  - context: []
  - retrieval context: ["['In November 1963", ' President Lyndon B. Johnson appointed Ford to the Warren Commission', ' a special task force set up to investigate the assassination of President John F. Kennedy. Ford was assigned to prepare a biography of Le




257


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Was Gerald Ford the 38th President of the United States?
  - actual output: Yes, Gerald Ford was the 38th President of the United States.

  - expected output: yes
  - context: []
  - retrieval context: ['["Gerald Rudolph Ford', ' Jr. (July 14', ' 1913   December 26', ' 2006) was the thirty-eighth President of the United States', ' serving from 1974 to 1977', ' and the fortieth Vice President of the United States serving from 1973 to 1974. He was the first person appointed to the vice presidency under the terms of the 25th Amendment', " and became President upon Richard Nixon's resignation on August 9", ' 1974."', " 'Gerald Ford is sworn in as the 38th President of the United States by Chief Justice Warren Burger in the White House East Room", " while Betty Ford looks on.'", ' "Gerald R. Ford was born Leslie




258


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.8928571428571428, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Did Gerald Ford serve as the Republican Minority Leader of the House of Representatives?
  - actual output: Yes, Gerald Ford served as the Republican Minority Leader of the House of Representatives for over eight years.

  - expected output: yes
  - context: []
  - retrieval context: ['["Gerald Rudolph Ford', ' Jr. (July 14', ' 1913   December 26', ' 2006) was the thirty-eighth President of the United States', ' serving from 1974 to 1977', ' and the fortieth Vice President of the United States serving from 1973 to 1974. He was the first person appointed to the vice presidency under the terms of the 25th Amendment', " and became President upon Richard Nixon's resignation on August 9", ' 1974."', ' "Prior to 1973', ' Ford served for over eight years as the Republican Minority Leader of the Unite




259


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.9166666666666666, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Was Ford an Eagle Scout?
  - actual output: Yes, Gerald Ford was an Eagle Scout.

  - expected output: yes
  - context: []
  - retrieval context: ["['Ford attended Grand Rapids South High School and was a star athlete and captain of his football team. In 1930", " he was selected to the All-City team of the Grand Rapids City League. He also attracted the attention of college recruiters.'", " 'Eagle Scout Gerald Ford (circled in red) in 1929.'", ' "Ford joined the Boy Scouts of America', " and attained that program's highest rank", ' Eagle Scout. He always regarded this as one of his proudest accomplishments', ' even after attaining the White House.     In subsequent years', ' Ford received the Distinguished Eagle Scout Award in May 1970 and Silver Buffalo Award from the Boy Scouts of America. H




260


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What district was Ford elected from?
  - actual output: Gerald Ford was originally elected to Congress from Michigan's 5th congressional district.

  - expected output: Michigan's 5th congressional district
  - context: []
  - retrieval context: ['["Prior to 1973', ' Ford served for over eight years as the Republican Minority Leader of the United States House of Representatives; he was originally elected to Congress in 1948 from Michigan\'s 5th congressional district."', " 'Ford was a member of the House of Representatives for twenty-four years", ' holding the Grand Rapids congressional district seat from 1949 to 1973. It was a tenure largely notable for its modesty. As an editorial in The New York Times described him', ' Ford "saw himself as a negotiator and a reconciler', ' and the record shows it: he did 




261


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: How many Eagle Scouts were involved in Ford's funeral procession?
  - actual output: Approximately 400 Eagle Scouts were involved in Gerald Ford's funeral procession.

  - expected output: 400
  - context: []
  - retrieval context: ['["Ford joined the Boy Scouts of America', " and attained that program's highest rank", ' Eagle Scout. He always regarded this as one of his proudest accomplishments', ' even after attaining the White House.     In subsequent years', ' Ford received the Distinguished Eagle Scout Award in May 1970 and Silver Buffalo Award from the Boy Scouts of America. He is the only US president who was an Eagle Scout.    Scouting was so important to Ford that his family asked that Scouts participate in his funeral. About 400 Eagle Scouts were part of the funeral procession', ' where they formed




262


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.5833333333333334, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Which future Heisman Trophy winner did Ford tackle?
  - actual output: Gerald Ford tackled Jay Berwanger, who won the first Heisman Trophy in 1935.

  - expected output: Jay Berwanger
  - context: []
  - retrieval context: ["['Ford attended Grand Rapids South High School and was a star athlete and captain of his football team. In 1930", " he was selected to the All-City team of the Grand Rapids City League. He also attracted the attention of college recruiters.'", " 'Ford as a University of Michigan football player", " 1933'", " 'At Michigan", ' Ford became a member of the Delta Kappa Epsilon fraternity and washed dishes at his fraternity house to earn money for college expenses. Following his graduation in 1935 with a degree in political science and economics', ' he turned down contract offer




263


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Has another US President been an Eagle Scout?
  - actual output: No, Gerald Ford is the only U.S. President who was an Eagle Scout.

  - expected output: no
  - context: []
  - retrieval context: ["['Eagle Scout Gerald Ford (circled in red) in 1929.'", ' "Ford joined the Boy Scouts of America', " and attained that program's highest rank", ' Eagle Scout. He always regarded this as one of his proudest accomplishments', ' even after attaining the White House.     In subsequent years', ' Ford received the Distinguished Eagle Scout Award in May 1970 and Silver Buffalo Award from the Boy Scouts of America. He is the only US president who was an Eagle Scout.    Scouting was so important to Ford that his family asked that Scouts participate in his funeral. About 400 Eagle Scouts were part of the funeral procession',




264


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Did Ford need to do extra work to pay for college?
  - actual output: Yes, Gerald Ford washed dishes at his fraternity house to help pay for college expenses.

  - expected output: yes
  - context: []
  - retrieval context: ["['Ford attended Grand Rapids South High School and was a star athlete and captain of his football team. In 1930", " he was selected to the All-City team of the Grand Rapids City League. He also attracted the attention of college recruiters.'", " 'At Michigan", ' Ford became a member of the Delta Kappa Epsilon fraternity and washed dishes at his fraternity house to earn money for college expenses. Following his graduation in 1935 with a degree in political science and economics', ' he turned down contract offers from the Detroit Lions and Green Bay Packers of the National Football League




265


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.8528571428571429, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Have more than five Presidents lived past the age of 90?
  - actual output: No, the text only names four presidents who lived past the age of 90:  John Adams, Herbert Hoover, Ronald Reagan, and Gerald Ford.  Therefore, fewer than five presidents have lived past 90.

  - expected output: no
  - context: []
  - retrieval context: ["['*Ford is one of only four former Presidents to live to 90 or more years of age. The others are Ronald Reagan (93)", " John Adams (90) and Herbert Hoover (90).'", " 'John Adams remains the longest-lived person ever elected to both of the highest offices in the United States.'", " '*Ford was the third longest lived Vice President at the age of 93. The two oldest were John Nance Garner", ' 98', ' and Levi P. Morton', " 96.'", " '*On November 12", ' 2006', ' Ford became




266


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ❌ Contextual Precision (score: 0.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: How do kwajalein and eniwetok relate?
  - actual output: Kwajalein and Enewetak are both atolls in the Marshall Islands in the Pacific Ocean.  They were both heavily used for nuclear weapons testing by the United States during the mid-20th century.  Enewetak Atoll was the site of many of the early tests, while Kwajalein Atoll was later used as a missile testing range.  Their proximity and shared history as nuclear testing sites are their key relationship.

  - expected output: The Monterey supported landings at both locations.
  - context: []
  - retrieval context: ["['#Canadian Arctic Archipelago'", ' \'A Kwakwaka\\\'wakw totem pole and traditional "big house" in Victoria', " BC.'", ' \'* "Gazetteer of Planetary Nomenclature"', '  Moon Nomenclature: Crater. USGS', " Astrogeology Research Program.'", " 'Chim




267


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What did Ford receive on April 13, 1942?
  - actual output: On April 13, 1942, Gerald Ford received a commission as an ensign in the U.S. Naval Reserve.

  - expected output: Ford received a commission as ensign in the US Naval Reserve.
  - context: []
  - retrieval context: ["['Ford in Navy uniform", " 1945'", " 'Ford received a commission as ensign in the U.S. Naval Reserve on April 13", ' 1942. On April 20', ' he reported for active duty to the V-5 instructor school at Annapolis', ' Maryland. After one month of training', ' he went to Navy Preflight School in Chapel Hill', ' North Carolina', ' where he was one of 83 instructors and taught elementary seamanship', ' ordnance', ' gunnery', ' first aid and military drill. In addition', ' he coached in all nine sports that were offered', ' but mostly in swimmi




268


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Was Ford released from the hospital?
  - actual output: The provided text doesn't specify if Ford was ever hospitalized, let alone released from one.  To answer this, more information is needed.

  - expected output: Yes.
  - context: []


Overall Metric Pass Rates

Contextual Precision: 100.00% pass rate







269


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Was Ford interred at his Presidential Museum in Grand Rapids, Michigan?
  - actual output: Yes, Gerald Ford is interred at his Presidential Museum in Grand Rapids, Michigan.

  - expected output: Yes.
  - context: []
  - retrieval context: ["['After the service", ' Ford was interred at his Presidential Museum in Grand Rapids', " Michigan.'", ' "President Ford\'s tomb at his Presidential Museum in Grand Rapids', ' Michigan."', " 'On December 30", ' 2006', ' Ford became the 11th U.S. President to lie in state.  The burial was preceded by a state funeral and memorial services held at the National Cathedral in Washington', ' D.C. on January 2', ' 2007. Ford was eulogized by former President George H. W. Bush', ' former Secretary of State Henry Kissinger', ' former NBC Nightly News anchorman Tom Brokaw and curren




270


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Is Ford one of only four former Presidents to live to 90 or more years of age ?
  - actual output: Yes, the text explicitly states that Ford is one of only four former Presidents to live to 90 or more years of age.

  - expected output: Yes.
  - context: []
  - retrieval context: ["['*Ford is one of only four former Presidents to live to 90 or more years of age. The others are Ronald Reagan (93)", " John Adams (90) and Herbert Hoover (90).'", " '*On November 12", ' 2006', ' Ford became the longest-lived President', ' surpassing Ronald Reagan.    At the time of his death', " he had outlived President Reagan by 45 days.'", " '*Ford was the third longest lived Vice President at the age of 93. The two oldest were John Nance Garner", ' 98', ' and Levi P. Morton', " 96.'", " 'Ford at his 90th birthday party with L




271


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Was Ford the last surviving member of the Warren Commission ?
  - actual output: Yes, the text explicitly states that Ford was the last surviving member of the Warren Commission.

  - expected output: Yes.
  - context: []
  - retrieval context: ["['*Ford was the last surviving member of the Warren Commission. Wilson", ' Jeff.  Former President Gerald Ford Dies at 93. Associated Press. December 27', ' 2006. Also available  here.  Retrieved on December 31', " 2006.'", " 'In November 1963", ' President Lyndon B. Johnson appointed Ford to the Warren Commission', ' a special task force set up to investigate the assassination of President John F. Kennedy. Ford was assigned to prepare a biography of Lee Harvey Oswald', ' the accused assassin. In 1997 the Assassination Records Review Board (ARRB) released a document




272


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ❌ Contextual Precision (score: 0.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Did Grand Rapids supporters not urge him to take on Bartel J. Jonkman , the incumbent Republican congressman ?
  - actual output: No, the text explicitly states that Grand Rapids supporters *did* urge Ford to take on Bartel J. Jonkman.

  - expected output: No.
  - context: []
  - retrieval context: ["['Following his return from the war", ' Ford became active in local Republican politics. Grand Rapids supporters urged him to take on Bartel J. Jonkman', ' the incumbent Republican congressman. Ford had changed his view of the world as a result of his military service; "I came back a converted internationalist"', ' Ford stated', ' "and of course our congressman at that time was an avowed', ' dedicated isolationist. And I thought he ought to be replaced. Nobody thought I could win. I ended up winning two to one.




273


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.8392857142857142, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Did they not claim Ford 's pardon was quid pro quo in exchange for Nixon 's resignation ?
  - actual output: No, critics *did* claim Ford's pardon of Nixon was a quid pro quo exchange for Nixon's resignation.

  - expected output: They did claim Ford's pardon was quid pro quo.
  - context: []


Overall Metric Pass Rates

Contextual Precision: 100.00% pass rate







274


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Does Ghana have twice the per capita output of the poorer countries in West Africa?
  - actual output: Yes, the text states that Ghana has twice the per capita output of the poorer countries in West Africa.

  - expected output: yes
  - context: []
  - retrieval context: ["['Well endowed with natural resources", ' Ghana has twice the per capita output of the poorer countries in West Africa. Even so', ' Ghana remains heavily dependent on international financial and technical assistance. Gold', ' timber', ' cocoa', ' diamond', ' bauxite', " and manganese exports are major sources of foreign exchange.  The World Factbook   An oilfield which is reported to contain up to 3 billion barrels of light oil was discovered in 2007.'", " 'Ghana is a country located on the Gulf of Guinea", ' only a few degrees north of th




275


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Is English Ghana's official language?
  - actual output: Yes, English is Ghana's official language.

  - expected output: yes
  - context: []
  - retrieval context: ['["More than 100 languages and dialects are spoken in Ghana. English is the country\'s official language and predominates government and business affairs. It is also the standard language used for educational instruction. Native Ghanaian languages are divided into two linguistic subfamilies of the Niger-Congo language family. Languages belonging to the Kwa subfamily are found predominantly to the south of the Volta River', ' while those belonging to the Gur subfamily are found predominantly to the north. The Kwa group', " which is spoken by about 75% of the country's population", ' includes the Akan', ' Ga-Dangme', ' and Ewe languages. The Gur g




276


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ❌ Contextual Precision (score: 0.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Did the Dutch build the Elmina Castle?
  - actual output: No, the Portuguese built Elmina Castle in 1482.  The Dutch later captured it in 1637.

  - expected output: No
  - context: []
  - retrieval context: ['["In 1482', ' the Portuguese built the Elmina Castle. Their aim was to trade in gold', ' ivory and slaves. In 1481 King John II of Portugal sent Diogo d\'Azambuja to build this castle."', " 'In 1598 the Dutch joined them", ' and built forts at Komenda and Kormantsi. In 1637 they captured the castle from the Portuguese and that of Axim in 1642 (Fort St Anthony). Other European traders joined in by the mid 17th century. These were the English', ' Danes and Swedes. The coastline was dotted by more than 30 forts and castles built by the Dutch', ' British and the Danish merchants', ' the highest concentrati




277


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.8333333333333334, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: When did Ghana achieve independence from the United Kingdom?
  - actual output: Ghana achieved independence from the United Kingdom on March 6, 1957.

  - expected output: 1957
  - context: []
  - retrieval context: ["['Ghana was inhabited in pre-colonial times by a number of ancient kingdoms", ' including the Ga Adangbes on the eastern coast', ' inland Empire of Ashanti and various Fante states along the coast and inland. Trade with European states flourished after contact with the Portuguese in the 15th century', ' and the British established a crown colony', ' Gold Coast', ' in 1874. MacLean', ' Iain. Rational Choice and British Politics: An Analysis of Rhetoric and Manipulation from Peel to Blair', " 2001. Page 76.'", " 'Formed from the merger of the British colony Gold Coast", ' The Empir




278


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: How many regions is Ghana divided into?
  - actual output: Ghana is divided into 10 regions.

  - expected output: 10
  - context: []
  - retrieval context: ["['Ghana is a divided into 10 regions", " subdivided into a total of 138 districts. The regions are:'", " 'Regions of Ghana'", " 'Ghana is a country located on the Gulf of Guinea", ' only a few degrees north of the Equator', ' therefore giving it a warm climate. The Greenwich Meridian also passes through Ghana', ' specifically through the industrial city of Ghana-Tema; so it is said that Ghana is geographically closer to the "center" of the world than any other country. La Cote d\\\'Ivoire is located to the west of Ghana while Burkina Faso and Togo are located to its north and east respectively. The coastline is mostly a low', ' sandy shore backed by pl




279


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Who is the head of state of Ghana?
  - actual output: The provided text names John Agyekum Kufuor as the head of state of Ghana, but this is outdated.  The current head of state would need to be looked up in a current source.

  - expected output: President John Agyekum Kofuor
  - context: []
  - retrieval context: ["['Ghana is a republic and member of the Commonwealth of Nations. Its head of state is President John Agyekum Kufuor", ' the ninth leader of the country since independence. The government sits at Osu Castle. The Parliament of Ghana is unicameral and dominated by two main parties', " the New Patriotic Party and National Democratic Congress.'", " 'Formed from the merger of the British colony Gold Coast", ' The Empire of Ashanti and the British Togoland trust territory by a UN sponsored plebiscite',




280


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What is the dominant religion in Ghana?
  - actual output: Christianity is the dominant religion in Ghana.

  - expected output: Christian
  - context: []
  - retrieval context: ["['Religions: Christian 63%", ' African beliefs 21%', ' Muslim 16%', '  Facts on People of Ghana', ' accessed July 13', " 2006'", ' "More than 100 languages and dialects are spoken in Ghana. English is the country\'s official language and predominates government and business affairs. It is also the standard language used for educational instruction. Native Ghanaian languages are divided into two linguistic subfamilies of the Niger-Congo language family. Languages belonging to the Kwa subfamily are found predominantly to the south of the Volta River', ' while those belonging to the Gur subfamily are found predominantly to the north. 




281


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.01s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What are the two subfamilies of the Native Ghanaian languages?
  - actual output: The two subfamilies of the Native Ghanaian languages are Kwa and Gur.

  - expected output: Kwa and Gur
  - context: []
  - retrieval context: ['["More than 100 languages and dialects are spoken in Ghana. English is the country\'s official language and predominates government and business affairs. It is also the standard language used for educational instruction. Native Ghanaian languages are divided into two linguistic subfamilies of the Niger-Congo language family. Languages belonging to the Kwa subfamily are found predominantly to the south of the Volta River', ' while those belonging to the Gur subfamily are found predominantly to the north. The Kwa group', " which is spoken by about 75% of the country's population", ' incl




282


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What is the weather like at Lake Volta?
  - actual output: The weather at Lake Volta is described as hot and humid, consistent with the tropical climate of the area.  More specific details on temperature and rainfall would require additional data.

  - expected output: Warm and comparatively dry
  - context: []
  - retrieval context: ['["The climate is tropical. The eastern coastal belt is warm and comparatively dry (see Dahomey Gap); the southwest corner', ' hot and humid; and the north', ' hot and dry. Lake Volta', " the world's largest artificial lake", ' extends through large portions of eastern Ghana."', " 'The coolest month is June", ' while the warmest is January. The rainfall is equally distributed throughout the year', ' but tends to be a bit more frequent in the autumn months. There can be frequent




283


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.01s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.9213664274366018, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Is Ghana in Asia?
  - actual output: No, Ghana is in Africa.

  - expected output: no
  - context: []
  - retrieval context: ["['Ghana is a country located on the Gulf of Guinea", ' only a few degrees north of the Equator', ' therefore giving it a warm climate. The Greenwich Meridian also passes through Ghana', ' specifically through the industrial city of Ghana-Tema; so it is said that Ghana is geographically closer to the "center" of the world than any other country. La Cote d\\\'Ivoire is located to the west of Ghana while Burkina Faso and Togo are located to its north and east respectively. The coastline is mostly a low', ' sandy shore backed by plains and scrub and intersected by several rivers and streams. A tropical rain forest belt', ' broken by heavily forested hills and many streams 




284


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Is English the official language?
  - actual output: Yes, English is the official language of Ghana.

  - expected output: yes
  - context: []
  - retrieval context: ['["Canada\'s two official languages are  English and French. Official Bilingualism in Canada is law', ' defined in the Canadian Charter of Rights and Freedoms', ' the Official Languages Act', ' and Official Language Regulations; it is applied by the Commissioner of Official Languages. English and French have equal status in federal courts', ' Parliament', ' and in all federal institutions. The public has the right', ' where there is sufficient demand', ' to receive federal government services in either English or French', ' and official language minorities are guaranteed their own schools in all provinces and territories."', " 'The national lan




285


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Is Ghana's head of state John Agyekum Kufuor?
  - actual output: While the text mentions John Agyekum Kufuor as the head of state, that information is outdated.  He was the head of state at the time the text was written, but that is no longer the case.  To find the current head of state, a more recent source is needed.

  - expected output: yes
  - context: []
  - retrieval context: ["['Ghana is a republic and member of the Commonwealth of Nations. Its head of state is President John Agyekum Kufuor", ' the ninth leader of the country since independence. The government sits at Osu Castle. The Parliament of Ghana is unicameral and dominated by two main parties', " the New Patriotic Party and National Democratic Congress.'", " 'Formed from the merger of the British colony Gold Coast", ' The Empire of Ashanti an




286


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.6666666666666666, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What does the word Ghana mean?
  - actual output: The word "Ghana" means "Warrior King".

  - expected output: warrior king
  - context: []
  - retrieval context: ["['The Republic of Ghana is a country in West Africa. It borders CÃ´te d\\'Ivoire (also known as Ivory Coast) to the west", ' Burkina Faso to the north', ' Togo to the east', ' and the Gulf of Guinea to the south. The word "Ghana" means "Warrior King"', ' Jackson', ' John G. Introduction to African Civilizations', ' 2001. Page 201.  and was the source of the name "Guinea" (via French Guinoye) used to refer to the West African coast (as in Gulf of Guinea).\'', " 'Ghana is a country located on the Gulf of Guinea", ' only a few degrees north of the Equator', ' therefore giving it a warm climate. The Greenwich Meridian also passes throu




287


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Who was Kwame Nkrumah?
  - actual output: Kwame Nkrumah was the founder and first president of Ghana.  He was a key figure in the fight for Ghana's independence from British colonial rule and a prominent advocate of Pan-Africanism.

  - expected output: founder and first president
  - context: []
  - retrieval context: ["['Formed from the merger of the British colony Gold Coast", ' The Empire of Ashanti and the British Togoland trust territory by a UN sponsored plebiscite', ' Ghana became the first democratic sub-Sahara country in colonial Africa to gain its independence in 1957. Kwame Nkrumah', ' founder and first president of the modern Ghanaian state', ' was not only an African anti-colonial leader but also one with a dream of a united Africa which would not drift into neo-colonialism. He was the first Af




288


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.5873015873015873, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Where is old Ghana in relation to present Ghana?
  - actual output: Old Ghana (the medieval empire) was located approximately 500 miles north of present-day Ghana, between the Senegal and Niger Rivers.

  - expected output: 500 miles north
  - context: []
  - retrieval context: ["['Geographically", ' the old Ghana is 500 miles north of the present Ghana', " and occupied the area between Rivers Senegal and Niger.'", " 'Ghana is a country located on the Gulf of Guinea", ' only a few degrees north of the Equator', ' therefore giving it a warm climate. The Greenwich Meridian also passes through Ghana', ' specifically through the industrial city of Ghana-Tema; so it is said that Ghana is geographically closer to the "center" of the world than any other country. La Cote d\\\'Ivoire is located to the




289


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Who makes up Ghana's Parliament?
  - actual output: Ghana's Parliament is made up of members from two main parties: the New Patriotic Party and the National Democratic Congress.

  - expected output: the New Patriotic Party and National Democratic Congress
  - context: []
  - retrieval context: ["['Ghana is a republic and member of the Commonwealth of Nations. Its head of state is President John Agyekum Kufuor", ' the ninth leader of the country since independence. The government sits at Osu Castle. The Parliament of Ghana is unicameral and dominated by two main parties', " the New Patriotic Party and National Democratic Congress.'", " 'Ghana was inhabited in pre-colonial times by a number of ancient kingdoms", ' including the Ga Adangbes on the eastern coast', ' inland Empire of Ashanti and various Fante st




290


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.9480519480519481, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What European countries established states in Ghana?
  - actual output: Portugal, Britain, the Netherlands, Denmark, and Sweden all established trading posts and, at times, exerted control over various parts of what is now Ghana.  The British ultimately gained dominance.

  - expected output: The UK
  - context: []
  - retrieval context: ["['Ghana was inhabited in pre-colonial times by a number of ancient kingdoms", ' including the Ga Adangbes on the eastern coast', ' inland Empire of Ashanti and various Fante states along the coast and inland. Trade with European states flourished after contact with the Portuguese in the 15th century', ' and the British established a crown colony', ' Gold Coast', ' in 1874. MacLean', ' Iain. Rational Choice and British Politics: An Analysis of Rhetoric and Ma




291


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Does the Gray Wolf share an ancestry with the domestic dog?
  - actual output: Yes, the provided text explicitly states that the gray wolf shares a common ancestry with the domestic dog.

  - expected output: Yes
  - context: []
  - retrieval context: ["['The gray wolf (Canis lupus)", ' also known as the timber wolf or', ' simply', ' wolf', " is a mammal of the order Carnivora. The gray wolf is the largest member of the family Canidae and also the most well known of wolves. Its shoulder height ranges from 0.6 to 0.9 meters (26â\\x80\\x9336 inches) and its weight typically varies between 32 and 62 kilograms (70â\\x80\\x93135 pounds). As evidenced by studies of DNA sequencing and genetic drift the gray wolf shares a common ancestry with the domestic dog (Canis lupus familiaris).'", " 'Current theories propose 




292


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 0.8333333333333333, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Do wolf pups tend to have darker fur than adults?
  - actual output: Yes, the text says wolf pups tend to have darker fur at birth than they will as adults.

  - expected output: Yes
  - context: []
  - retrieval context: ["['At birth", ' wolf pups tend to have darker fur and blue irises that will change to a yellow-gold or orange color when the pups are between 8 and 16 weeks old.    Though extremely unusual', " it is possible for an adult wolf to retain its blue-colored irises.'", ' "Fur coloration varies greatly', ' running from gray to gray-brown', ' all the way through the canine spectrum of white', ' red', ' brown', ' and black. These colors tend to mix in many populations to form predominantly blended individuals', " though it is certainly not uncommon for an individual or an entire pop




293


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Do wolves leave their pack?
  - actual output: Yes, the text indicates that young wolves, typically upon reaching sexual maturity, leave their birth packs to find mates and establish their own territories.  This is referred to as dispersal.

  - expected output: Yes
  - context: []
  - retrieval context: ['["The size of the pack may change over time and is controlled by several factors', ' including habitat', ' personalities of individual wolves within a pack', ' and food supply. Packs can contain between 2 and 20 wolves', ' though 8 is a more typical size.    New packs are formed when a wolf leaves its birth pack', ' finds a mate', " and claims a territory. Lone wolves searching for other individuals can travel very long distances seeking out suitable territories. Dispersing individuals must avoid the terri




294


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What kinds of coats do wolves have?
  - actual output: Wolves have double coats.  The outer layer consists of tough guard hairs that repel water and dirt, while the inner layer is a dense, water-resistant undercoat that provides insulation.  They also have distinct winter and summer coats.

  - expected output: bulky coats with two layers
  - context: []
  - retrieval context: ['["Wolves have bulky coats consisting of two layers. The first layer is made up of tough guard hairs designed to repel water and dirt. The second is a dense', " water-resistant undercoat that insulates. The undercoat is shed in the form of large tufts of fur in late spring or early summer (with yearly variations). A wolf will often rub against objects such as rocks and branches to encourage the loose fur to fall out. The undercoat is 




295


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Where do wolves have scent glands?
  - actual output: Wolves have scent glands all over their bodies, including at the base of the tail, between their toes, and in their eyes, genitalia, and skin.

  - expected output: Helping the woldf to effectively navigate over large expanses while concurrently keeping others informed of its whereabouts.
  - context: []


Overall Metric Pass Rates

Contextual Precision: 100.00% pass rate







296


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: How much do wolves weigh?
  - actual output: Wolf weight varies considerably depending on location and sex.  The text gives a range of 32 to 62 kilograms (70 to 135 pounds), with some exceptionally large specimens exceeding 77 kg (170 lb).  Females generally weigh about 20% less than males.

  - expected output: typically varies between 32 and 62 kilograms
  - context: []
  - retrieval context: ['["Wolf weight and size can vary greatly worldwide', " tending to increase proportionally with latitude as predicted by Bergmann's Rule. In general", ' height varies from 0.6 to 0.95 meters (26 38 inches) at the shoulder and weight ranges from 32 to 62 kilograms (70 135 pounds)', ' which together make the gray wolf the largest of all wild canids.    Although rarely encountered', ' extreme specimens of more than 77 kg




297


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.02s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Why do wolves howl?
  - actual output: Wolves howl for a variety of reasons, including:

* **Keeping in touch:**  Maintaining contact with pack members over distances or in dense environments.
* **Calling pack members:** Summoning others to a specific location.
* **Communal bonding:** Strengthening social bonds within the pack.
* **Confusion of numbers:**  Creating uncertainty for rival packs about the size of the howling pack.


  - expected output: Howling helps pack members keep in touch, allowing them to communicate effectively in thickly forested areas or over great distances. Howling also helps to call pack members to a specific location. Howling can also serve as a declaration of territory, as shown in a dominant wolf&apos;s tendency to respond to a human imitation of a "rival" wolf in an area the wol




298


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:20, 20.01s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: What is surplus killing?
  - actual output: Surplus killing is the act of a predator killing more prey animals than it can consume at one time.  It's often attributed to an overstimulation of the predator's killing instinct when presented with an abundance of easily captured prey.

  - expected output: Surplus killing is defined as the killing of several prey animals too numerous to eat at one sitting.
  - context: []
  - retrieval context: ['["Surplus killing is defined as the killing of several prey animals too numerous to eat at one sitting. During a surplus-kill', " a predator's killing instinct is continually sparked off by the stimuli of so many prey animals unable to escape", " so that the predator cannot stop killing. An instance of surplus killing by wolves was witnessed in Canada's Northwest Territ




299


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |███████████████████████████████|100% (1/1) [Time Taken: 00:24, 24.70s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: None, error: None)

For test case:

  - input: Are Gray Wolves native to North America?
  - actual output: While Gray Wolves are currently found in North America, the text indicates they migrated there from Eurasia, meaning they are not *native* in the sense of having originated there.

  - expected output: Yes
  - context: []
  - retrieval context: ["['Current theories propose that the gray wolf first evolved in Eurasia during the early Pleistocene. The rate of changes observed in DNA sequence date the South-East Asiatic lineage to about 800", '000 years', ' as opposed to the American and European lineages which stretch back only 150', '000.    The gray wolf then migrated into North America from the Old World', ' probably via the Bering land bridge (that once joined Alaska and Siberia)', ' around 400', '000 years ago.  The gray wolf then coexisted with 




In [51]:
len(all_1results)

300

In [52]:
# Quick print for results after evaluation of single test cases, as needed
# Get scores to calculate average score
scores = []
for i in [all_1results]:
    for j in i:
        if type(j) == TestResult:
            scores.append(j.metrics_data[0].score)
            #print_test_result(j)
        else:
            print_test_result(j[0])

In [53]:
# Calculate the average for the metric (with 918 examples total)
scoredata = pd.DataFrame(scores, index=None)
scoredata.mean()

0    0.823368
dtype: float64

In [54]:
len(scoredata)

300

In [45]:
len(all_1results)

333

In [46]:
# to copy contents of all_results to the clipboard as needed
pd.DataFrame(all_1results).to_clipboard(index=False,header=False)
# all_results

In [47]:
# Save the scores
scoredata.to_csv("results/deepeval_updatedlib_contextualprecision_gemini_15_rag_mini_wikipedia_0_300.csv", index=False)

In [1]:
import os
from IPython.display import display, Markdown
import pandas as pd

In [39]:
# Calculating deepeval gemini 2.0 faithfulness
data1 = pd.read_csv("results/deepeval_faithfulness_gemini_2_rag_mini_wikipedia_0_121.csv", index_col=None)
data2 = pd.read_csv("results/deepeval_faithfulness_gemini_2_rag_mini_wikipedia_121_596.csv", index_col=None)
data3 = pd.read_csv("results/deepeval_faithfulness_gemini_2_rag_mini_wikipedia_596_903.csv", index_col=None)

In [40]:
alldata2 = pd.concat([data1, data2, data3], ignore_index=True)

In [41]:
alldata2

Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
898,1.0
899,1.0
900,1.0
901,1.0


In [43]:
alldata2.mean()

0    0.967882
dtype: float64

In [44]:
alldata2.to_csv("results/deepeval_faithfulness_gemini_2_rag_mini_wikipedia_903.csv", index=False)

In [15]:
refinedscoredata = scoredata[:903]

In [18]:
# Calculate the average for the metric
refinedscoredata.mean()

0    0.792454
dtype: float64

In [17]:
# Save the refined scores
refinedscoredata.to_csv("results/deepeval_contextprecision_rag_mini_wikipedia_903.csv", index=False)

In [26]:
# DeepEval also has the RAGAS metrics available for evaluation

# Unforuntately the RAGAS metrics in DeepEval only accept langChain chat models, so the Gemini DeepEvalBaseLLM class will not work with these metrics
# Need to use our LangChain LLM created earlier:  
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
# doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") 

ragasmetric = RagasMetric(model=llm, embeddings=doc_embeddings)
ragas_ar = RAGASAnswerRelevancyMetric(model=llm, embeddings=doc_embeddings)
ragas_f = RAGASFaithfulnessMetric(model=llm)
ragas_crecall = RAGASContextualRecallMetric(model=llm)
ragas_cp = RAGASContextualPrecisionMetric(model=llm)
ragas_crel = RAGASContextualRelevancyMetric(model=llm) # Note: This metric did not work in testing; returned errors related to the model

In [None]:
# Example of evaluation for RagasMetric, an average of RAGAS's Answer Relevancy, Faithfulness, Contextual Recall, and Contextual Precision metrics
eval_ragas = evaluate(test_cases=[evaldataset.test_cases[0]], metrics=[ragasmetric], throttle_value=90)

In [None]:
# Example of evaluation for each ragas metric individually
eval_ragas_f = evaluate(test_cases=evaldataset.test_cases, metrics=[ragas_f], throttle_value=90)