# Notebook to examine rag evaluation results

In [1]:
# Imports
import sys
import pandas as pd
import numpy as np
sys.path.append("../dev/")
sys.path.append("../src/")
import time

from csv_helpers import get_csv_files_from_dir

In [33]:
# Get eval results file names
eval_results_dir = "../parallel_100_rows_eval/miniWiki"
eval_results_file_names = get_csv_files_from_dir(eval_results_dir)

# Loop over all eval results files
# Get column names which are the config paramters and the mean correctness or some other metric
columns = [
    "queExp",
    "rerank",
    "cExp",
    "backRev",
    "numRefLim",
    "metric",
    "evaluator",
] + ["MeanCorrectness"]
results_df = pd.DataFrame(columns=columns)

for filename in eval_results_file_names[:]:  # Iterate over all eval results files
    # Filter files: Only look at files with quExp1_rerank1_cExp*_backRevFalse_numRef4
    # first_file = f"{eval_results_dir}/{eval_results_file_names[1]}"  # Slice for dev
    file = eval_results_dir + "/" + filename  # Slice for dev
    # Read eval results from CSV
    eval_results_df = pd.read_csv(file)
    correctness_values = eval_results_df["Correct"].dropna().tolist()
    #print(f"Correctness values: {correctness_values}")
    mean = sum(correctness_values) / len(correctness_values)
    # Project the values to be between 0 and 1
    mean_scaled = (mean - min(correctness_values)) / (max(correctness_values) - min(correctness_values))

    #print(f"Mean correctness: {mean}")
    # Append the correctness values to the list
    config = filename.split("_")
    #print(f" Config: {config}")
    results_df = pd.concat(
        [
            results_df,
            pd.DataFrame(
                [
                    {
                        "queExp": config[0],
                        "rerank": config[1],
                        "cExp": config[2],
                        "backRev": config[3],
                        "numRefLim": config[4],
                        "metric": config[5],
                        "evaluator": config[6],
                        "MeanCorrectness": mean_scaled,
                    }
                ]
            ),
        ]
    )

results_df[results_df["evaluator"] == "ROUGE-2.csv"].sort_values(by="MeanCorrectness", ascending=False)


  results_df = pd.concat(


Unnamed: 0,queExp,rerank,cExp,backRev,numRefLim,metric,evaluator,MeanCorrectness
0,quExp1,rerankTrue,cExpTrue,backRevFalse,numRefLim4,all,ROUGE-2.csv,0.88958
0,quExp1,rerankTrue,cExpTrue,backRevFalse,numRefLim2,all,ROUGE-2.csv,0.88695
0,quExp1,rerankTrue,cExpTrue,backRevFalse,numRefLim6,all,ROUGE-2.csv,0.88625
0,quExp1,rerankFalse,cExpTrue,backRevFalse,numRefLim4,all,ROUGE-2.csv,0.87381
0,quExp1,rerankTrue,cExpTrue,backRevTrue,numRefLim2,all,ROUGE-2.csv,0.87175
0,quExp1,rerankFalse,cExpTrue,backRevFalse,numRefLim6,all,ROUGE-2.csv,0.86965
0,quExp1,rerankFalse,cExpTrue,backRevTrue,numRefLim6,all,ROUGE-2.csv,0.86899
0,quExp1,rerankTrue,cExpTrue,backRevTrue,numRefLim4,all,ROUGE-2.csv,0.86874
0,quExp1,rerankFalse,cExpTrue,backRevFalse,numRefLim2,all,ROUGE-2.csv,0.86795
0,quExp1,rerankFalse,cExpTrue,backRevTrue,numRefLim2,all,ROUGE-2.csv,0.86589


In [14]:
# Look at results from two 2 files
# Get eval results file names
eval_results_dir = "../parallel_100_rows_eval/miniWiki"
eval_results_file_names = get_csv_files_from_dir(eval_results_dir)

simple_config_file = eval_results_dir + "/" + eval_results_file_names[1]
complex_config_file = eval_results_dir + "/" + eval_results_file_names[29]
print(f" simple config: {simple_config_file}")
print(f" complex config: {complex_config_file}")

simple_config_df = pd.read_csv(simple_config_file)
simple_config_df.head(10)

#complex_config_df = pd.read_csv(complex_config_file)
#complex_config_df.head(10)

 simple config: ../parallel_100_rows_eval/miniWiki/quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim2_all_llm_judge.csv
 complex config: ../parallel_100_rows_eval/miniWiki/quExp1_rerankTrue_cExpFalse_backRevFalse_numRefLim6_all_llm_judge.csv


Unnamed: 0,question,answer,contexts,contexts_ids,ground_truth,goldPassages,Correct,CR,Faithfulness,AR
0,Was Abraham Lincoln the sixteenth President of...,Yes.,"Context: Abraham Lincoln (February 12, 1809 â...","278, 319",yes,,5.0,[5. 5.],[2. 3.],5.0
1,Did Lincoln sign the National Banking Act of 1...,No,Context: . Other important legislation involv...,"360, 1797",yes,,1.0,[4. 5.],[5. 5.],5.0
2,Did his mother die of pneumonia?,No,Context: Alice Hathaway Lee Roosevelt (July 2...,"2894, 262",no,,5.0,[5. 2.],[5. 5.],5.0
3,How many long was Lincoln's formal education?,About 18 months of schooling.,Context: Lincoln's formal education consisted...,"287, 376",18 months,,5.0,[5. 5.],[5. 3.],5.0
4,When did Lincoln begin his political career?,"Lincoln began his political career in 1832, at...","Context: . During this time, he made many tri...","302, 289",1832,,5.0,[4. 5.],[3. 5.],5.0
5,What did The Legal Tender Act of 1862 establish?,"The United States Note, the first paper curren...",Context: The Legal Tender Act of 1862 establi...,"361, 330","the United States Note, the first paper curren...",,5.0,[5. 5.],[5. 5.],5.0
6,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,Context: While Lincoln is usually portrayed b...,"382, 380",11-year-old Grace Bedell,,5.0,[5. 3.],[5. 2.],5.0
7,When did the Gettysburg address argue that Ame...,The Gettysburg Address argued that America was...,Context: In his Gettysburg Address Lincoln re...,"358, 379",1776,,5.0,[5. 5.],[5. 5.],5.0
8,Did Lincoln beat John C. Breckinridge in the 1...,"Yes, Lincoln beat John C. Breckinridge of the ...","Context: On November 6, 1860, Lincoln was ele...","319, 202",yes,,5.0,[5. 5.],[5. 4.],5.0
9,Was Abraham Lincoln the first President of the...,No,"Context: Abraham Lincoln (February 12, 1809 â...","278, 297",No,,5.0,[3. 5.],[3. 5.],2.0


In [4]:
# Look at filenames 
# Get eval results file names
eval_results_dir = "../parallel_100_rows_eval/miniWiki"
eval_results_file_names = get_csv_files_from_dir(eval_results_dir)

# Loop over all eval results files and output the filenames with an index in the dir
for i, filename in enumerate(eval_results_file_names[:]):  # Iterate over all eval results files
    print(f"{i}: {filename}")

0: quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim2_all_ROUGE-2.csv
1: quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim2_all_llm_judge.csv
2: quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim4_all_ROUGE-2.csv
3: quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim4_all_llm_judge.csv
4: quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim6_all_ROUGE-2.csv
5: quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim6_all_llm_judge.csv
6: quExp1_rerankFalse_cExpFalse_backRevTrue_numRefLim2_all_ROUGE-2.csv
7: quExp1_rerankFalse_cExpFalse_backRevTrue_numRefLim2_all_llm_judge.csv
8: quExp1_rerankFalse_cExpFalse_backRevTrue_numRefLim4_all_ROUGE-2.csv
9: quExp1_rerankFalse_cExpFalse_backRevTrue_numRefLim4_all_llm_judge.csv
10: quExp1_rerankFalse_cExpFalse_backRevTrue_numRefLim6_all_ROUGE-2.csv
11: quExp1_rerankFalse_cExpFalse_backRevTrue_numRefLim6_all_llm_judge.csv
12: quExp1_rerankFalse_cExpTrue_backRevFalse_numRefLim2_all_ROUGE-2.csv
13: quExp1_rerankFalse_cExpTrue_backRevFalse_numRefLim2_

In [32]:
# Get eval results file names
eval_results_dir = "../parallel_100_rows_eval/miniWiki"
eval_results_file_names = get_csv_files_from_dir(eval_results_dir)

results_df = pd.DataFrame(columns=columns)
# Loop over all eval results files and output the filenames with an index in the dir

for filename in eval_results_file_names[0:1]:  # Iterate over all eval results files
    # Filter files: Only look at files with quExp1_rerank1_cExp*_backRevFalse_numRef4
    # first_file = f"{eval_results_dir}/{eval_results_file_names[1]}"  # Slice for dev
    file = eval_results_dir + "/" + filename  # Slice for dev
    print(f"File: {file}")
    # Read eval results from CSV
    eval_results_df = pd.read_csv(file)
    # Get first 10 rows 
    # eval_results_df = eval_results_df.head(10)
    # print(f"Eval results: {eval_results_df.head()}")

    # Sum the CR, Faithfulness and AR values and put in a new colum called triad sum
    eval_results_df["CR"] = eval_results_df["CR"].apply(lambda x: sum(map(float, x.strip('[]').split())) / len(x.strip('[]').split()) if pd.notna(x) else np.nan)
    eval_results_df["Faithfulness"] = eval_results_df["Faithfulness"].apply(lambda x: sum(map(float, x.strip('[]').split())) / len(x.strip('[]').split()) if pd.notna(x) else np.nan)
    eval_results_df["TriadSum"] = (eval_results_df["CR"] + eval_results_df["Faithfulness"] + eval_results_df["AR"]) / 3

    # Take the first element of context relevance, faithfulness and answer relevance
    # eval_results_df["CR"] = eval_results_df["CR"].apply(lambda x: float(x.strip('[]').split()[0]))
    # eval_results_df["Faithfulness"] = eval_results_df["Faithfulness"].apply(lambda x: float(x.strip('[]').split()[0]))
    # eval_results_df["TriadSum"] = ( eval_results_df["CR"]* 2 + eval_results_df["Faithfulness"]* 2 + eval_results_df["AR"] ) / 3
    
    
eval_results_df[["answer",'ground_truth',"TriadSum","CR","Faithfulness","AR", "Correct"]]
#eval_results_df[["TriadSum","CR","Faithfulness","AR", "Correct"]].corr()
    # Examine correlation between TriadSum and Correct


File: ../parallel_100_rows_eval/miniWiki/quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim2_all_ROUGE-2.csv


Unnamed: 0,answer,ground_truth,TriadSum,CR,Faithfulness,AR,Correct
0,Yes.,yes,0.3,0.9,0.0,0.0,1.0
1,No,yes,0.0555,0.1665,0.0,0.0,0.0
2,No,no,0.111,0.333,0.0,0.0,1.0
3,About 18 months of schooling.,18 months,0.198667,0.5,0.096,0.0,1.0
4,"Lincoln began his political career in 1832, at...",1832,0.333167,0.2855,0.143,0.571,1.0
5,"The United States Note, the first paper curren...","the United States Note, the first paper curren...",0.208833,0.3335,0.182,0.111,1.0
6,11-year-old Grace Bedell,11-year-old Grace Bedell,0.121167,0.25,0.1135,0.0,1.0
7,The Gettysburg Address argued that America was...,1776,0.384333,0.3,0.153,0.7,1.0
8,"Yes, Lincoln beat John C. Breckinridge of the ...",yes,0.529667,0.35,0.339,0.9,1.0
9,No,No,0.2,0.6,0.0,0.0,1.0


## Evaluate gold passages if given

In [6]:
# Get pipe results file names
pipe_results_dir = "../parallel_100_rows_pipe/miniBiosQA"
pipe_results_file_names = get_csv_files_from_dir(pipe_results_dir)

# Define eval params
method = "all"
evaluator = "sem_similarity"

# Column names with the config params and the matches and sum of matches
columns = [
    "queExp",
    "rerank",
    "cExp",
    "backRev",
    "numRefLim",
] + ["matches", "sum_matches"]
# Define dataframe to hold the parameter confiurations and the gold passage matches
context_id_matches = pd.DataFrame(columns=columns)
# Time the evaluation
start = time.time()
# Loop over all pipe results files to conduct evaluation
for pipe_results_file_name in pipe_results_file_names[:]:  # Slice for dev
    # Only look at files with quExp1_rerank1_cExpFalse_backRevFalse_numRef
    pipe_results_file = f"{pipe_results_dir}/{pipe_results_file_name}"
    print(pipe_results_file)
    # Get param settings from the file name
    param_settings = pipe_results_file_name.split("_")

    # Test print results
    # for elem in pipe_results:
    #    pprint(elem)

    # Evaluate pipe results
    slice_for_dev = 5  # Slice for dev
    df = pd.read_csv(pipe_results_file)

    # Keep only the 'contexts_ids' and 'goldPassages' columns
    # Calcualte matches between the ids in the two columns
    df["contexts_ids"] = df["contexts_ids"].apply(
        lambda row: list(map(int, row.split(", "))) if row else []
    )
    print(df["goldPassages"])
    df["goldPassages"] = df["goldPassages"].apply(
        lambda row: list(map(int, row.split(", "))) if row else []
    )
    # Calculate matches between the ids in the two columns
    matches = df.apply(
        lambda row: len(set(row["contexts_ids"]).intersection(row["goldPassages"])),
        axis=1,
    )
    # Add the matches to the DataFrame
    df = df[["contexts_ids", "goldPassages"]]
    # df["matches"] = matches

    # Write the eval results to a csv file
    # eval_results_dir = "./parallel_100_rows_eval"
    # Add param settings and matches to the DataFrame
    # print(f"Param settings: {param_settings}")
    # print(f"Matches \n: {matches.array}")
    # Add the param settings and matches row per row to the DataFrame
    context_id_matches = pd.concat(
        [
            context_id_matches,
            pd.DataFrame(
                [
                    {
                        "queExp": param_settings[0],
                        "rerank": param_settings[1],
                        "cExp": param_settings[2],
                        "backRev": param_settings[3],
                        "numRefLim": param_settings[4],
                        "matches": matches.array,
                        "sum_matches": matches.sum(),
                    }
                ]
            ),
        ]
    )


end = time.time()
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
print("Context matches: \n")
context_id_matches


../parallel_100_rows_pipe/miniBiosQA/quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim2_.csv
0                          23179372, 19270706, 23184418
1     26997282, 21589869, 19822671, 29867216, 153618...
2     20215713, 17851009, 22882019, 12527940, 243483...
3                                              26530723
4     23074401, 17039265, 18922117, 17463177, 163615...
                            ...                        
95                                             32868918
96    8864644, 8761317, 3384188, 22402252, 3850773, ...
97                                             21456524
98    29209056, 26240385, 24824069, 27773677, 285754...
99                         33057672, 30088449, 30892111
Name: goldPassages, Length: 100, dtype: object
../parallel_100_rows_pipe/miniBiosQA/quExp1_rerankFalse_cExpFalse_backRevFalse_numRefLim4_.csv
0                          23179372, 19270706, 23184418
1     26997282, 21589869, 19822671, 29867216, 153618...
2     20215713, 17851009, 22882019,

Unnamed: 0,queExp,rerank,cExp,backRev,numRefLim,matches,sum_matches
0,quExp1,rerankFalse,cExpFalse,backRevFalse,numRefLim2,"[2, 1, 2, 1, 1, 1, 2, 2, 2, 0, 0, 1, 2, 2, 1, ...",125
0,quExp1,rerankFalse,cExpFalse,backRevFalse,numRefLim4,"[2, 1, 4, 1, 1, 2, 4, 4, 3, 0, 0, 2, 3, 2, 1, ...",203
0,quExp1,rerankFalse,cExpFalse,backRevFalse,numRefLim6,"[3, 1, 5, 1, 1, 4, 6, 5, 4, 0, 0, 2, 4, 2, 1, ...",266
0,quExp1,rerankFalse,cExpFalse,backRevTrue,numRefLim2,"[2, 1, 2, 1, 1, 1, 2, 2, 2, 0, 0, 1, 2, 2, 1, ...",125
0,quExp1,rerankFalse,cExpFalse,backRevTrue,numRefLim4,"[2, 1, 4, 1, 1, 2, 4, 4, 3, 0, 0, 2, 3, 2, 1, ...",203
0,quExp1,rerankFalse,cExpFalse,backRevTrue,numRefLim6,"[3, 1, 5, 1, 1, 4, 6, 5, 4, 0, 0, 2, 4, 2, 1, ...",266
0,quExp1,rerankFalse,cExpTrue,backRevFalse,numRefLim2,"[2, 1, 2, 1, 1, 1, 2, 2, 2, 0, 0, 1, 2, 2, 1, ...",125
0,quExp1,rerankFalse,cExpTrue,backRevFalse,numRefLim4,"[2, 1, 4, 1, 1, 2, 4, 4, 3, 0, 0, 2, 3, 2, 1, ...",203
0,quExp1,rerankFalse,cExpTrue,backRevFalse,numRefLim6,"[3, 1, 5, 1, 1, 4, 6, 5, 4, 0, 0, 2, 4, 2, 1, ...",266
0,quExp1,rerankFalse,cExpTrue,backRevTrue,numRefLim2,"[2, 1, 2, 1, 1, 1, 2, 2, 2, 0, 0, 1, 2, 2, 1, ...",125


In [7]:
# Test ROUGE scores
import sys


from pprint import pprint

from pipe import RagPipe
from vector_store import VectorStore
from dataset_helpers import DatasetHelpers

# Define API ENDPOINTS
LLM_URL = "http://10.103.251.104:8040/v1"
LLM_NAME = "llama3.1:latest"
LLM_70B_NAME = "llama3.1:70b"
MARQO_URL = "http://10.103.251.104:8882"
MARQO_URL_GPU = "http://10.103.251.104:8880"

##
## Load the VectorStore
##

documentDB = VectorStore(MARQO_URL_GPU)  # Connect to marqo client via python API
print(documentDB.getIndexes())  # Print all indexes
documentDB.connectIndex("miniwiki-gpu")  # Connect to the minibio
stats = documentDB.getIndexStats()
print(stats)

##
## Load Dataset
##
# Load QM queries
datasetHelpers = DatasetHelpers()
corpus_list, queries, ground_truths, goldPassages = (
    datasetHelpers.loadMiniWiki()
)  # Mini Bios


##
## Load the RagPipe
##

pipe = RagPipe()
pipe.connectVectorStore(documentDB)
pipe.connectLLM(LLM_URL, LLM_NAME)


##
## Set parameters for the pipeline
##


pipe.setConfigs(
    lang="EN",
    query_expansion=1,
    rerank=False,
    prepost_context=False,
    background_reversed=False,
    search_ref_lex=2,
    search_ref_sem=2,
    num_ref_lim=4,
    model_temp=0.1,
    answer_token_num=50,
)


# Run pipeline
pipe.run(questions=queries[:5], ground_truths=ground_truths, goldPassagesIds=goldPassages)
print(pipe.rag_elements)

[{'indexName': 'balkan-images'}, {'indexName': 'demo'}, {'indexName': 'balkan'}, {'indexName': 'minibios-qa-gpu'}, {'indexName': 'miniwiki-gpu'}]
Index connected: miniwiki-gpu 
{'numberOfDocuments': 7432, 'numberOfVectors': 9284, 'backend': {'memoryUsedPercentage': 0.56995741283, 'storageUsedPercentage': 70.26710776112999}}
Loading MiniWiki dataset
 Language model URL: http://10.103.251.104:8040/v1
 Language model connected: llama3.1:latest
Using already indexed documents.
 You are using index: miniwiki-gpu
Index Stats:  {'numberOfDocuments': 7432, 'numberOfVectors': 9284, 'backend': {'memoryUsedPercentage': 0.56995741283, 'storageUsedPercentage': 70.26710776112999}}


100%|██████████| 5/5 [00:02<00:00,  2.07it/s]

[{'question': 'Was Abraham Lincoln the sixteenth President of the United States?', 'answer': 'Yes', 'contexts': [' Context: Abraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination ', ' Context: On November 6, 1860, Lincoln was elected as the 16th President of the United States, beating Democrat Stephen A. Douglas, John C. Breckinridge of the Southern Democrats, and John Bell of the new Constitutional Union Party '], 'contexts_ids': [278, 319], 'ground_truth': 'yes', 'goldPassages': None}, {'question': 'Did Lincoln sign the National Banking Act of 1863?', 'answer': 'No', 'contexts': [' Context: . Other important legislation involved economic matters, including the first income tax and higher tariffs ', ' Context: . Congress with Grant in mind on March 2, 1864. On March 12, Grant became general-in-chief of all the armies of the United States. '], 'contexts_ids': [360, 1797], 'ground_




In [26]:
# Get answer and ground_truth of rag elements
from evaluate import ROUGE
answer = pipe.rag_elements[0]["answer"]
ground_truth = pipe.rag_elements[0]["ground_truth"]
for element  in pipe.rag_elements:
    answer = element["answer"]
    ground_truth = element["ground_truth"]
    print(f"Answer: {answer}")
    print(f"Ground truth: {ground_truth}")
    rouge = ROUGE(answer, ground_truth)
    pprint(rouge)
    print("\n")


Answer: Yes
Ground truth: yes
Reference: ['yes']
Candidate: ['yes']
1.0


Answer: No
Ground truth: yes
Reference: ['yes']
Candidate: ['no']
0.0


Answer: No
Ground truth: no
Reference: ['no']
Candidate: ['no']
1.0


Answer: About 18 months of schooling.
Ground truth: 18 months
Reference: ['18', 'months']
Candidate: ['about', '18', 'months', 'of', 'schooling']
1.0


Answer: In 1832, at age 23.
Ground truth: 1832
Reference: ['1832']
Candidate: ['in', '1832', 'at', 'age', '23']
1.0




In [9]:
from evaluate import evaluate
eval_results = evaluate(rag_elements=pipe.rag_elements,evaluator="ROUGE-2",method="all")

100%|██████████| 5/5 [00:00<00:00, 3437.39it/s]


Reference: ['was', 'abraham', 'lincoln', 'the', 'sixteenth', 'president', 'of', 'the', 'united', 'states']
Candidate: ['context', 'abraham', 'lincoln', 'february', '12', '1809', 'â', 'april', '15', '1865', 'was', 'the', 'sixteenth', 'president', 'of', 'the', 'united', 'states', 'serving', 'from', 'march', '4', '1861', 'until', 'his', 'assassination']
Reference: ['was', 'abraham', 'lincoln', 'the', 'sixteenth', 'president', 'of', 'the', 'united', 'states']
Candidate: ['context', 'on', 'november', '6', '1860', 'lincoln', 'was', 'elected', 'as', 'the', '16th', 'president', 'of', 'the', 'united', 'states', 'beating', 'democrat', 'stephen', 'a', 'douglas', 'john', 'c', 'breckinridge', 'of', 'the', 'southern', 'democrats', 'and', 'john', 'bell', 'of', 'the', 'new', 'constitutional', 'union', 'party']
Reference: ['did', 'lincoln', 'sign', 'the', 'national', 'banking', 'act', 'of', '1863']
Candidate: ['context', 'other', 'important', 'legislation', 'involved', 'economic', 'matters', 'including

100%|██████████| 5/5 [00:00<00:00, 3507.53it/s]


Reference: ['context', 'abraham', 'lincoln', 'february', '12', '1809', 'â', 'april', '15', '1865', 'was', 'the', 'sixteenth', 'president', 'of', 'the', 'united', 'states', 'serving', 'from', 'march', '4', '1861', 'until', 'his', 'assassination']
Candidate: ['yes']
Reference: ['context', 'on', 'november', '6', '1860', 'lincoln', 'was', 'elected', 'as', 'the', '16th', 'president', 'of', 'the', 'united', 'states', 'beating', 'democrat', 'stephen', 'a', 'douglas', 'john', 'c', 'breckinridge', 'of', 'the', 'southern', 'democrats', 'and', 'john', 'bell', 'of', 'the', 'new', 'constitutional', 'union', 'party']
Candidate: ['yes']
Reference: ['context', 'other', 'important', 'legislation', 'involved', 'economic', 'matters', 'including', 'the', 'first', 'income', 'tax', 'and', 'higher', 'tariffs']
Candidate: ['no']
Reference: ['context', 'congress', 'with', 'grant', 'in', 'mind', 'on', 'march', '2', '1864', 'on', 'march', '12', 'grant', 'became', 'general', 'in', 'chief', 'of', 'all', 'the', 'ar

100%|██████████| 5/5 [00:00<00:00, 9929.70it/s]


Reference: ['was', 'abraham', 'lincoln', 'the', 'sixteenth', 'president', 'of', 'the', 'united', 'states']
Candidate: ['yes']
Reference: ['did', 'lincoln', 'sign', 'the', 'national', 'banking', 'act', 'of', '1863']
Candidate: ['no']
Reference: ['did', 'his', 'mother', 'die', 'of', 'pneumonia']
Candidate: ['no']
Reference: ['how', 'many', 'long', 'was', 'lincoln', 's', 'formal', 'education']
Candidate: ['about', '18', 'months', 'of', 'schooling']
Reference: ['when', 'did', 'lincoln', 'begin', 'his', 'political', 'career']
Candidate: ['in', '1832', 'at', 'age', '23']


100%|██████████| 5/5 [00:00<00:00, 5344.42it/s]

Reference: ['yes']
Candidate: ['yes']
Reference: ['yes']
Candidate: ['no']
Reference: ['no']
Candidate: ['no']
Reference: ['18', 'months']
Candidate: ['about', '18', 'months', 'of', 'schooling']
Reference: ['1832']
Candidate: ['in', '1832', 'at', 'age', '23']
 Scores: [1. 0. 1. 1. 1.]





In [10]:
eval_results

{'context_relevance': array([[1.   , 0.8  ,   nan],
        [0.111, 0.222,   nan],
        [0.333, 0.333,   nan],
        [0.5  , 0.5  ,   nan],
        [0.   , 0.571, 0.286]]),
 'faithfulness': array([[0.   , 0.   ,   nan],
        [0.   , 0.   ,   nan],
        [0.   , 0.   ,   nan],
        [0.185, 0.   ,   nan],
        [0.   , 0.139, 0.026]]),
 'answer_relevance': array([0., 0., 0., 0., 0.]),
 'correctness': array([1., 0., 1., 1., 1.])}