In [1]:
import os
import pickle

import faiss
import numpy as np
import openai
from datasets import load_dataset
from openai import OpenAI

from eval import evaluate_single_input

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize DeepInfra OpenAI Client
# The API key is read from the environment so that no secret is stored in the
# notebook. NOTE(review): a literal key used to be committed in this cell --
# treat that key as leaked and rotate it.
_deepinfra_api_key = os.environ.get("DEEPINFRA_API_KEY")
if not _deepinfra_api_key:
    raise RuntimeError(
        "Set the DEEPINFRA_API_KEY environment variable before running this notebook."
    )

# NOTE: this rebinds the name `openai` from the imported module to a client
# instance; the cells below call `openai.embeddings` / `openai.chat` on it.
openai = OpenAI(
    api_key=_deepinfra_api_key,
    base_url="https://api.deepinfra.com/v1/openai",
)

In [3]:
def load_faiss_index(index_path, metadata_path):
    """
    Read a FAISS index and its pickled metadata from local storage.

    Args:
        index_path: Path of the serialized FAISS index file.
        metadata_path: Path of the pickle file holding the metadata.

    Returns:
        A ``(index, metadata)`` tuple.
    """
    faiss_index = faiss.read_index(index_path)
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # metadata files that were produced by a trusted source.
    with open(metadata_path, 'rb') as metadata_file:
        return faiss_index, pickle.load(metadata_file)

In [4]:
def search_faiss(query, top_k, index_path, metadata_path):
    """
    Searches FAISS for the most relevant chunks to the query.

    Args:
        query: Text to embed and look up.
        top_k: Maximum number of neighbours requested from the index.
        index_path: Path to the FAISS index file.
        metadata_path: Path to the pickled metadata file.

    Returns:
        A list of ``(metadata_entry, distance)`` tuples in the order FAISS
        returned them. The list may hold fewer than ``top_k`` entries when
        the index cannot supply that many neighbours.
    """
    # Load index and metadata (both are re-read from disk on every call)
    index, metadata = load_faiss_index(index_path, metadata_path)

    # Generate embedding for the query
    response = openai.embeddings.create(
        model="BAAI/bge-m3",
        input=query,
        encoding_format="float"
    )
    # FAISS works on float32 matrices; a list of Python floats would otherwise
    # become float64, which some FAISS builds reject.
    query_embedding = np.array([response.data[0].embedding], dtype=np.float32)

    # Search FAISS index for the top_k results
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve matching chunks
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        # FAISS pads missing neighbours with index -1; without this guard
        # metadata[-1] would silently return the last entry of a list.
        if idx < 0:
            continue
        results.append((metadata[idx], distance))
    return results

In [5]:
# Fetch the Istanbul QA dataset.
# NOTE(review): trust_remote_code=True allows code shipped with the dataset
# repository to run locally -- confirm the source is trusted.
dataset = load_dataset(path='BlackFear/istanbul-qa-dataset', trust_remote_code=True)

# Only the held-out test split is evaluated.
data = dataset['test']

# Questions and their reference answers, taken from the same split.
queries, references = data['question'], data['reference']


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 2631/2631 [00:00<00:00, 292362.27 examples/s]
Generating test split: 100%|██████████| 300/300 [00:00<00:00, 74956.29 examples/s]


In [6]:
def generate_answer(query, context):
    """
    Generates an answer from the query and the retrieved context using the
    Llama chat model served through the module-level ``openai`` client.

    Args:
        query: The question to answer.
        context: Text of the retrieved chunks, inserted verbatim into the prompt.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    # Prepare the input prompt
    prompt = f"""
    Use the following contexts to answer the question:

    Context:
    {context}

    Question:
    {query}

    Answer:
    """

    # Query the Llama model
    response = openai.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=[
            {"role": "system", "content": "You will be answering questions about Istanbul. Please provide the answer to the following question."},
            {"role": "user", "content": prompt}
        ]
    )
    # The message content can be None; calling .strip() on it directly would
    # raise AttributeError and abort a long evaluation run.
    content = response.choices[0].message.content
    return (content or "").strip()

In [7]:
def rag_pipeline(query, top_k, index_path, metadata_path):
    """
    Run retrieval followed by generation for a single query.

    Args:
        query: The question to answer.
        top_k: Number of chunks to request from the retriever.
        index_path: Path to the FAISS index file.
        metadata_path: Path to the pickled metadata file.

    Returns:
        ``(answer, results)`` where ``results`` is exactly what
        ``search_faiss`` returned.
    """
    # Retrieval: fetch the chunks used as grounding context.
    retrieved = search_faiss(query, top_k, index_path, metadata_path)

    # Label each chunk with its 1-based rank and separate them by blank lines.
    labelled_chunks = []
    for position, hit in enumerate(retrieved, start=1):
        labelled_chunks.append(f"Chunk {position}: {hit[0]}")
    context = "\n\n".join(labelled_chunks)

    # Generation: answer the query with the assembled context.
    return generate_answer(query, context), retrieved

In [8]:
from tqdm import tqdm

# Retrieval settings for this evaluation run.
TOP_K = 3
INDEX_PATH = 'faiss_index_semantic.index'
METADATA_PATH = 'metadata_semantic.pkl'

# One score dict per test question, in dataset order.
scores = []
for i, query in enumerate(tqdm(queries, total=len(queries))):
    reference = references[i]
    answer, retrieved = rag_pipeline(
        query,
        top_k=TOP_K,
        index_path=INDEX_PATH,
        metadata_path=METADATA_PATH,
    )
    # The evaluator receives only the first element of each retrieved hit;
    # the distances are dropped.
    results = [hit[0] for hit in retrieved]
    score = evaluate_single_input(query, reference, answer, results)
    print(score)
    scores.append(score)

# 21m

  0%|          | 1/300 [00:06<33:55,  6.81s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.17464747586953588, 'Overall Score': 0.8300000000000001}


  1%|          | 2/300 [00:09<21:05,  4.25s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.7}


  1%|          | 3/300 [00:14<22:24,  4.53s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.3498866783250506, 'Overall Score': 0.5}


  1%|▏         | 4/300 [00:16<18:00,  3.65s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.24028114141347542, 'Overall Score': 0.85}


  2%|▏         | 5/300 [00:20<17:55,  3.65s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.9876, 'BLEU Score': 0.2714872600627347, 'Overall Score': 0.99504}


  2%|▏         | 6/300 [00:23<17:24,  3.55s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.9876, 'BLEU Score': 0.6054657750562689, 'Overall Score': 0.99132}


  2%|▏         | 7/300 [00:32<26:33,  5.44s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.001949174943170598, 'Overall Score': 0.44}


  3%|▎         | 8/300 [00:40<30:08,  6.19s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.02284411468856457, 'Overall Score': 0.9400000000000001}


  3%|▎         | 9/300 [00:46<29:19,  6.05s/it]

{'Faithfulness': 0.987, 'Relevancy': 0.92, 'Factual Correctness': 0.9876, 'BLEU Score': 0.19702452505214854, 'Overall Score': 0.9671400000000001}


  3%|▎         | 10/300 [00:49<24:20,  5.04s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.33649324423301513, 'Overall Score': 1.0}


  4%|▎         | 11/300 [00:51<20:41,  4.29s/it]

{'Faithfulness': 0.8, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.021105340631872645, 'Overall Score': 0.74}


  4%|▍         | 12/300 [00:54<18:29,  3.85s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.22894156860669912, 'Overall Score': 0.8}


  4%|▍         | 13/300 [00:56<16:17,  3.41s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.8}


  5%|▍         | 14/300 [01:02<19:50,  4.16s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 0.0, 'BLEU Score': 0.009607695286097472, 'Overall Score': 0.51}


  5%|▌         | 15/300 [01:12<27:18,  5.75s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.008678355060660479, 'Overall Score': 0.0}


  5%|▌         | 16/300 [01:15<23:52,  5.04s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.535327611100933, 'Overall Score': 0.7925599999999999}


  6%|▌         | 17/300 [01:18<20:51,  4.42s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.01553712569276035, 'Overall Score': 0.6499999999999999}


  6%|▌         | 18/300 [01:21<18:01,  3.83s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1239040201758117, 'Overall Score': 1.0}


  6%|▋         | 19/300 [01:31<27:37,  5.90s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.7, 'Factual Correctness': 0.75, 'BLEU Score': 0.03799941243788066, 'Overall Score': 0.735}


  7%|▋         | 20/300 [01:44<36:29,  7.82s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.025264103178128893, 'Overall Score': 0.5}


  7%|▋         | 21/300 [01:47<30:19,  6.52s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.13410639648320277, 'Overall Score': 0.8300000000000001}


  7%|▋         | 22/300 [01:51<26:15,  5.67s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.5481799462606964, 'Overall Score': 0.9400000000000001}


  8%|▊         | 23/300 [01:53<21:50,  4.73s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1933853138176172, 'Overall Score': 1.0}


  8%|▊         | 24/300 [01:56<18:21,  3.99s/it]

{'Faithfulness': 0.8974, 'Relevancy': 0.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.8091067115702212, 'Overall Score': 0.6692199999999999}


  8%|▊         | 25/300 [01:59<17:11,  3.75s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.016060081131239495, 'Overall Score': 0.7}


  9%|▊         | 26/300 [02:01<15:37,  3.42s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.8, 'BLEU Score': 0.01976560930094397, 'Overall Score': 0.6200000000000001}


  9%|▉         | 27/300 [02:06<17:07,  3.77s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.02898497051727735, 'Overall Score': 0.9400000000000001}


  9%|▉         | 28/300 [02:09<15:45,  3.48s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 0.316227766016838, 'Overall Score': 0.49}


 10%|▉         | 29/300 [02:13<16:21,  3.62s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9, 'Factual Correctness': 0.875, 'BLEU Score': 0.42515035180170696, 'Overall Score': 0.91628}


 10%|█         | 30/300 [02:15<14:48,  3.29s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.3}


 10%|█         | 31/300 [02:20<16:24,  3.66s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.05360359357946223, 'Overall Score': 0.5900000000000001}


 11%|█         | 32/300 [02:23<16:07,  3.61s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.98, 'Factual Correctness': 0.75, 'BLEU Score': 0.4377121098094503, 'Overall Score': 0.894}


 11%|█         | 33/300 [02:30<19:59,  4.49s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.013679192123121896, 'Overall Score': 0.3}


 11%|█▏        | 34/300 [02:34<18:54,  4.27s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.03986357128268015, 'Overall Score': 1.0}


 12%|█▏        | 35/300 [02:38<18:45,  4.25s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.007913247271422612, 'Overall Score': 0.85}


 12%|█▏        | 36/300 [02:42<18:47,  4.27s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.9876, 'BLEU Score': 0.23020656163897005, 'Overall Score': 0.9650400000000001}


 12%|█▏        | 37/300 [02:45<17:32,  4.00s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.3559528809154241, 'Overall Score': 1.0}


 13%|█▎        | 38/300 [02:53<21:57,  5.03s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.91}


 13%|█▎        | 39/300 [02:58<22:23,  5.15s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.026012784404037925, 'Overall Score': 1.0}


 13%|█▎        | 40/300 [03:03<21:09,  4.88s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.735408443636839, 'Overall Score': 1.0}


 14%|█▎        | 41/300 [03:05<18:17,  4.24s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 1.0, 'BLEU Score': 0.7361703354503866, 'Overall Score': 0.91}


 14%|█▍        | 42/300 [03:13<23:16,  5.41s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.07069415914978927, 'Overall Score': 0.73628}


 14%|█▍        | 43/300 [03:16<18:58,  4.43s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 0.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 0.09}


 15%|█▍        | 44/300 [03:19<17:14,  4.04s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.54533764355819, 'Overall Score': 0.49628}


 15%|█▌        | 45/300 [03:21<15:15,  3.59s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 1.0, 'Overall Score': 0.3}


 15%|█▌        | 46/300 [03:26<16:55,  4.00s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.02878787818101127, 'Overall Score': 0.9400000000000001}


 16%|█▌        | 47/300 [03:30<17:05,  4.05s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.75, 'BLEU Score': 0.39392661757434155, 'Overall Score': 0.89628}


 16%|█▌        | 48/300 [03:33<15:23,  3.67s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 1.0}


 16%|█▋        | 49/300 [03:38<16:58,  4.06s/it]

{'Faithfulness': 0.8, 'Relevancy': 1.0, 'Factual Correctness': 0.8, 'BLEU Score': 0.00410726430147258, 'Overall Score': 0.8600000000000001}


 17%|█▋        | 50/300 [03:41<15:28,  3.71s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 1.0, 'BLEU Score': 0.2829583596743847, 'Overall Score': 0.55}


 17%|█▋        | 51/300 [03:44<14:40,  3.53s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.8}


 17%|█▋        | 52/300 [03:47<14:14,  3.45s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.4412994555017375, 'Overall Score': 0.85}


 18%|█▊        | 53/300 [03:50<12:50,  3.12s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 1.0, 'BLEU Score': 0.11362193664674995, 'Overall Score': 0.55}


 18%|█▊        | 54/300 [03:53<13:06,  3.20s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.4028998029112093, 'Overall Score': 1.0}


 18%|█▊        | 55/300 [03:58<14:59,  3.67s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.06973093458034044, 'Overall Score': 0.88628}


 19%|█▊        | 56/300 [04:00<13:20,  3.28s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.55}


 19%|█▉        | 57/300 [04:03<12:58,  3.20s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.12991916506579942, 'Overall Score': 0.74}


 19%|█▉        | 58/300 [04:06<12:42,  3.15s/it]

{'Faithfulness': 0.8974, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.38890556115271097, 'Overall Score': 0.6792199999999999}


 20%|█▉        | 59/300 [04:09<12:33,  3.13s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.3290385879986622, 'Overall Score': 0.8}


 20%|██        | 60/300 [04:14<14:27,  3.61s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.008089657572395275, 'Overall Score': 0.70628}


 20%|██        | 61/300 [04:17<13:12,  3.32s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.7}


 21%|██        | 62/300 [04:22<15:31,  3.91s/it]

{'Faithfulness': 0.987, 'Relevancy': 0.7, 'Factual Correctness': 0.75, 'BLEU Score': 0.04635351290779111, 'Overall Score': 0.8061}


 21%|██        | 63/300 [04:26<15:24,  3.90s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.479676449968321, 'Overall Score': 0.9400000000000001}


 21%|██▏       | 64/300 [04:29<13:55,  3.54s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.033031643180138064, 'Overall Score': 1.0}


 22%|██▏       | 65/300 [04:37<18:55,  4.83s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.875, 'BLEU Score': 0.011433361115787456, 'Overall Score': 0.95}


 22%|██▏       | 66/300 [04:40<17:30,  4.49s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.8, 'BLEU Score': 0.0421655762861391, 'Overall Score': 0.85628}


 22%|██▏       | 67/300 [04:51<24:53,  6.41s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.018850319022747353, 'Overall Score': 1.0}


 23%|██▎       | 68/300 [04:58<25:10,  6.51s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.010802314890908065, 'Overall Score': 0.8}


 23%|██▎       | 69/300 [05:01<21:12,  5.51s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.8, 'BLEU Score': 0.5828233954152653, 'Overall Score': 0.8600000000000001}


 23%|██▎       | 70/300 [05:04<18:33,  4.84s/it]

{'Faithfulness': 0.8, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.01428363257865929, 'Overall Score': 0.74}


 24%|██▎       | 71/300 [05:09<17:50,  4.67s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.0, 'BLEU Score': 0.011398615457120072, 'Overall Score': 0.44999999999999996}


 24%|██▍       | 72/300 [05:13<17:32,  4.61s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.05684653076227407, 'Overall Score': 0.8}


 24%|██▍       | 73/300 [05:17<16:44,  4.43s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.17333601895365613, 'Overall Score': 0.7925599999999999}


 25%|██▍       | 74/300 [05:23<18:34,  4.93s/it]

{'Faithfulness': 0.8, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.20823924096105695, 'Overall Score': 0.9400000000000001}


 25%|██▌       | 75/300 [05:31<22:05,  5.89s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.06541868941754436, 'Overall Score': 0.8}


 25%|██▌       | 76/300 [05:34<18:16,  4.89s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.7}


 26%|██▌       | 77/300 [05:38<17:08,  4.61s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.9876, 'BLEU Score': 0.4236842651859699, 'Overall Score': 0.99132}


 26%|██▌       | 78/300 [05:42<16:08,  4.36s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.17098323692758396, 'Overall Score': 0.9350400000000001}


 26%|██▋       | 79/300 [05:45<15:29,  4.20s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.4348553979929487, 'Overall Score': 0.9400000000000001}


 27%|██▋       | 80/300 [05:51<16:45,  4.57s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.1180751972626564, 'Overall Score': 0.67919}


 27%|██▋       | 81/300 [06:02<23:31,  6.44s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.0011079891322284116, 'Overall Score': 0.5}


 27%|██▋       | 82/300 [06:05<19:36,  5.39s/it]

{'Faithfulness': 0.8, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.015718877363021206, 'Overall Score': 0.74}


 28%|██▊       | 83/300 [06:10<19:16,  5.33s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.00995793413371623, 'Overall Score': 0.6799999999999999}


 28%|██▊       | 84/300 [06:14<18:02,  5.01s/it]

{'Faithfulness': 0.8974, 'Relevancy': 0.75, 'Factual Correctness': 0.5, 'BLEU Score': 0.006032401726201458, 'Overall Score': 0.69422}


 28%|██▊       | 85/300 [06:17<16:02,  4.48s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.5, 'BLEU Score': 0.12740810460323607, 'Overall Score': 0.77}


 29%|██▊       | 86/300 [06:25<18:56,  5.31s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.8, 'BLEU Score': 0.31510266845634133, 'Overall Score': 0.85628}


 29%|██▉       | 87/300 [06:28<16:35,  4.67s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.9876, 'BLEU Score': 0.5572806310452209, 'Overall Score': 0.99504}


 29%|██▉       | 88/300 [06:31<14:52,  4.21s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.12673718536830808, 'Overall Score': 1.0}


 30%|██▉       | 89/300 [06:39<18:53,  5.37s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.009765222631418293, 'Overall Score': 0.7925599999999999}


 30%|███       | 90/300 [06:42<15:57,  4.56s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 0.7}


 30%|███       | 91/300 [06:47<16:19,  4.69s/it]

{'Faithfulness': 0.9231, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.44947804052082696, 'Overall Score': 0.9119700000000001}


 31%|███       | 92/300 [06:49<14:02,  4.05s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.018850319022747353, 'Overall Score': 0.44999999999999996}


 31%|███       | 93/300 [06:55<15:59,  4.64s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.0, 'BLEU Score': 0.0016698379449578307, 'Overall Score': 0.24}


 31%|███▏      | 94/300 [07:02<18:41,  5.45s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.2727094903908803, 'Overall Score': 0.88628}


 32%|███▏      | 95/300 [07:05<15:57,  4.67s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.1374708101760565, 'Overall Score': 0.6}


 32%|███▏      | 96/300 [07:09<15:10,  4.46s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.010802314890908065, 'Overall Score': 0.8}


 32%|███▏      | 97/300 [07:16<17:31,  5.18s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.9876, 'BLEU Score': 0.22744558280062077, 'Overall Score': 0.9650400000000001}


 33%|███▎      | 98/300 [07:20<15:44,  4.68s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.0359604157411963, 'Overall Score': 0.73628}


 33%|███▎      | 99/300 [07:24<15:16,  4.56s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.07984434410510546, 'Overall Score': 0.74}


 33%|███▎      | 100/300 [07:28<14:19,  4.30s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.43319877589068106, 'Overall Score': 1.0}


 34%|███▎      | 101/300 [07:30<12:30,  3.77s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.8}


 34%|███▍      | 102/300 [07:33<11:23,  3.45s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.6}


 34%|███▍      | 103/300 [07:38<12:39,  3.85s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.04715336796407491, 'Overall Score': 0.635}


 35%|███▍      | 104/300 [07:43<14:18,  4.38s/it]

{'Faithfulness': 0.9876, 'Relevancy': 1.0, 'Factual Correctness': 0.875, 'BLEU Score': 0.2143961175741021, 'Overall Score': 0.94628}


 35%|███▌      | 105/300 [07:46<12:59,  4.00s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.025098621243978974, 'Overall Score': 1.0}


 35%|███▌      | 106/300 [07:49<11:39,  3.61s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.316227766016838, 'Overall Score': 0.7}


 36%|███▌      | 107/300 [07:51<10:20,  3.22s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.55}


 36%|███▌      | 108/300 [07:57<12:51,  4.02s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.019965747160974663, 'Overall Score': 1.0}


 36%|███▋      | 109/300 [08:00<11:57,  3.76s/it]

{'Faithfulness': 0.9231, 'Relevancy': 0.9, 'Factual Correctness': 0.75, 'BLEU Score': 0.05302620834594049, 'Overall Score': 0.8469300000000001}


 37%|███▋      | 110/300 [08:03<10:21,  3.27s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.00033492903248398575, 'Overall Score': 0.3}


 37%|███▋      | 111/300 [08:09<13:04,  4.15s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.9876, 'Factual Correctness': 0.875, 'BLEU Score': 0.004157514774097771, 'Overall Score': 0.91547}


 37%|███▋      | 112/300 [08:13<13:09,  4.20s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.49}


 38%|███▊      | 113/300 [08:19<14:43,  4.72s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.003087626696985844, 'Overall Score': 0.70628}


 38%|███▊      | 114/300 [08:26<17:08,  5.53s/it]

{'Faithfulness': 0.2, 'Relevancy': 0.2, 'Factual Correctness': 0.0, 'BLEU Score': 0.008197363904099848, 'Overall Score': 0.12}


 38%|███▊      | 115/300 [08:31<16:16,  5.28s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.92, 'Factual Correctness': 0.75, 'BLEU Score': 0.07700112744378823, 'Overall Score': 0.87228}


 39%|███▊      | 116/300 [08:35<14:37,  4.77s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.6341922683775969, 'Overall Score': 1.0}


 39%|███▉      | 117/300 [08:38<13:09,  4.31s/it]

{'Faithfulness': 0.9231, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.024010981785257506, 'Overall Score': 0.7169300000000001}


 39%|███▉      | 118/300 [08:47<17:43,  5.84s/it]

{'Faithfulness': 0.987, 'Relevancy': 0.9876, 'Factual Correctness': 0.9876, 'BLEU Score': 0.08932700906793695, 'Overall Score': 0.98742}


 40%|███▉      | 119/300 [08:51<15:43,  5.21s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.875, 'BLEU Score': 0.1885850947396187, 'Overall Score': 0.94628}


 40%|████      | 120/300 [08:55<14:51,  4.95s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.4979882017762261, 'Overall Score': 0.70919}


 40%|████      | 121/300 [09:00<14:40,  4.92s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.875, 'BLEU Score': 0.25723962032456954, 'Overall Score': 0.9200000000000002}


 41%|████      | 122/300 [09:03<12:34,  4.24s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.023980296761827107, 'Overall Score': 1.0}


 41%|████      | 123/300 [09:06<11:01,  3.74s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 1.0}


 41%|████▏     | 124/300 [09:10<11:29,  3.92s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 1.0, 'BLEU Score': 0.5758522372341492, 'Overall Score': 0.9700000000000001}


 42%|████▏     | 125/300 [09:13<11:05,  3.80s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 1.0}


 42%|████▏     | 126/300 [09:16<10:00,  3.45s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.0026792460101782304, 'Overall Score': 0.5}


 42%|████▏     | 127/300 [09:21<11:20,  3.93s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.0, 'BLEU Score': 0.055154521574922, 'Overall Score': 0.24}


 43%|████▎     | 128/300 [09:24<10:25,  3.64s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.8}


 43%|████▎     | 129/300 [09:29<11:51,  4.16s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.92, 'Factual Correctness': 0.75, 'BLEU Score': 0.2481185671118853, 'Overall Score': 0.8760000000000001}


 43%|████▎     | 130/300 [09:32<10:14,  3.62s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 0.49}


 44%|████▎     | 131/300 [09:38<12:07,  4.30s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.08430118946484098, 'Overall Score': 1.0}


 44%|████▍     | 132/300 [09:42<12:07,  4.33s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.875, 'BLEU Score': 0.3487208082877262, 'Overall Score': 0.95}


 44%|████▍     | 133/300 [09:45<10:43,  3.85s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.6}


 45%|████▍     | 134/300 [09:47<09:30,  3.44s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.050712153369465586, 'Overall Score': 1.0}


 45%|████▌     | 135/300 [09:50<09:02,  3.29s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.9234, 'BLEU Score': 0.16538645421594048, 'Overall Score': 0.9393600000000001}


 45%|████▌     | 136/300 [09:56<10:45,  3.93s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.22574247616080947, 'Overall Score': 0.83628}


 46%|████▌     | 137/300 [10:00<10:57,  4.03s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.017419743019031778, 'Overall Score': 0.78}


 46%|████▌     | 138/300 [10:05<11:31,  4.27s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.11103081472293377, 'Overall Score': 0.7925599999999999}


 46%|████▋     | 139/300 [10:08<10:19,  3.85s/it]

{'Faithfulness': 0.8, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.74}


 47%|████▋     | 140/300 [10:13<11:51,  4.44s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.0038060718506189493, 'Overall Score': 0.29000000000000004}


 47%|████▋     | 141/300 [10:17<11:02,  4.17s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.05039518688486959, 'Overall Score': 0.5900000000000001}


 47%|████▋     | 142/300 [10:22<11:23,  4.32s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.045298051905638646, 'Overall Score': 0.74}


 48%|████▊     | 143/300 [10:25<10:26,  3.99s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.18373344524824237, 'Overall Score': 0.8900000000000001}


 48%|████▊     | 144/300 [10:28<09:48,  3.78s/it]

{'Faithfulness': 0.8, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.9400000000000001}


 48%|████▊     | 145/300 [10:32<09:37,  3.72s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.030904914794178312, 'Overall Score': 0.74}


 49%|████▊     | 146/300 [10:35<09:21,  3.65s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0151463165231073, 'Overall Score': 0.0}


 49%|████▉     | 147/300 [10:40<10:08,  3.98s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.3188214519868951, 'Overall Score': 0.73628}


 49%|████▉     | 148/300 [10:44<10:07,  4.00s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.009760395370568378, 'Overall Score': 0.73628}


 50%|████▉     | 149/300 [10:46<08:53,  3.54s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.8}


 50%|█████     | 150/300 [10:52<10:21,  4.15s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.70919}


 50%|█████     | 151/300 [10:55<09:08,  3.68s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 1.0, 'BLEU Score': 0.11362193664674995, 'Overall Score': 0.55}


 51%|█████     | 152/300 [10:58<08:48,  3.57s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.875, 'BLEU Score': 0.3159148237572417, 'Overall Score': 0.95}


 51%|█████     | 153/300 [11:01<08:14,  3.36s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.4371707208826036, 'Overall Score': 0.8}


 51%|█████▏    | 154/300 [11:05<08:37,  3.55s/it]

{'Faithfulness': 0.9231, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.2974693820138358, 'Overall Score': 0.86693}


 52%|█████▏    | 155/300 [11:12<11:03,  4.57s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0025579476037043062, 'Overall Score': 0.0}


 52%|█████▏    | 156/300 [11:15<10:08,  4.23s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.13253857683103734, 'Overall Score': 0.765}


 52%|█████▏    | 157/300 [11:20<10:32,  4.42s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


 53%|█████▎    | 158/300 [11:23<09:27,  3.99s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.0, 'BLEU Score': 0.37239098949398236, 'Overall Score': 0.54}


 53%|█████▎    | 159/300 [11:27<09:00,  3.83s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.46899177269829906, 'Overall Score': 0.8900000000000001}


 53%|█████▎    | 160/300 [11:36<13:07,  5.62s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9876, 'Factual Correctness': 0.789, 'BLEU Score': 0.004385300184057802, 'Overall Score': 0.9081600000000001}


 54%|█████▎    | 161/300 [11:38<10:35,  4.57s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 1.0}


 54%|█████▍    | 162/300 [11:41<09:25,  4.10s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.5828233954152653, 'Overall Score': 0.8}


 54%|█████▍    | 163/300 [11:45<09:08,  4.00s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.498924651153352, 'Overall Score': 1.0}


 55%|█████▍    | 164/300 [11:48<08:33,  3.78s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.75, 'BLEU Score': 0.5424340753977054, 'Overall Score': 0.89628}


 55%|█████▌    | 165/300 [11:52<08:14,  3.66s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.875, 'BLEU Score': 0.2442195895839763, 'Overall Score': 0.95}


 55%|█████▌    | 166/300 [11:56<08:42,  3.90s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.2578919807609506, 'Overall Score': 0.6499999999999999}


 56%|█████▌    | 167/300 [11:59<07:32,  3.41s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 1.0}


 56%|█████▌    | 168/300 [12:04<08:42,  3.96s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.2, 'Factual Correctness': 0.0, 'BLEU Score': 0.003263584888402175, 'Overall Score': 0.06}


 56%|█████▋    | 169/300 [12:09<09:29,  4.35s/it]

{'Faithfulness': 0.9231, 'Relevancy': 0.8, 'Factual Correctness': 0.92, 'BLEU Score': 0.1642851022398886, 'Overall Score': 0.88493}


 57%|█████▋    | 170/300 [12:11<07:57,  3.67s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 0.4}


 57%|█████▋    | 171/300 [12:14<07:17,  3.39s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.12336445138236939, 'Overall Score': 0.0}


 57%|█████▋    | 172/300 [12:17<06:50,  3.21s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.06985342056580097, 'Overall Score': 1.0}


 58%|█████▊    | 173/300 [12:21<07:37,  3.60s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.005145174324020217, 'Overall Score': 0.15}


 58%|█████▊    | 174/300 [12:25<07:27,  3.55s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.6733187455351302, 'Overall Score': 0.8900000000000001}


 58%|█████▊    | 175/300 [12:30<08:35,  4.12s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.00366753025392797, 'Overall Score': 0.0}


 59%|█████▊    | 176/300 [12:34<08:25,  4.08s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.875, 'BLEU Score': 0.47331425446394204, 'Overall Score': 0.9200000000000002}


 59%|█████▉    | 177/300 [12:37<07:24,  3.62s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.09329460218997072, 'Overall Score': 0.6499999999999999}


 59%|█████▉    | 178/300 [12:40<07:16,  3.57s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.9876, 'BLEU Score': 0.21343856914627957, 'Overall Score': 0.99504}


 60%|█████▉    | 179/300 [12:43<06:58,  3.46s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.39811015372394626, 'Overall Score': 0.74}


 60%|██████    | 180/300 [12:47<06:53,  3.44s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.7, 'Factual Correctness': 0.9876, 'BLEU Score': 0.4562799427006306, 'Overall Score': 0.60504}


 60%|██████    | 181/300 [12:51<07:28,  3.77s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 0.0, 'BLEU Score': 0.02165505098802441, 'Overall Score': 0.51}


 61%|██████    | 182/300 [12:56<08:12,  4.17s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.44}


 61%|██████    | 183/300 [12:59<07:29,  3.84s/it]

{'Faithfulness': 0.8973, 'Relevancy': 1.0, 'Factual Correctness': 0.875, 'BLEU Score': 0.3679682596057247, 'Overall Score': 0.91919}


 61%|██████▏   | 184/300 [13:07<09:48,  5.07s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9, 'Factual Correctness': 0.9876, 'BLEU Score': 0.19451497315451405, 'Overall Score': 0.9613200000000001}


 62%|██████▏   | 185/300 [13:11<08:38,  4.51s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.5494911668430702, 'Overall Score': 0.9400000000000001}


 62%|██████▏   | 186/300 [13:14<07:52,  4.15s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.5, 'BLEU Score': 0.2418068126014414, 'Overall Score': 0.77}


 62%|██████▏   | 187/300 [13:17<07:14,  3.84s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.85}


 63%|██████▎   | 188/300 [13:20<06:38,  3.56s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.056122223243057295, 'Overall Score': 0.6900000000000001}


 63%|██████▎   | 189/300 [13:26<07:56,  4.29s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.04081462921322397, 'Overall Score': 0.56}


 63%|██████▎   | 190/300 [13:29<07:28,  4.08s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.11369306152454814, 'Overall Score': 0.8}


 64%|██████▎   | 191/300 [13:34<07:53,  4.34s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.1951358933330626, 'Overall Score': 0.88628}


 64%|██████▍   | 192/300 [13:38<07:25,  4.13s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.3444037291432751, 'Overall Score': 0.7925599999999999}


 64%|██████▍   | 193/300 [13:43<07:49,  4.39s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.8, 'Factual Correctness': 0.8, 'BLEU Score': 0.1735983235251396, 'Overall Score': 0.8}


 65%|██████▍   | 194/300 [13:51<09:39,  5.47s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 1.0, 'BLEU Score': 0.17986194418489154, 'Overall Score': 0.9962799999999999}


 65%|██████▌   | 195/300 [14:08<15:52,  9.07s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.9876, 'Factual Correctness': 0.875, 'BLEU Score': 0.2093236104337789, 'Overall Score': 0.91547}


 65%|██████▌   | 196/300 [14:12<12:52,  7.43s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.019454290935168924, 'Overall Score': 0.74}


 66%|██████▌   | 197/300 [14:15<10:32,  6.14s/it]

{'Faithfulness': 0.8, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.23588448106534202, 'Overall Score': 0.74}


 66%|██████▌   | 198/300 [14:18<08:57,  5.27s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.23956565612760206, 'Overall Score': 1.0}


 66%|██████▋   | 199/300 [14:22<07:45,  4.61s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.3976353643835253, 'Overall Score': 0.7}


 67%|██████▋   | 200/300 [14:25<07:00,  4.21s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.04299633317640399, 'Overall Score': 0.5}


 67%|██████▋   | 201/300 [14:28<06:38,  4.02s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.22934769499188126, 'Overall Score': 0.8900000000000001}


 67%|██████▋   | 202/300 [14:32<06:15,  3.84s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 1.0}


 68%|██████▊   | 203/300 [14:36<06:17,  3.89s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.8, 'BLEU Score': 0.033241378434410024, 'Overall Score': 0.8900000000000001}


 68%|██████▊   | 204/300 [14:39<05:39,  3.54s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 0.0, 'BLEU Score': 0.027952555963587523, 'Overall Score': 0.51}


 68%|██████▊   | 205/300 [14:43<05:59,  3.79s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.004295119260690564, 'Overall Score': 0.0}


 69%|██████▊   | 206/300 [14:46<05:42,  3.64s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.4962822700197384, 'Overall Score': 0.9400000000000001}


 69%|██████▉   | 207/300 [14:48<04:59,  3.22s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.24028114141347542, 'Overall Score': 0.7}


 69%|██████▉   | 208/300 [14:52<04:56,  3.22s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.18518626791165133, 'Overall Score': 0.8900000000000001}


 70%|██████▉   | 209/300 [14:54<04:29,  2.96s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 0.3976353643835253, 'Overall Score': 0.64}


 70%|███████   | 210/300 [14:56<04:13,  2.81s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.02777619034011792, 'Overall Score': 0.6}


 70%|███████   | 211/300 [14:59<04:13,  2.85s/it]

{'Faithfulness': 0.9876, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.007913247271422612, 'Overall Score': 0.7962799999999999}


 71%|███████   | 212/300 [15:03<04:40,  3.19s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.875, 'BLEU Score': 0.006244385951234721, 'Overall Score': 0.94628}


 71%|███████   | 213/300 [15:07<04:36,  3.18s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.7, 'Factual Correctness': 1.0, 'BLEU Score': 0.25381494737245897, 'Overall Score': 0.76}


 71%|███████▏  | 214/300 [15:12<05:30,  3.85s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.0, 'BLEU Score': 0.09964655439992519, 'Overall Score': 0.44999999999999996}


 72%|███████▏  | 215/300 [15:15<05:18,  3.74s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.7001575310229897, 'Overall Score': 0.9400000000000001}


 72%|███████▏  | 216/300 [15:22<06:15,  4.47s/it]

{'Faithfulness': 0.7, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.0021470850489829487, 'Overall Score': 0.5}


 72%|███████▏  | 217/300 [15:25<05:44,  4.15s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.5, 'BLEU Score': 0.4067434794181572, 'Overall Score': 0.77}


 73%|███████▎  | 218/300 [15:28<05:09,  3.77s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.3976353643835253, 'Overall Score': 0.3}


 73%|███████▎  | 219/300 [15:32<05:08,  3.81s/it]

{'Faithfulness': 0.987, 'Relevancy': 0.9, 'Factual Correctness': 0.875, 'BLEU Score': 0.166887086590326, 'Overall Score': 0.9161000000000001}


 73%|███████▎  | 220/300 [15:37<05:31,  4.14s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.2500653935141143, 'Overall Score': 0.8400000000000001}


 74%|███████▎  | 221/300 [15:41<05:38,  4.29s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.9400000000000001}


 74%|███████▍  | 222/300 [15:46<05:44,  4.42s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.061538103874647754, 'Overall Score': 0.0}


 74%|███████▍  | 223/300 [15:50<05:33,  4.33s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.9876, 'BLEU Score': 0.32511653451044564, 'Overall Score': 0.9650400000000001}


 75%|███████▍  | 224/300 [15:53<05:00,  3.96s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.018850319022747353, 'Overall Score': 0.6}


 75%|███████▌  | 225/300 [15:56<04:32,  3.64s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 1.0, 'BLEU Score': 0.1880230734180335, 'Overall Score': 0.91}


 75%|███████▌  | 226/300 [15:59<03:59,  3.24s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.35}


 76%|███████▌  | 227/300 [16:01<03:41,  3.03s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.3004843884984905, 'Overall Score': 0.35}


 76%|███████▌  | 228/300 [16:05<03:51,  3.21s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.762465858623486, 'Overall Score': 0.9400000000000001}


 76%|███████▋  | 229/300 [16:08<03:45,  3.18s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.4832697830906221, 'Overall Score': 1.0}


 77%|███████▋  | 230/300 [16:16<05:23,  4.62s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


 77%|███████▋  | 231/300 [16:24<06:41,  5.81s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.006902101961911971, 'Overall Score': 0.575}


 77%|███████▋  | 232/300 [16:29<06:20,  5.60s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.17643174755405056, 'Overall Score': 0.88628}


 78%|███████▊  | 233/300 [16:35<06:12,  5.56s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.01025915836612856, 'Overall Score': 0.635}


 78%|███████▊  | 234/300 [16:41<06:12,  5.64s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.875, 'BLEU Score': 0.39266063610874746, 'Overall Score': 0.94628}


 78%|███████▊  | 235/300 [16:45<05:47,  5.34s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.00846177132910779, 'Overall Score': 0.0}


 79%|███████▊  | 236/300 [16:49<05:17,  4.96s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.01853380378290996, 'Overall Score': 0.0}


 79%|███████▉  | 237/300 [16:52<04:25,  4.21s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.6262844962765468, 'Overall Score': 0.5900000000000001}


 79%|███████▉  | 238/300 [16:54<03:41,  3.58s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.44999999999999996}


 80%|███████▉  | 239/300 [16:57<03:28,  3.42s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.2240750868020436, 'Overall Score': 1.0}


 80%|████████  | 240/300 [17:03<04:03,  4.05s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.15}


 80%|████████  | 241/300 [17:08<04:13,  4.30s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.8976, 'BLEU Score': 0.24289282936861023, 'Overall Score': 0.8990400000000001}


 81%|████████  | 242/300 [17:11<03:48,  3.94s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.3237722713145643, 'Overall Score': 1.0}


 81%|████████  | 243/300 [17:13<03:22,  3.56s/it]

{'Faithfulness': 0.8973, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.2626909894424158, 'Overall Score': 0.76919}


 81%|████████▏ | 244/300 [17:23<04:58,  5.32s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.0012933368729412665, 'Overall Score': 0.2}


 82%|████████▏ | 245/300 [17:25<04:10,  4.55s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 1.0}


 82%|████████▏ | 246/300 [17:29<03:53,  4.32s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.9, 'Factual Correctness': 0.5, 'BLEU Score': 0.10414419091986514, 'Overall Score': 0.73919}


 82%|████████▏ | 247/300 [17:32<03:23,  3.84s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.10445522730720382, 'Overall Score': 1.0}


 83%|████████▎ | 248/300 [17:36<03:20,  3.86s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.92, 'Factual Correctness': 0.9876, 'BLEU Score': 0.429697109563662, 'Overall Score': 0.9673200000000001}


 83%|████████▎ | 249/300 [17:40<03:15,  3.83s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.7, 'Factual Correctness': 0.875, 'BLEU Score': 0.011407522846435572, 'Overall Score': 0.8291900000000001}


 83%|████████▎ | 250/300 [17:44<03:25,  4.11s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.3381307292971254, 'Overall Score': 0.9350400000000001}


 84%|████████▎ | 251/300 [18:00<06:03,  7.43s/it]

{'Faithfulness': 0.987, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.010100053456347076, 'Overall Score': 0.7061}


 84%|████████▍ | 252/300 [18:02<04:40,  5.84s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 0.0, 'BLEU Score': 0.19180183554164504, 'Overall Score': 0.15}


 84%|████████▍ | 253/300 [18:07<04:20,  5.54s/it]

{'Faithfulness': 0.987, 'Relevancy': 0.9234, 'Factual Correctness': 0.875, 'BLEU Score': 0.2802078137383573, 'Overall Score': 0.9231199999999999}


 85%|████████▍ | 254/300 [18:10<03:44,  4.87s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.8, 'BLEU Score': 0.21409092659758044, 'Overall Score': 0.92}


 85%|████████▌ | 255/300 [18:13<03:16,  4.36s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.85}


 85%|████████▌ | 256/300 [18:16<02:51,  3.91s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 1.0}


 86%|████████▌ | 257/300 [18:19<02:39,  3.71s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0446323613785333, 'Overall Score': 0.6}


 86%|████████▌ | 258/300 [18:24<02:50,  4.07s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 1.0, 'BLEU Score': 0.7160350546947921, 'Overall Score': 0.9962799999999999}


 86%|████████▋ | 259/300 [18:28<02:47,  4.08s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.26529518334824453, 'Overall Score': 0.8}


 87%|████████▋ | 260/300 [18:31<02:34,  3.85s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9, 'Factual Correctness': 0.8, 'BLEU Score': 0.24979117342213525, 'Overall Score': 0.8862800000000001}


 87%|████████▋ | 261/300 [18:34<02:19,  3.57s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.24}


 87%|████████▋ | 262/300 [18:40<02:41,  4.26s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.048383149649462574, 'Overall Score': 0.5}


 88%|████████▊ | 263/300 [18:43<02:18,  3.74s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.44999999999999996}


 88%|████████▊ | 264/300 [18:48<02:35,  4.32s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.17254269827671675, 'Overall Score': 0.64628}


 88%|████████▊ | 265/300 [18:51<02:14,  3.85s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.021105340631872645, 'Overall Score': 0.8}


 89%|████████▊ | 266/300 [18:55<02:14,  3.94s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.5031925670910535, 'Overall Score': 0.85928}


 89%|████████▉ | 267/300 [18:57<01:51,  3.39s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.7}


 89%|████████▉ | 268/300 [19:02<01:57,  3.67s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.012109013026441871, 'Overall Score': 0.74}


 90%|████████▉ | 269/300 [19:05<01:47,  3.46s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.8}


 90%|█████████ | 270/300 [19:09<01:47,  3.60s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.24180681260144146, 'Overall Score': 0.8900000000000001}


 90%|█████████ | 271/300 [19:12<01:41,  3.48s/it]

{'Faithfulness': 0.875, 'Relevancy': 1.0, 'Factual Correctness': 0.9876, 'BLEU Score': 0.30004556274899286, 'Overall Score': 0.9575400000000001}


 91%|█████████ | 272/300 [19:16<01:44,  3.74s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.668740304976422, 'Overall Score': 0.7}


 91%|█████████ | 273/300 [19:21<01:46,  3.96s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.24903719809455385, 'Overall Score': 0.6499999999999999}


 91%|█████████▏| 274/300 [19:23<01:33,  3.59s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.04939382737115371, 'Overall Score': 0.85}


 92%|█████████▏| 275/300 [19:27<01:28,  3.56s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.7941054938007994, 'Overall Score': 1.0}


 92%|█████████▏| 276/300 [19:30<01:19,  3.30s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.6}


 92%|█████████▏| 277/300 [19:32<01:12,  3.15s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.017033186037639283, 'Overall Score': 1.0}


 93%|█████████▎| 278/300 [19:35<01:06,  3.02s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.64}


 93%|█████████▎| 279/300 [19:39<01:05,  3.14s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.4924790605054523, 'Overall Score': 0.74}


 93%|█████████▎| 280/300 [19:42<01:07,  3.36s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.010032055247005962, 'Overall Score': 0.0}


 94%|█████████▎| 281/300 [19:47<01:12,  3.81s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


 94%|█████████▍| 282/300 [19:50<01:00,  3.39s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.007444903200446295, 'Overall Score': 0.7}


 94%|█████████▍| 283/300 [19:55<01:06,  3.92s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.9, 'Factual Correctness': 0.5, 'BLEU Score': 0.009871721786705418, 'Overall Score': 0.7392799999999999}


 95%|█████████▍| 284/300 [19:58<00:58,  3.66s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.3976353643835253, 'Overall Score': 0.79}


 95%|█████████▌| 285/300 [20:00<00:49,  3.31s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 1.0}


 95%|█████████▌| 286/300 [20:03<00:44,  3.21s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.3258816484718314, 'Overall Score': 0.93132}


 96%|█████████▌| 287/300 [20:07<00:42,  3.28s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.05949757664833034, 'Overall Score': 0.8}


 96%|█████████▌| 288/300 [20:12<00:45,  3.81s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.26337949320553106, 'Overall Score': 0.83628}


 96%|█████████▋| 289/300 [20:15<00:39,  3.56s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.5329462628216854, 'Overall Score': 0.7}


 97%|█████████▋| 290/300 [20:17<00:32,  3.23s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.5623413251903491, 'Overall Score': 0.7}


 97%|█████████▋| 291/300 [20:24<00:38,  4.28s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.005157142709886005, 'Overall Score': 1.0}


 97%|█████████▋| 292/300 [20:27<00:31,  3.98s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.17395797375642233, 'Overall Score': 0.74}


 98%|█████████▊| 293/300 [20:31<00:27,  3.96s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9, 'Factual Correctness': 0.875, 'BLEU Score': 0.342104577056443, 'Overall Score': 0.91628}


 98%|█████████▊| 294/300 [20:34<00:20,  3.47s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.17216896116316355, 'Overall Score': 0.44}


 98%|█████████▊| 295/300 [20:36<00:15,  3.17s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.050712153369465586, 'Overall Score': 0.8}


 99%|█████████▊| 296/300 [20:40<00:13,  3.39s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.021847844937497602, 'Overall Score': 0.85}


 99%|█████████▉| 297/300 [20:43<00:09,  3.28s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.316227766016838, 'Overall Score': 0.4}


 99%|█████████▉| 298/300 [20:47<00:07,  3.57s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.875, 'BLEU Score': 0.2069016238132598, 'Overall Score': 0.94628}


100%|█████████▉| 299/300 [20:52<00:03,  3.80s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.92, 'Factual Correctness': 0.5, 'BLEU Score': 0.45454679014090354, 'Overall Score': 0.7722800000000001}


100%|██████████| 300/300 [20:59<00:00,  4.20s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.8, 'BLEU Score': 0.08103230024642705, 'Overall Score': 0.91628}





In [9]:
# Average every per-question metric in `scores` into one run-level value.
# `scores` is produced by an earlier cell; presumably a list of metric dicts
# keyed by the five names used below -- TODO confirm against that cell.
if len(scores) == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError
    # from the divisions further down.
    raise ValueError("`scores` is empty: run the evaluation loop before averaging.")

# NOTE: the `faithfullness` spelling is kept on purpose -- the reporting cell
# reads this exact name.
faithfullness, relevance, factual, bleu, overall = 0, 0, 0, 0, 0
for score in scores:
    # Iterate over the entries directly; the positional index is not needed.
    # Plain running totals keep the same accumulation order as before, so the
    # floating-point results are unchanged.
    faithfullness += score['Faithfulness']
    relevance += score['Relevancy']
    factual += score['Factual Correctness']
    bleu += score['BLEU Score']
    overall += score['Overall Score']

# Turn the running totals into arithmetic means over all scored entries.
num_scores = len(scores)
faithfullness /= num_scores
relevance /= num_scores
factual /= num_scores
bleu /= num_scores
overall /= num_scores

In [10]:
# Report the run-level mean of every metric, one per line.
# The variables are the averages computed in the previous cell.
print(f"Faithfullness: {faithfullness}")
print(f"Relevance: {relevance}")
print(f"Factual Correctness: {factual}")
print(f"BLEU Score: {bleu}")
print(f"Overall Score: {overall}")

# Recorded results (a pasted copy of this cell's printed output), kept for
# reference. Stored as a comment rather than a bare string literal so the cell
# does not evaluate and echo it as its output value:
#   Faithfullness: 0.7438866666666664
#   Relevance: 0.7978940000000004
#   Factual Correctness: 0.6772726666666667
#   BLEU Score: 0.21887150058996804
#   Overall Score: 0.733443266666666

Faithfullness: 0.7438866666666664
Relevance: 0.7978940000000004
Factual Correctness: 0.6772726666666667
BLEU Score: 0.21887150058996804
Overall Score: 0.733443266666666
