In [11]:
from eval import evaluate_single_input
from datasets import load_dataset
import faiss
import numpy as np
import pickle
import openai
from openai import OpenAI
!pip install rank_bm25
from rank_bm25 import BM25Okapi  # Sparse retrieval (BM25)



In [12]:
# Initialize the DeepInfra client (OpenAI-compatible endpoint).
# The API key is read from the DEEPINFRA_API_KEY environment variable so that
# no credential is stored in the notebook; a KeyError here means it is unset.
import os

# NOTE: this rebinding shadows the `openai` module name; the functions below
# call `openai.embeddings` / `openai.chat` on this client instance.
openai = OpenAI(
    api_key=os.environ["DEEPINFRA_API_KEY"],
    base_url="https://api.deepinfra.com/v1/openai",
)

In [13]:
def load_faiss_index(index_path, metadata_path):
    """
    Reads a FAISS index and its pickled metadata from disk.

    Args:
        index_path: Path handed to ``faiss.read_index``.
        metadata_path: Path of the pickle file holding the metadata.

    Returns:
        Tuple ``(index, metadata)``.
    """
    faiss_index = faiss.read_index(index_path)
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were produced by a trusted source.
    with open(metadata_path, 'rb') as handle:
        chunk_metadata = pickle.load(handle)
    return faiss_index, chunk_metadata

In [14]:
def search_faiss(query, top_k, index_path, metadata_path):
    """
    Searches FAISS for the chunks most relevant to the query.

    Args:
        query: Text that is embedded and used as the search vector.
        top_k: Maximum number of neighbours requested from the index.
        index_path: Path of the FAISS index file.
        metadata_path: Path of the pickled metadata; it is indexed with the
            ids returned by FAISS.

    Returns:
        List of ``(metadata_entry, distance)`` tuples in the order FAISS
        returned them. The list is shorter than ``top_k`` when the index
        yields fewer valid neighbours.
    """
    # Load index and metadata
    index, metadata = load_faiss_index(index_path, metadata_path)

    # Generate embedding for the query
    response = openai.embeddings.create(
        model="BAAI/bge-m3",
        input=query,
        encoding_format="float"
    )
    # FAISS works on float32 matrices of shape (n_queries, dim); a plain
    # np.array of Python floats would be float64.
    query_embedding = np.array([response.data[0].embedding], dtype=np.float32)

    # Search FAISS index for the top_k results
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads the result with label -1 when fewer than top_k neighbours
    # exist; skip those so metadata[-1] is never returned as a bogus hit.
    return [
        (metadata[idx], dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]

In [15]:
# Load the question-answering dataset used for evaluation.
dataset = load_dataset(
    path='BlackFear/istanbul-qa-dataset',
    trust_remote_code=True,
)

# Only the 'test' split is used below.
data = dataset['test']

# Questions and their reference answers; the evaluation loop reads both
# columns with the same position index.
queries, references = data['question'], data['reference']


In [16]:
def generate_answer(query, context):
    """
    Asks the chat model to answer ``query`` using the retrieved ``context``.

    Args:
        query: The question inserted into the prompt.
        context: Text inserted into the prompt under "Context:".

    Returns:
        The content of the first completion choice, stripped of surrounding
        whitespace.
    """
    # The leading indentation inside the template is part of the prompt text
    # sent to the model.
    user_prompt = f"""
    Use the following contexts to answer the question:

    Context:
    {context}

    Question:
    {query}

    Answer:
    """

    system_message = {
        "role": "system",
        "content": "You will be answering questions about Istanbul. Please provide the answer to the following question.",
    }
    user_message = {"role": "user", "content": user_prompt}

    # Send the conversation to the Llama chat model
    completion = openai.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=[system_message, user_message],
    )
    reply = completion.choices[0].message
    return reply.content.strip()

In [17]:
def hyde(query):
    """
    Generates a hypothetical document for the query with the chat model.

    Only the generated document is returned; joining it with the original
    query (the Query2Doc variant) is left to the caller.
    """
    # Ask the LLM to write a hypothetical document for the query
    generation_prompt = f"""
    Generate a detailed hypothetical document based on the query below:
    Just give the document nothing else.
    Query: {query}

    Hypothetical Document:
    """
    user_message = {'role': 'user', 'content': generation_prompt}
    completion = openai.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=[user_message],
    )

    # Return the generated text without surrounding whitespace
    return completion.choices[0].message.content.strip()

In [18]:
def hybrid_search(query, top_k, index_path, metadata_path):
    """
    Re-ranks the dense (FAISS) top_k candidates with a 50/50 blend of the
    dense similarity and the BM25 score of each candidate.

    BM25 scores are computed for every metadata entry, but only entries that
    FAISS returned are candidates; BM25 does not add new candidates.

    Args:
        query: Text used both for the embedding and (whitespace-tokenized)
            for BM25.
        top_k: Maximum number of neighbours requested from FAISS.
        index_path: Path of the FAISS index file.
        metadata_path: Path of the pickled metadata; entries are split on
            whitespace for BM25, so they are expected to be strings.

    Returns:
        List of ``(metadata_entry, combined_score)`` tuples sorted by
        descending combined score; at most ``top_k`` entries.
    """
    # Load FAISS index and metadata
    index, metadata = load_faiss_index(index_path, metadata_path)

    # Dense Retrieval (FAISS)
    response = openai.embeddings.create(
        model="BAAI/bge-m3",  # Dense embedding model
        input=query,
        encoding_format="float"
    )
    # FAISS works on float32 matrices of shape (n_queries, dim).
    query_embedding = np.array([response.data[0].embedding], dtype=np.float32)
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads with label -1 when fewer than top_k neighbours exist; drop
    # the padding so it cannot index the last metadata entry by accident.
    found = indices[0] >= 0
    candidate_ids = indices[0][found]
    candidate_distances = distances[0][found]

    # Sparse Retrieval (BM25)
    tokenized_metadata = [doc.split() for doc in metadata]  # Preprocess metadata
    bm25 = BM25Okapi(tokenized_metadata)
    sparse_scores = np.asarray(bm25.get_scores(query.split()), dtype=float)

    # Convert FAISS distances to a similarity in (0, 1].
    # NOTE(review): this assumes the index returns non-negative distances
    # (e.g. L2); confirm the index is not an inner-product index.
    dense_scores = 1 / (1 + candidate_distances)

    # Normalize BM25 scores by their maximum. When nothing matches the
    # maximum is 0 and dividing would produce NaN, so fall back to zeros.
    max_sparse = sparse_scores.max() if sparse_scores.size else 0.0
    if max_sparse > 0:
        sparse_scores = sparse_scores / max_sparse
    else:
        sparse_scores = np.zeros_like(sparse_scores)

    # Combine dense and sparse scores (50-50 weight)
    combined_scores = 0.5 * dense_scores + 0.5 * sparse_scores[candidate_ids]

    # Sort candidates by combined score, best first
    order = np.argsort(-combined_scores)
    return [(metadata[candidate_ids[i]], combined_scores[i]) for i in order]

In [19]:
def rag_pipeline(query, top_k=3, method="hyde",retrive_method='hybrid', index_path='index.faiss', metadata_path='metadata.pkl'):
    """
    Executes the RAG pipeline: optional query expansion, retrieval, and
    answer generation.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve.
        method: Query expansion used for retrieval. 'query2doc' searches with
            the question followed by a hypothetical document, 'hyde' searches
            with the hypothetical document alone, and any other value
            searches with the question unchanged.
        retrive_method: 'hybrid' uses ``hybrid_search``; any other value uses
            ``search_faiss``.
        index_path: Path of the FAISS index file.
        metadata_path: Path of the pickled metadata.

    Returns:
        Tuple ``(answer, results)`` where ``results`` is the list of
        ``(chunk, score)`` tuples returned by the retriever.
    """
    # The expansion is used for retrieval only; the original question is kept
    # so the model is asked to answer it, not the hypothetical document.
    retrieval_query = query
    if method == 'query2doc':
        retrieval_query = query + " " + hyde(query)  # Query2Doc method
    elif method == 'hyde':
        retrieval_query = hyde(query)  # Hypothetical document only for retrieval

    # Perform retrieval based on method
    if retrive_method == 'hybrid':
        results = hybrid_search(retrieval_query, top_k, index_path, metadata_path)
    else:
        results = search_faiss(retrieval_query, top_k, index_path, metadata_path)

    # Combine retrieved chunks as context
    context = "\n\n".join(f"Chunk {i+1}: {result[0]}" for i, result in enumerate(results))

    # Generate an answer to the original question from the retrieved context
    answer = generate_answer(query, context)

    # Return the answer and retrieved results
    return answer, results

In [20]:
from tqdm import tqdm

# Run the pipeline on every question and collect one evaluation result per
# question in `scores`.
scores = []
for i, query in enumerate(tqdm(queries, desc="Processing queries", total=len(queries))):
    reference = references[i]
    answer, results = rag_pipeline(
        query,
        top_k=3,
        method="",
        retrive_method='hybrid',
        index_path='faiss_index_recursive.bin',
        metadata_path='metadata_recursive.pkl',
    )
    # Keep only the first element of each retrieved tuple for the evaluator
    results = [retrieved[0] for retrieved in results]
    score = evaluate_single_input(query, reference, answer, results)
    print(score)
    scores.append(score)


Processing queries:   0%|          | 1/300 [00:04<20:03,  4.03s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.7, 'Factual Correctness': 0.75, 'BLEU Score': 0.14088803229434801, 'Overall Score': 0.80628}


Processing queries:   1%|          | 2/300 [00:11<29:49,  6.00s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.05873949094699214, 'Overall Score': 0.7}


Processing queries:   1%|          | 3/300 [00:14<23:35,  4.77s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.023956565612760213, 'Overall Score': 0.5}


Processing queries:   1%|▏         | 4/300 [00:17<18:46,  3.81s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.24028114141347542, 'Overall Score': 0.85}


Processing queries:   2%|▏         | 5/300 [00:22<21:05,  4.29s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.021650458023945426, 'Overall Score': 0.0}


Processing queries:   2%|▏         | 6/300 [00:25<18:41,  3.81s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.9876, 'BLEU Score': 0.6848075777090852, 'Overall Score': 0.99132}


Processing queries:   2%|▏         | 7/300 [00:29<19:51,  4.07s/it]

{'Faithfulness': 0.789, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.002877533669929391, 'Overall Score': 0.5267}


Processing queries:   3%|▎         | 8/300 [00:32<17:34,  3.61s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.2, 'Factual Correctness': 1.0, 'BLEU Score': 0.09069748827745895, 'Overall Score': 0.61}


Processing queries:   3%|▎         | 9/300 [00:39<22:57,  4.73s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.9876, 'Factual Correctness': 0.9876, 'BLEU Score': 0.1377594470357124, 'Overall Score': 0.96051}


Processing queries:   3%|▎         | 10/300 [00:46<25:58,  5.37s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.002615117268701878, 'Overall Score': 0.44}


Processing queries:   4%|▎         | 11/300 [00:49<22:29,  4.67s/it]

{'Faithfulness': 0.9876, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.012300686288463768, 'Overall Score': 0.7962799999999999}


Processing queries:   4%|▍         | 12/300 [00:53<21:05,  4.39s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.30130404892785684, 'Overall Score': 0.8}


Processing queries:   4%|▍         | 13/300 [00:59<23:23,  4.89s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.5}


Processing queries:   5%|▍         | 14/300 [01:03<22:08,  4.64s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.011653898241358245, 'Overall Score': 0.70919}


Processing queries:   5%|▌         | 15/300 [01:07<21:06,  4.44s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.011692093421606195, 'Overall Score': 0.0}


Processing queries:   5%|▌         | 16/300 [01:10<19:29,  4.12s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.05260868474998189, 'Overall Score': 0.0}


Processing queries:   6%|▌         | 17/300 [01:12<16:52,  3.58s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 0.7}


Processing queries:   6%|▌         | 18/300 [01:17<17:50,  3.80s/it]

{'Faithfulness': 0.8973, 'Relevancy': 1.0, 'Factual Correctness': 0.9876, 'BLEU Score': 0.011440503453835606, 'Overall Score': 0.96423}


Processing queries:   6%|▋         | 19/300 [01:22<19:59,  4.27s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.7, 'Factual Correctness': 0.875, 'BLEU Score': 0.1473429006806873, 'Overall Score': 0.8562799999999999}


Processing queries:   7%|▋         | 20/300 [01:27<21:22,  4.58s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.08384879131223627, 'Overall Score': 0.5}


Processing queries:   7%|▋         | 21/300 [01:32<20:41,  4.45s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.8976, 'BLEU Score': 0.027427210913946506, 'Overall Score': 0.80904}


Processing queries:   7%|▋         | 22/300 [01:35<18:53,  4.08s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.7498810286408993, 'Overall Score': 0.9400000000000001}


Processing queries:   8%|▊         | 23/300 [01:37<16:22,  3.55s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1933853138176172, 'Overall Score': 1.0}


Processing queries:   8%|▊         | 24/300 [01:40<15:30,  3.37s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.053463162573637084, 'Overall Score': 0.44}


Processing queries:   8%|▊         | 25/300 [01:43<14:14,  3.11s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.016060081131239495, 'Overall Score': 0.85}


Processing queries:   9%|▊         | 26/300 [01:45<13:15,  2.90s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 1.0}


Processing queries:   9%|▉         | 27/300 [01:49<14:56,  3.29s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.006244385951234721, 'Overall Score': 0.0}


Processing queries:   9%|▉         | 28/300 [01:51<13:20,  2.94s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 0.316227766016838, 'Overall Score': 0.49}


Processing queries:  10%|▉         | 29/300 [01:55<13:53,  3.08s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.017805390573849337, 'Overall Score': 0.0}


Processing queries:  10%|█         | 30/300 [01:57<12:34,  2.79s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 0.7}


Processing queries:  10%|█         | 31/300 [02:00<13:22,  2.98s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.3313880358713493, 'Overall Score': 0.765}


Processing queries:  11%|█         | 32/300 [02:03<13:20,  2.99s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.98, 'Factual Correctness': 0.8, 'BLEU Score': 0.3804736860717707, 'Overall Score': 0.914}


Processing queries:  11%|█         | 33/300 [02:06<13:30,  3.03s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.005641822153183602, 'Overall Score': 0.0}


Processing queries:  11%|█▏        | 34/300 [02:09<12:32,  2.83s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.03986357128268015, 'Overall Score': 0.85}


Processing queries:  12%|█▏        | 35/300 [02:11<12:19,  2.79s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.6499999999999999}


Processing queries:  12%|█▏        | 36/300 [02:14<12:39,  2.88s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.38091370416670794, 'Overall Score': 0.8400000000000001}


Processing queries:  12%|█▏        | 37/300 [02:18<13:03,  2.98s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.3559528809154241, 'Overall Score': 1.0}


Processing queries:  13%|█▎        | 38/300 [02:20<12:39,  2.90s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.91}


Processing queries:  13%|█▎        | 39/300 [02:23<11:46,  2.71s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.08633400213704504, 'Overall Score': 0.6499999999999999}


Processing queries:  13%|█▎        | 40/300 [02:29<16:03,  3.70s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9, 'Factual Correctness': 1.0, 'BLEU Score': 0.13024841798009487, 'Overall Score': 0.96628}


Processing queries:  14%|█▎        | 41/300 [02:31<14:22,  3.33s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 1.0, 'BLEU Score': 0.7361703354503866, 'Overall Score': 0.91}


Processing queries:  14%|█▍        | 42/300 [02:40<21:07,  4.91s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.055315857891618116, 'Overall Score': 0.44}


Processing queries:  14%|█▍        | 43/300 [02:44<20:03,  4.68s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.88}


Processing queries:  15%|█▍        | 44/300 [02:47<18:09,  4.26s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.9876, 'Factual Correctness': 1.0, 'BLEU Score': 0.45410737314088695, 'Overall Score': 0.84628}


Processing queries:  15%|█▌        | 45/300 [02:49<15:30,  3.65s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 1.0, 'Overall Score': 0.3}


Processing queries:  15%|█▌        | 46/300 [02:53<14:57,  3.53s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.02253741272267485, 'Overall Score': 0.6799999999999999}


Processing queries:  16%|█▌        | 47/300 [02:56<14:13,  3.37s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.42902556537105646, 'Overall Score': 1.0}


Processing queries:  16%|█▌        | 48/300 [03:00<15:36,  3.72s/it]

{'Faithfulness': 0.8912, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.00366753025392797, 'Overall Score': 0.26736}


Processing queries:  16%|█▋        | 49/300 [03:06<18:08,  4.34s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0028919892142542742, 'Overall Score': 0.0}


Processing queries:  17%|█▋        | 50/300 [03:10<17:25,  4.18s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.11110969989860409, 'Overall Score': 0.0}


Processing queries:  17%|█▋        | 51/300 [03:14<17:09,  4.13s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.8}


Processing queries:  17%|█▋        | 52/300 [03:22<21:33,  5.22s/it]

{'Faithfulness': 0.8976, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.4412994555017375, 'Overall Score': 0.96928}


Processing queries:  18%|█▊        | 53/300 [03:24<18:28,  4.49s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1495348781221221, 'Overall Score': 0.7}


Processing queries:  18%|█▊        | 54/300 [03:32<21:41,  5.29s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.875, 'BLEU Score': 0.31535540524901323, 'Overall Score': 0.94628}


Processing queries:  18%|█▊        | 55/300 [03:37<21:46,  5.33s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 0.875, 'BLEU Score': 0.24671763489589052, 'Overall Score': 0.8600000000000001}


Processing queries:  19%|█▊        | 56/300 [03:41<19:53,  4.89s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.11633369384516798, 'Overall Score': 0.0}


Processing queries:  19%|█▉        | 57/300 [03:48<23:03,  5.69s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.10308675254291907, 'Overall Score': 0.5900000000000001}


Processing queries:  19%|█▉        | 58/300 [03:54<22:49,  5.66s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.034262787194737264, 'Overall Score': 0.44}


Processing queries:  20%|█▉        | 59/300 [03:56<18:55,  4.71s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.3290385879986622, 'Overall Score': 0.8}


Processing queries:  20%|██        | 60/300 [04:00<17:35,  4.40s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.016734480530603443, 'Overall Score': 0.6}


Processing queries:  20%|██        | 61/300 [04:03<15:42,  3.94s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.7}


Processing queries:  21%|██        | 62/300 [04:07<16:09,  4.07s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.24}


Processing queries:  21%|██        | 63/300 [04:12<16:22,  4.15s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.2603965252496297, 'Overall Score': 0.7092799999999999}


Processing queries:  21%|██▏       | 64/300 [04:14<14:30,  3.69s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1495348781221221, 'Overall Score': 1.0}


Processing queries:  22%|██▏       | 65/300 [04:21<17:38,  4.50s/it]

{'Faithfulness': 0.987, 'Relevancy': 0.9876, 'Factual Correctness': 0.9876, 'BLEU Score': 0.07174630278990472, 'Overall Score': 0.98742}


Processing queries:  22%|██▏       | 66/300 [04:23<15:20,  3.93s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.346697783111003, 'Overall Score': 0.74}


Processing queries:  22%|██▏       | 67/300 [04:26<13:46,  3.55s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.018850319022747353, 'Overall Score': 1.0}


Processing queries:  23%|██▎       | 68/300 [04:36<21:40,  5.60s/it]

{'Faithfulness': 0.7, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.0008488328084126482, 'Overall Score': 0.62}


Processing queries:  23%|██▎       | 69/300 [04:46<26:06,  6.78s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.9036020036098448, 'Overall Score': 0.90928}


Processing queries:  23%|██▎       | 70/300 [04:49<21:33,  5.63s/it]

{'Faithfulness': 0.75, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.03285702044797773, 'Overall Score': 0.7249999999999999}


Processing queries:  24%|██▎       | 71/300 [04:52<18:45,  4.91s/it]

{'Faithfulness': 0.789, 'Relevancy': 0.8, 'Factual Correctness': 0.0, 'BLEU Score': 0.03776949794525175, 'Overall Score': 0.4767}


Processing queries:  24%|██▍       | 72/300 [04:55<16:21,  4.31s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 1.0}


Processing queries:  24%|██▍       | 73/300 [04:58<15:15,  4.03s/it]

{'Faithfulness': 0.987, 'Relevancy': 0.9, 'Factual Correctness': 0.875, 'BLEU Score': 0.14868720326332424, 'Overall Score': 0.9161000000000001}


Processing queries:  25%|██▍       | 74/300 [05:02<14:32,  3.86s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.008817767310939813, 'Overall Score': 0.0}


Processing queries:  25%|██▌       | 75/300 [05:06<14:58,  3.99s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.0768968397262906, 'Overall Score': 0.8}


Processing queries:  25%|██▌       | 76/300 [05:09<14:03,  3.77s/it]

{'Faithfulness': 0.897, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.009629943614188135, 'Overall Score': 0.7690999999999999}


Processing queries:  26%|██▌       | 77/300 [05:13<13:50,  3.72s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.7817608073542297, 'Overall Score': 1.0}


Processing queries:  26%|██▌       | 78/300 [05:16<12:53,  3.48s/it]

{'Faithfulness': 0.8974, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.03662395773133831, 'Overall Score': 0.6792199999999999}


Processing queries:  26%|██▋       | 79/300 [05:19<12:48,  3.48s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.4348553979929487, 'Overall Score': 0.9400000000000001}


Processing queries:  27%|██▋       | 80/300 [05:25<15:29,  4.23s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.23049482057713108, 'Overall Score': 0.6499999999999999}


Processing queries:  27%|██▋       | 81/300 [05:29<14:41,  4.02s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.2, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.21}


Processing queries:  27%|██▋       | 82/300 [05:32<13:17,  3.66s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.015718877363021206, 'Overall Score': 0.8}


Processing queries:  28%|██▊       | 83/300 [05:36<14:26,  4.00s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


Processing queries:  28%|██▊       | 84/300 [05:40<13:33,  3.76s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.75, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.6499999999999999}


Processing queries:  28%|██▊       | 85/300 [05:44<13:45,  3.84s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.004826741258911654, 'Overall Score': 0.635}


Processing queries:  29%|██▊       | 86/300 [05:48<13:58,  3.92s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.8, 'Factual Correctness': 0.8, 'BLEU Score': 0.37832307763098255, 'Overall Score': 0.8}


Processing queries:  29%|██▉       | 87/300 [05:51<13:07,  3.70s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.043179000236065884, 'Overall Score': 0.0}


Processing queries:  29%|██▉       | 88/300 [05:53<11:43,  3.32s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1777835117834348, 'Overall Score': 1.0}


Processing queries:  30%|██▉       | 89/300 [05:58<12:49,  3.65s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.009366074330005709, 'Overall Score': 0.0}


Processing queries:  30%|███       | 90/300 [06:05<16:47,  4.80s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.24}


Processing queries:  30%|███       | 91/300 [06:14<20:31,  5.89s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.0031908653555779358, 'Overall Score': 0.5}


Processing queries:  31%|███       | 92/300 [06:17<17:10,  4.95s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.7, 'Factual Correctness': 0.0, 'BLEU Score': 0.014628063653657535, 'Overall Score': 0.36}


Processing queries:  31%|███       | 93/300 [06:23<18:13,  5.28s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.0023555952768576035, 'Overall Score': 0.44}


Processing queries:  31%|███▏      | 94/300 [06:25<15:40,  4.57s/it]

{'Faithfulness': 0.897, 'Relevancy': 0.7, 'Factual Correctness': 0.875, 'BLEU Score': 0.20720177984194466, 'Overall Score': 0.8291}


Processing queries:  32%|███▏      | 95/300 [06:28<13:44,  4.02s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.5, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.44999999999999996}


Processing queries:  32%|███▏      | 96/300 [06:31<12:14,  3.60s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.2, 'Factual Correctness': 1.0, 'BLEU Score': 0.11362193664674995, 'Overall Score': 0.46}


Processing queries:  32%|███▏      | 97/300 [06:36<13:18,  3.93s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.7, 'Factual Correctness': 0.8, 'BLEU Score': 0.1561131741999243, 'Overall Score': 0.755}


Processing queries:  33%|███▎      | 98/300 [06:40<14:14,  4.23s/it]

{'Faithfulness': 0.7891, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.005474870710453651, 'Overall Score': 0.67673}


Processing queries:  33%|███▎      | 99/300 [06:45<14:00,  4.18s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.016496625424967437, 'Overall Score': 0.80928}


Processing queries:  33%|███▎      | 100/300 [06:48<13:00,  3.90s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.146798691397542, 'Overall Score': 0.6499999999999999}


Processing queries:  34%|███▎      | 101/300 [06:51<12:40,  3.82s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.8}


Processing queries:  34%|███▍      | 102/300 [06:54<11:27,  3.47s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.6}


Processing queries:  34%|███▍      | 103/300 [06:58<11:22,  3.47s/it]

{'Faithfulness': 0.9876, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1969221590285716, 'Overall Score': 0.9962799999999999}


Processing queries:  35%|███▍      | 104/300 [07:02<12:09,  3.72s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9, 'Factual Correctness': 0.875, 'BLEU Score': 0.5993418090318327, 'Overall Score': 0.91628}


Processing queries:  35%|███▌      | 105/300 [07:06<12:15,  3.77s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.025098621243978974, 'Overall Score': 1.0}


Processing queries:  35%|███▌      | 106/300 [07:08<10:34,  3.27s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.316227766016838, 'Overall Score': 0.7}


Processing queries:  36%|███▌      | 107/300 [07:12<11:46,  3.66s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.0031845144858397693, 'Overall Score': 0.44}


Processing queries:  36%|███▌      | 108/300 [07:19<14:31,  4.54s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.003330819561259882, 'Overall Score': 0.0}


Processing queries:  36%|███▋      | 109/300 [07:22<13:23,  4.21s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.9876, 'BLEU Score': 0.5264663902174495, 'Overall Score': 0.99504}


Processing queries:  37%|███▋      | 110/300 [07:25<11:21,  3.59s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.00033492903248398575, 'Overall Score': 0.3}


Processing queries:  37%|███▋      | 111/300 [07:29<11:52,  3.77s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.04467395935842264, 'Overall Score': 0.80928}


Processing queries:  37%|███▋      | 112/300 [07:32<11:00,  3.51s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.49}


Processing queries:  38%|███▊      | 113/300 [07:36<11:14,  3.61s/it]

{'Faithfulness': 0.8912, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.04037102236950707, 'Overall Score': 0.67736}


Processing queries:  38%|███▊      | 114/300 [07:43<15:10,  4.90s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.008508474131867486, 'Overall Score': 0.0}


Processing queries:  38%|███▊      | 115/300 [07:47<13:56,  4.52s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.92, 'Factual Correctness': 0.9876, 'BLEU Score': 0.040071098820477234, 'Overall Score': 0.9402300000000001}


Processing queries:  39%|███▊      | 116/300 [07:55<17:25,  5.68s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.09622710741211887, 'Overall Score': 0.5900000000000001}


Processing queries:  39%|███▉      | 117/300 [08:00<15:57,  5.23s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.012360545410216943, 'Overall Score': 0.44}


Processing queries:  39%|███▉      | 118/300 [08:08<18:19,  6.04s/it]

{'Faithfulness': 0.8912, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.06978065245536882, 'Overall Score': 0.8073600000000001}


Processing queries:  40%|███▉      | 119/300 [08:11<16:03,  5.32s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.1808713155931068, 'Overall Score': 0.8900000000000001}


Processing queries:  40%|████      | 120/300 [08:15<15:00,  5.00s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.021919222940181927, 'Overall Score': 0.44}


Processing queries:  40%|████      | 121/300 [08:19<13:22,  4.48s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.5, 'BLEU Score': 0.2922455022158055, 'Overall Score': 0.77}


Processing queries:  41%|████      | 122/300 [08:22<12:20,  4.16s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


Processing queries:  41%|████      | 123/300 [08:28<13:35,  4.61s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.7012055133086459, 'Overall Score': 1.0}


Processing queries:  41%|████▏     | 124/300 [08:31<11:52,  4.05s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.5462816013751063, 'Overall Score': 0.9400000000000001}


Processing queries:  42%|████▏     | 125/300 [08:33<10:31,  3.61s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 1.0}


Processing queries:  42%|████▏     | 126/300 [08:36<10:12,  3.52s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9, 'Factual Correctness': 0.5, 'BLEU Score': 0.18564899566656387, 'Overall Score': 0.7662800000000001}


Processing queries:  42%|████▏     | 127/300 [08:39<09:26,  3.28s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.06501220695060562, 'Overall Score': 0.74}


Processing queries:  43%|████▎     | 128/300 [08:43<09:45,  3.40s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.6}


Processing queries:  43%|████▎     | 129/300 [08:46<09:13,  3.24s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.007221283582336468, 'Overall Score': 0.5900000000000001}


Processing queries:  43%|████▎     | 130/300 [08:48<08:14,  2.91s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 0.49}


Processing queries:  44%|████▎     | 131/300 [08:56<12:36,  4.47s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


Processing queries:  44%|████▍     | 132/300 [08:59<11:06,  3.97s/it]

{'Faithfulness': 0.9876, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.33248794444673613, 'Overall Score': 0.7962799999999999}


Processing queries:  44%|████▍     | 133/300 [09:02<10:34,  3.80s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.3}


Processing queries:  45%|████▍     | 134/300 [09:04<09:04,  3.28s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1495348781221221, 'Overall Score': 1.0}


Processing queries:  45%|████▌     | 135/300 [09:08<09:25,  3.43s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.9234, 'BLEU Score': 0.2920279702261309, 'Overall Score': 0.9656399999999999}


Processing queries:  45%|████▌     | 136/300 [09:16<13:00,  4.76s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.25285586062277393, 'Overall Score': 0.93628}


Processing queries:  46%|████▌     | 137/300 [09:21<13:31,  4.98s/it]

{'Faithfulness': 0.8932, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.009240248103148035, 'Overall Score': 0.6779599999999999}


Processing queries:  46%|████▌     | 138/300 [09:27<13:48,  5.11s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.9, 'Factual Correctness': 0.5, 'BLEU Score': 0.07315986666459885, 'Overall Score': 0.73919}


Processing queries:  46%|████▋     | 139/300 [09:29<11:36,  4.33s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.49}


Processing queries:  47%|████▋     | 140/300 [09:32<10:35,  3.97s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1472821272412462, 'Overall Score': 1.0}


Processing queries:  47%|████▋     | 141/300 [09:40<12:59,  4.90s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.0076869318629609945, 'Overall Score': 0.5}


Processing queries:  47%|████▋     | 142/300 [09:44<12:34,  4.78s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.012138611630770825, 'Overall Score': 0.44}


Processing queries:  48%|████▊     | 143/300 [09:47<11:20,  4.34s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.18373344524824237, 'Overall Score': 0.8900000000000001}


Processing queries:  48%|████▊     | 144/300 [09:50<10:19,  3.97s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 1.0}


Processing queries:  48%|████▊     | 145/300 [09:53<09:25,  3.65s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.053463162573637084, 'Overall Score': 0.71}


Processing queries:  49%|████▊     | 146/300 [10:02<12:58,  5.06s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.01711202977485925, 'Overall Score': 0.29000000000000004}


Processing queries:  49%|████▉     | 147/300 [10:13<17:45,  6.96s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.08535036258472029, 'Overall Score': 0.5}


Processing queries:  49%|████▉     | 148/300 [10:16<14:51,  5.87s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.02284411468856457, 'Overall Score': 0.74}


Processing queries:  50%|████▉     | 149/300 [10:21<14:00,  5.56s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


Processing queries:  50%|█████     | 150/300 [10:24<11:51,  4.74s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.6}


Processing queries:  50%|█████     | 151/300 [10:27<10:08,  4.08s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 1.0, 'BLEU Score': 0.11362193664674995, 'Overall Score': 0.55}


Processing queries:  51%|█████     | 152/300 [10:31<10:17,  4.17s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.0, 'BLEU Score': 0.011577941920572241, 'Overall Score': 0.39}


Processing queries:  51%|█████     | 153/300 [10:35<10:01,  4.09s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.4371707208826036, 'Overall Score': 0.8}


Processing queries:  51%|█████▏    | 154/300 [10:39<09:59,  4.11s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.32494178152665515, 'Overall Score': 0.70628}


Processing queries:  52%|█████▏    | 155/300 [10:44<10:13,  4.23s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0032047574588556275, 'Overall Score': 0.0}


Processing queries:  52%|█████▏    | 156/300 [10:48<10:07,  4.22s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.0824186035805035, 'Overall Score': 0.80928}


Processing queries:  52%|█████▏    | 157/300 [10:51<09:26,  3.96s/it]

{'Faithfulness': 0.75, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.05692994482485313, 'Overall Score': 0.665}


Processing queries:  53%|█████▎    | 158/300 [10:55<09:16,  3.92s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.6}


Processing queries:  53%|█████▎    | 159/300 [10:58<08:52,  3.78s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.43350906413906354, 'Overall Score': 0.8}


Processing queries:  53%|█████▎    | 160/300 [11:03<09:39,  4.14s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.0099458088695389, 'Overall Score': 0.8091900000000001}


Processing queries:  54%|█████▎    | 161/300 [11:06<08:33,  3.69s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.017033186037639283, 'Overall Score': 1.0}


Processing queries:  54%|█████▍    | 162/300 [11:14<11:07,  4.83s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0674288113080084, 'Overall Score': 0.29628}


Processing queries:  54%|█████▍    | 163/300 [11:16<09:45,  4.27s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.9876, 'BLEU Score': 0.3525448367799153, 'Overall Score': 0.99504}


Processing queries:  55%|█████▍    | 164/300 [11:20<09:17,  4.10s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.875, 'BLEU Score': 0.4496214833476127, 'Overall Score': 0.8}


Processing queries:  55%|█████▌    | 165/300 [11:24<08:44,  3.88s/it]

{'Faithfulness': 0.897, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.03058760346458022, 'Overall Score': 0.7690999999999999}


Processing queries:  55%|█████▌    | 166/300 [11:33<12:42,  5.69s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.2, 'Factual Correctness': 0.5, 'BLEU Score': 0.0016264282094567788, 'Overall Score': 0.41000000000000003}


Processing queries:  56%|█████▌    | 167/300 [11:36<10:22,  4.68s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 1.0}


Processing queries:  56%|█████▌    | 168/300 [11:39<09:12,  4.18s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.37239098949398236, 'Overall Score': 0.8400000000000001}


Processing queries:  56%|█████▋    | 169/300 [11:44<09:55,  4.55s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.9876, 'BLEU Score': 0.4213919805847572, 'Overall Score': 0.9650400000000001}


Processing queries:  57%|█████▋    | 170/300 [11:47<08:28,  3.91s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.1778279410038923, 'Overall Score': 0.4}


Processing queries:  57%|█████▋    | 171/300 [11:51<08:37,  4.01s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.7, 'Factual Correctness': 0.0, 'BLEU Score': 0.005355924989097696, 'Overall Score': 0.47919}


Processing queries:  57%|█████▋    | 172/300 [11:54<07:48,  3.66s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.06985342056580097, 'Overall Score': 1.0}


Processing queries:  58%|█████▊    | 173/300 [11:59<08:28,  4.01s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.0026744824235784732, 'Overall Score': 0.44}


Processing queries:  58%|█████▊    | 174/300 [12:03<08:34,  4.08s/it]

{'Faithfulness': 0.789, 'Relevancy': 0.3, 'Factual Correctness': 0.0, 'BLEU Score': 0.00606212098164554, 'Overall Score': 0.3267}


Processing queries:  58%|█████▊    | 175/300 [12:05<07:28,  3.58s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 0.0, 'BLEU Score': 0.316227766016838, 'Overall Score': 0.15}


Processing queries:  59%|█████▊    | 176/300 [12:12<09:17,  4.50s/it]

{'Faithfulness': 0.897, 'Relevancy': 0.9, 'Factual Correctness': 0.9876, 'BLEU Score': 0.23884513521354037, 'Overall Score': 0.9341400000000001}


Processing queries:  59%|█████▉    | 177/300 [12:15<08:19,  4.06s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.019383418023456653, 'Overall Score': 0.5}


Processing queries:  59%|█████▉    | 178/300 [12:18<07:32,  3.71s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.21258710573693182, 'Overall Score': 0.8}


Processing queries:  60%|█████▉    | 179/300 [12:24<08:43,  4.33s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.22548532630539453, 'Overall Score': 0.73628}


Processing queries:  60%|██████    | 180/300 [12:26<07:47,  3.89s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.1073525213116119, 'Overall Score': 0.3}


Processing queries:  60%|██████    | 181/300 [12:30<07:27,  3.76s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.2, 'Factual Correctness': 0.0, 'BLEU Score': 0.015703799285850036, 'Overall Score': 0.21}


Processing queries:  61%|██████    | 182/300 [12:32<06:40,  3.40s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.316227766016838, 'Overall Score': 1.0}


Processing queries:  61%|██████    | 183/300 [12:37<07:16,  3.73s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.0, 'BLEU Score': 0.010511846841633778, 'Overall Score': 0.54}


Processing queries:  61%|██████▏   | 184/300 [12:42<07:44,  4.01s/it]

{'Faithfulness': 0.897, 'Relevancy': 0.7, 'Factual Correctness': 0.75, 'BLEU Score': 0.07692375026049747, 'Overall Score': 0.7791}


Processing queries:  62%|██████▏   | 185/300 [12:50<10:20,  5.39s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.7551475759603, 'Overall Score': 0.9400000000000001}


Processing queries:  62%|██████▏   | 186/300 [12:52<08:25,  4.44s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 5.965462840365855e-05, 'Overall Score': 0.0}


Processing queries:  62%|██████▏   | 187/300 [12:58<08:43,  4.64s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.85}


Processing queries:  63%|██████▎   | 188/300 [13:00<07:42,  4.13s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.79}


Processing queries:  63%|██████▎   | 189/300 [13:03<06:48,  3.68s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.29000000000000004}


Processing queries:  63%|██████▎   | 190/300 [13:07<06:37,  3.61s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.15415064977510756, 'Overall Score': 0.8}


Processing queries:  64%|██████▎   | 191/300 [13:11<06:50,  3.76s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.8976, 'BLEU Score': 0.2189425411240199, 'Overall Score': 0.89532}


Processing queries:  64%|██████▍   | 192/300 [13:17<07:55,  4.40s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.022229849552064022, 'Overall Score': 0.7925599999999999}


Processing queries:  64%|██████▍   | 193/300 [13:21<07:48,  4.38s/it]

{'Faithfulness': 0.8973, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.1667112120846934, 'Overall Score': 0.67919}


Processing queries:  65%|██████▍   | 194/300 [13:26<07:59,  4.52s/it]

{'Faithfulness': 0.9876, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.15435522972060658, 'Overall Score': 0.9962799999999999}


Processing queries:  65%|██████▌   | 195/300 [13:33<09:33,  5.46s/it]

{'Faithfulness': 0.897, 'Relevancy': 0.9876, 'Factual Correctness': 0.75, 'BLEU Score': 0.05573104658939866, 'Overall Score': 0.86538}


Processing queries:  65%|██████▌   | 196/300 [13:37<08:21,  4.82s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.3423, 'BLEU Score': 0.010598507277493988, 'Overall Score': 0.37692000000000003}


Processing queries:  66%|██████▌   | 197/300 [13:39<06:56,  4.05s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.2, 'Factual Correctness': 0.5, 'BLEU Score': 0.20687381245863395, 'Overall Score': 0.26}


Processing queries:  66%|██████▌   | 198/300 [13:42<06:25,  3.78s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.9876, 'BLEU Score': 0.16306686105372709, 'Overall Score': 0.99504}


Processing queries:  66%|██████▋   | 199/300 [13:46<06:16,  3.72s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


Processing queries:  67%|██████▋   | 200/300 [13:50<06:38,  3.99s/it]

{'Faithfulness': 0.7, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.009524331129825377, 'Overall Score': 0.62}


Processing queries:  67%|██████▋   | 201/300 [13:53<05:59,  3.63s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.7, 'Factual Correctness': 1.0, 'BLEU Score': 0.1889283697713974, 'Overall Score': 0.76}


Processing queries:  67%|██████▋   | 202/300 [13:56<05:45,  3.53s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.3}


Processing queries:  68%|██████▊   | 203/300 [14:00<05:32,  3.43s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.021105340631872645, 'Overall Score': 0.5}


Processing queries:  68%|██████▊   | 204/300 [14:03<05:35,  3.50s/it]

{'Faithfulness': 0.897, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.026012784404037925, 'Overall Score': 0.7091000000000001}


Processing queries:  68%|██████▊   | 205/300 [14:07<05:40,  3.58s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.875, 'BLEU Score': 0.4018215766366551, 'Overall Score': 0.95}


Processing queries:  69%|██████▊   | 206/300 [14:10<05:19,  3.39s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.7012055133086459, 'Overall Score': 0.74}


Processing queries:  69%|██████▉   | 207/300 [14:13<05:17,  3.41s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.24028114141347542, 'Overall Score': 1.0}


Processing queries:  69%|██████▉   | 208/300 [14:17<05:28,  3.57s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.875, 'BLEU Score': 0.01851974703879588, 'Overall Score': 0.88628}


Processing queries:  70%|██████▉   | 209/300 [14:22<05:41,  3.76s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.003324612889947367, 'Overall Score': 0.56}


Processing queries:  70%|███████   | 210/300 [14:24<05:11,  3.46s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.02777619034011792, 'Overall Score': 0.6}


Processing queries:  70%|███████   | 211/300 [14:27<04:52,  3.28s/it]

{'Faithfulness': 0.897, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.7690999999999999}


Processing queries:  71%|███████   | 212/300 [14:31<04:56,  3.37s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.004503778123700044, 'Overall Score': 0.0}


Processing queries:  71%|███████   | 213/300 [14:34<04:57,  3.42s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.0, 'BLEU Score': 0.10145249973866254, 'Overall Score': 0.54}


Processing queries:  71%|███████▏  | 214/300 [14:40<05:59,  4.18s/it]

{'Faithfulness': 0.75, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.6028817681965138, 'Overall Score': 0.7249999999999999}


Processing queries:  72%|███████▏  | 215/300 [14:48<07:16,  5.13s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.0067755865182900004, 'Overall Score': 0.44}


Processing queries:  72%|███████▏  | 216/300 [14:51<06:33,  4.69s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.21279882820441612, 'Overall Score': 1.0}


Processing queries:  72%|███████▏  | 217/300 [14:55<06:08,  4.44s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.02768274100926734, 'Overall Score': 0.5}


Processing queries:  73%|███████▎  | 218/300 [14:58<05:18,  3.88s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.3976353643835253, 'Overall Score': 0.3}


Processing queries:  73%|███████▎  | 219/300 [15:04<06:05,  4.51s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.34426631072695274, 'Overall Score': 0.7092799999999999}


Processing queries:  73%|███████▎  | 220/300 [15:07<05:24,  4.06s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.6162607099729586, 'Overall Score': 0.79}


Processing queries:  74%|███████▎  | 221/300 [15:10<04:55,  3.73s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.7941054938007994, 'Overall Score': 0.9400000000000001}


Processing queries:  74%|███████▍  | 222/300 [15:13<04:30,  3.47s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.666, 'BLEU Score': 0.7245133511861958, 'Overall Score': 0.8064}


Processing queries:  74%|███████▍  | 223/300 [15:17<05:01,  3.92s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.3713984361467897, 'Overall Score': 0.7962799999999999}


Processing queries:  75%|███████▍  | 224/300 [15:20<04:30,  3.56s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.03986357128268015, 'Overall Score': 0.6}


Processing queries:  75%|███████▌  | 225/300 [15:26<05:09,  4.13s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 1.0, 'BLEU Score': 0.1909876274879096, 'Overall Score': 0.91}


Processing queries:  75%|███████▌  | 226/300 [15:29<04:47,  3.89s/it]

{'Faithfulness': 0.8974, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.393755531055134, 'Overall Score': 0.96922}


Processing queries:  76%|███████▌  | 227/300 [15:31<04:06,  3.38s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.3004843884984905, 'Overall Score': 0.5}


Processing queries:  76%|███████▌  | 228/300 [15:35<04:05,  3.41s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.762465858623486, 'Overall Score': 0.9400000000000001}


Processing queries:  76%|███████▋  | 229/300 [15:38<03:53,  3.29s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.4832697830906221, 'Overall Score': 1.0}


Processing queries:  77%|███████▋  | 230/300 [15:40<03:35,  3.08s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.02777619034011792, 'Overall Score': 0.8}


Processing queries:  77%|███████▋  | 231/300 [15:45<04:03,  3.53s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.0, 'BLEU Score': 0.005177502946366811, 'Overall Score': 0.3}


Processing queries:  77%|███████▋  | 232/300 [15:50<04:35,  4.06s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.2952925422521023, 'Overall Score': 0.9350400000000001}


Processing queries:  78%|███████▊  | 233/300 [15:56<05:02,  4.52s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.008231215992636891, 'Overall Score': 0.5}


Processing queries:  78%|███████▊  | 234/300 [16:06<06:44,  6.13s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.12560897767261706, 'Overall Score': 0.15}


Processing queries:  78%|███████▊  | 235/300 [16:10<05:55,  5.47s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.75, 'BLEU Score': 0.44582686998428134, 'Overall Score': 0.6900000000000001}


Processing queries:  79%|███████▊  | 236/300 [16:14<05:25,  5.09s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.020255986027125642, 'Overall Score': 0.44999999999999996}


Processing queries:  79%|███████▉  | 237/300 [16:20<05:51,  5.58s/it]

{'Faithfulness': 0.3423, 'Relevancy': 0.3, 'Factual Correctness': 0.5, 'BLEU Score': 0.05613759912173371, 'Overall Score': 0.39269}


Processing queries:  79%|███████▉  | 238/300 [16:24<05:12,  5.05s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


Processing queries:  80%|███████▉  | 239/300 [16:27<04:25,  4.35s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.4324227075463215, 'Overall Score': 1.0}


Processing queries:  80%|████████  | 240/300 [16:29<03:41,  3.69s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.3, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.09}


Processing queries:  80%|████████  | 241/300 [16:34<04:00,  4.07s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.3537177643681064, 'Overall Score': 0.9350400000000001}


Processing queries:  81%|████████  | 242/300 [16:40<04:20,  4.49s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.00318051352706924, 'Overall Score': 0.0}


Processing queries:  81%|████████  | 243/300 [16:43<04:02,  4.26s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.004089349842971759, 'Overall Score': 0.0}


Processing queries:  81%|████████▏ | 244/300 [16:46<03:31,  3.78s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0, 'Overall Score': 0.7}


Processing queries:  82%|████████▏ | 245/300 [16:49<03:19,  3.62s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 1.0}


Processing queries:  82%|████████▏ | 246/300 [16:56<04:13,  4.69s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.002062080067194881, 'Overall Score': 0.5}


Processing queries:  82%|████████▏ | 247/300 [16:59<03:33,  4.03s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.85}


Processing queries:  83%|████████▎ | 248/300 [17:02<03:20,  3.86s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.875, 'BLEU Score': 0.24671763489589052, 'Overall Score': 0.9200000000000002}


Processing queries:  83%|████████▎ | 249/300 [17:08<03:39,  4.31s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.04620256681003074, 'Overall Score': 0.71}


Processing queries:  83%|████████▎ | 250/300 [17:11<03:21,  4.03s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.9, 'Factual Correctness': 1.0, 'BLEU Score': 0.47022156411316, 'Overall Score': 0.96628}


Processing queries:  84%|████████▎ | 251/300 [17:17<03:49,  4.69s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.006355066492354449, 'Overall Score': 0.6499999999999999}


Processing queries:  84%|████████▍ | 252/300 [17:20<03:12,  4.00s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.24028114141347542, 'Overall Score': 0.44999999999999996}


Processing queries:  84%|████████▍ | 253/300 [17:24<03:08,  4.01s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.9, 'Factual Correctness': 0.9234, 'BLEU Score': 0.4432666712748559, 'Overall Score': 0.9393600000000001}


Processing queries:  85%|████████▍ | 254/300 [17:28<03:07,  4.08s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0038253628844009133, 'Overall Score': 0.0}


Processing queries:  85%|████████▌ | 255/300 [17:31<02:46,  3.70s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.44999999999999996}


Processing queries:  85%|████████▌ | 256/300 [17:34<02:41,  3.67s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 1.0}


Processing queries:  86%|████████▌ | 257/300 [17:38<02:30,  3.50s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0446323613785333, 'Overall Score': 0.6}


Processing queries:  86%|████████▌ | 258/300 [17:45<03:10,  4.55s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 1.0, 'BLEU Score': 0.6165255292124369, 'Overall Score': 0.9400000000000001}


Processing queries:  86%|████████▋ | 259/300 [17:55<04:19,  6.34s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.027239426741461154, 'Overall Score': 0.56}


Processing queries:  87%|████████▋ | 260/300 [18:02<04:26,  6.66s/it]

{'Faithfulness': 0.8974, 'Relevancy': 0.9876, 'Factual Correctness': 0.5, 'BLEU Score': 0.3850322886878713, 'Overall Score': 0.7654999999999998}


Processing queries:  87%|████████▋ | 261/300 [18:09<04:20,  6.68s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.006243702571093031, 'Overall Score': 0.0}


Processing queries:  87%|████████▋ | 262/300 [18:17<04:26,  7.01s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.2, 'Factual Correctness': 0.0, 'BLEU Score': 0.05975702444514504, 'Overall Score': 0.06}


Processing queries:  88%|████████▊ | 263/300 [18:20<03:38,  5.90s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.5900000000000001}


Processing queries:  88%|████████▊ | 264/300 [18:26<03:31,  5.87s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.0031848938480197775, 'Overall Score': 0.5}


Processing queries:  88%|████████▊ | 265/300 [18:28<02:49,  4.84s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.11362193664674995, 'Overall Score': 0.7}


Processing queries:  89%|████████▊ | 266/300 [18:33<02:44,  4.85s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.009999377369656478, 'Overall Score': 0.0}


Processing queries:  89%|████████▉ | 267/300 [18:36<02:22,  4.33s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


Processing queries:  89%|████████▉ | 268/300 [18:40<02:08,  4.01s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.008282282660969604, 'Overall Score': 0.6499999999999999}


Processing queries:  90%|████████▉ | 269/300 [18:43<01:54,  3.70s/it]

{'Faithfulness': 0.3423, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.60269}


Processing queries:  90%|█████████ | 270/300 [18:48<02:04,  4.14s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0.11054202892156728, 'Overall Score': 0.35}


Processing queries:  90%|█████████ | 271/300 [18:51<01:55,  3.98s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.3791153816091294, 'Overall Score': 1.0}


Processing queries:  91%|█████████ | 272/300 [18:55<01:44,  3.74s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 0.5, 'BLEU Score': 0, 'Overall Score': 0.5}


Processing queries:  91%|█████████ | 273/300 [18:59<01:46,  3.95s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.5, 'BLEU Score': 0.20218647645300866, 'Overall Score': 0.6499999999999999}


Processing queries:  91%|█████████▏| 274/300 [19:02<01:31,  3.52s/it]

{'Faithfulness': 0.5, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.04939382737115371, 'Overall Score': 0.85}


Processing queries:  92%|█████████▏| 275/300 [19:05<01:26,  3.46s/it]

{'Faithfulness': 0.75, 'Relevancy': 1.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.024218026052883736, 'Overall Score': 0.5249999999999999}


Processing queries:  92%|█████████▏| 276/300 [19:08<01:16,  3.21s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.018850319022747353, 'Overall Score': 0.8}


Processing queries:  92%|█████████▏| 277/300 [19:10<01:09,  3.01s/it]

{'Faithfulness': 0.9876, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.017033186037639283, 'Overall Score': 0.9962799999999999}


Processing queries:  93%|█████████▎| 278/300 [19:12<01:01,  2.77s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.05501080739920602, 'Overall Score': 0.2}


Processing queries:  93%|█████████▎| 279/300 [19:15<00:58,  2.78s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.5, 'Factual Correctness': 1.0, 'BLEU Score': 0.8931539818068694, 'Overall Score': 0.7}


Processing queries:  93%|█████████▎| 280/300 [19:18<00:57,  2.89s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.009795841373797659, 'Overall Score': 0.0}


Processing queries:  94%|█████████▎| 281/300 [19:22<01:00,  3.16s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0, 'Overall Score': 0.0}


Processing queries:  94%|█████████▍| 282/300 [19:24<00:51,  2.87s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.015744053406497194, 'Overall Score': 0.7}


Processing queries:  94%|█████████▍| 283/300 [19:29<00:59,  3.53s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 0.0, 'BLEU Score': 0.0029580435428915677, 'Overall Score': 0.0}


Processing queries:  95%|█████████▍| 284/300 [19:33<00:55,  3.48s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.06541868941754436, 'Overall Score': 0.8}


Processing queries:  95%|█████████▌| 285/300 [19:35<00:46,  3.11s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 1.0}


Processing queries:  95%|█████████▌| 286/300 [19:39<00:46,  3.31s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.2971567888130264, 'Overall Score': 0.93132}


Processing queries:  96%|█████████▌| 287/300 [19:42<00:41,  3.16s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.0627251733901403, 'Overall Score': 0.8}


Processing queries:  96%|█████████▌| 288/300 [19:46<00:41,  3.42s/it]

{'Faithfulness': 0.897, 'Relevancy': 0.9, 'Factual Correctness': 0.75, 'BLEU Score': 0.1962709156038045, 'Overall Score': 0.8391000000000001}


Processing queries:  96%|█████████▋| 289/300 [19:48<00:35,  3.20s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.8408964152537145, 'Overall Score': 0.7}


Processing queries:  97%|█████████▋| 290/300 [19:51<00:29,  2.95s/it]

{'Faithfulness': 0.0, 'Relevancy': 1.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.5623413251903491, 'Overall Score': 0.7}


Processing queries:  97%|█████████▋| 291/300 [19:55<00:30,  3.36s/it]

{'Faithfulness': 0.8, 'Relevancy': 0.7, 'Factual Correctness': 0.8, 'BLEU Score': 0.002989232719814957, 'Overall Score': 0.77}


Processing queries:  97%|█████████▋| 292/300 [19:58<00:26,  3.28s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.17395797375642233, 'Overall Score': 0.74}


Processing queries:  98%|█████████▊| 293/300 [20:01<00:22,  3.23s/it]

{'Faithfulness': 0.9876, 'Relevancy': 0.98, 'Factual Correctness': 0.875, 'BLEU Score': 0.3679682596057247, 'Overall Score': 0.94028}


Processing queries:  98%|█████████▊| 294/300 [20:08<00:26,  4.41s/it]

{'Faithfulness': 0.5, 'Relevancy': 0.3, 'Factual Correctness': 1.0, 'BLEU Score': 1.0, 'Overall Score': 0.64}


Processing queries:  98%|█████████▊| 295/300 [20:12<00:21,  4.21s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.0446323613785333, 'Overall Score': 0.8}


Processing queries:  99%|█████████▊| 296/300 [20:15<00:15,  3.81s/it]

{'Faithfulness': 1.0, 'Relevancy': 1.0, 'Factual Correctness': 0.5, 'BLEU Score': 0.4324227075463215, 'Overall Score': 0.8}


Processing queries:  99%|█████████▉| 297/300 [20:17<00:10,  3.39s/it]

{'Faithfulness': 0.0, 'Relevancy': 0.0, 'Factual Correctness': 1.0, 'BLEU Score': 0.316227766016838, 'Overall Score': 0.4}


Processing queries:  99%|█████████▉| 298/300 [20:21<00:06,  3.47s/it]

{'Faithfulness': 0.8976, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.40185699044723144, 'Overall Score': 0.90432}


Processing queries: 100%|█████████▉| 299/300 [20:26<00:03,  3.82s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.5, 'BLEU Score': 0.42436787934785464, 'Overall Score': 0.74}


Processing queries: 100%|██████████| 300/300 [20:30<00:00,  4.10s/it]

{'Faithfulness': 1.0, 'Relevancy': 0.8, 'Factual Correctness': 0.9876, 'BLEU Score': 0.23770841795864292, 'Overall Score': 0.9350400000000001}





In [21]:
# Average every evaluation metric over all entries in `scores`.
metric_keys = ('Faithfulness', 'Relevancy', 'Factual Correctness', 'BLEU Score', 'Overall Score')

# Running sum per metric, accumulated in the order the entries appear in `scores`.
totals = dict.fromkeys(metric_keys, 0)
for score in scores:
    for key in metric_keys:
        totals[key] += score[key]

# Divide each sum by the number of entries to obtain the mean; the unpacking
# order follows `metric_keys`.
n_scores = len(scores)
faithfullness, relevance, factual, bleu, overall = (
    totals[key] / n_scores for key in metric_keys
)

In [22]:
# Report the per-metric means held in the five averaged variables.
print(f"Faithfullness: {faithfullness}")
print(f"Relevance: {relevance}")
print(f"Factual Correctness: {factual}")
print(f"BLEU Score: {bleu}")
print(f"Overall Score: {overall}")

# Reference values kept from a recorded run. They were previously stored in a
# bare triple-quoted string, which is an expression statement rather than a
# comment: as the last statement of the cell it is echoed as the cell's Out[]
# value, duplicating the printed report.
#   Faithfullness: 0.6501929999999998
#   Relevance: 0.6873533333333337
#   Factual Correctness: 0.5776309999999997
#   BLEU Score: 0.18773455804204672
#   Overall Score: 0.6323162999999997

Faithfullness: 0.6501929999999998
Relevance: 0.6873533333333337
Factual Correctness: 0.5776309999999997
BLEU Score: 0.18773455804204672
Overall Score: 0.6323162999999997
