### Load ground truth data

In [1]:
import pandas as pd

# Read the ground-truth records, then keep a 100-row sample.
# The fixed random_state makes the sample the same on every run.
ground_truth_df = pd.read_json('evaluation_ground_truth.json')
ground_truth_df = ground_truth_df.sample(n=100, random_state=1)
ground_truth_df

Unnamed: 0,ground_truth_faq_id,generated_question,ground_truth_question,ground_truth_answer,ground_truth_courier
48,9,Are there any exceptions to the notice period ...,What is the notice period for terminating my e...,Your notice period is specified in your employ...,"{'index': 2, 'first_name': 'Olivia', 'last_nam..."
1601,506,How can I contact support if I have issues dur...,What if my phone battery dies mid-delivery?,Charge your phone as quickly as possible. If t...,"{'index': 1, 'first_name': 'Liam', 'last_name'..."
1520,459,What should I do if a customer claims their or...,How do I handle a customer who didn't receive ...,Advise the customer to contact iDelivery's cus...,"{'index': 0, 'first_name': 'Emma', 'last_name'..."
1201,325,What are my rights regarding declining an order?,Can I refuse an order if the weather is bad?,"Yes, you have the right to decline an order if...","{'index': 1, 'first_name': 'Liam', 'last_name'..."
309,61,Do part-time and full-time employees have the ...,What are my rights as a part-time employee?,Part-time employees have the same rights as fu...,"{'index': 1, 'first_name': 'Liam', 'last_name'..."
...,...,...,...,...,...
111,22,What types of invoices will iDelivery provide?,What is the process for submitting invoices to...,iDelivery will typically generate a weekly or ...,"{'index': 2, 'first_name': 'Olivia', 'last_nam..."
1004,253,Can I manage my time effectively while working...,Can I use a second app at the same time?,"Yes, you can be active on multiple platforms. ...","{'index': 2, 'first_name': 'Olivia', 'last_nam..."
904,222,Is a driver's license necessary for my positio...,Do I need a special license to be a courier?,You need a valid driver's license for your veh...,"{'index': 0, 'first_name': 'Emma', 'last_name'..."
1399,396,What action is required after the waiting peri...,A customer is not home. What happens to the food?,"The app will guide you through the process, wh...","{'index': 1, 'first_name': 'Liam', 'last_name'..."


### Get LLM answers on generated questions

In [2]:
from qdrant_client import QdrantClient, models

# Qdrant client; assumes a Qdrant server is reachable at this local URL.
qd_client = QdrantClient("http://localhost:6333")
# NOTE(review): not referenced in the visible cells — presumably the vector size
# of the embedding model below; confirm before relying on it.
EMBEDDING_DIMENSIONALITY = 512
# Model name passed to models.Document when vector_search embeds the query text.
model_handle = "jinaai/jina-embeddings-v2-small-en"
# Name of the collection that vector_search queries.
collection_name = "courier-faq"

def vector_search(question, country, limit=5):
    """Return the payloads of the FAQ entries closest to `question`.

    The query text is sent as a `models.Document` together with `model_handle`,
    and the search is restricted to points whose `country` payload field equals
    either `country` or the literal "all".

    Args:
        question: Free-text question to search for.
        country: Country value used in the payload filter (e.g. 'DE').
        limit: Maximum number of hits to return. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        A list with the payload of each returned point, in the order Qdrant
        returned them.
    """
    country_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="country",
                # "all" keeps entries that are not tied to a specific country.
                match=models.MatchAny(any=[country, "all"]),
            )
        ]
    )

    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=country_filter,
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in query_points.points]

# Quick manual check of the retrieval step: the returned payloads are shown as the cell output.
vector_search("Can I reject orders?", 'DE')


[{'country': 'all',
  'question': 'Can I refuse an order?',
  'answer': 'Yes, you can decline an order. However, having a high acceptance rate can lead to more opportunities, and a very low rate may be reviewed by the company.'},
 {'country': 'all',
  'question': 'Can I refuse an order if the weather is bad?',
  'answer': 'Yes, you have the right to decline an order if you feel unsafe due to weather conditions. Your safety is a priority.'},
 {'country': 'all',
  'question': 'Can I refuse an order if the weather is bad?',
  'answer': 'Yes, you have the right to decline an order if you feel unsafe due to weather conditions. Your safety is a priority.'},
 {'country': 'all',
  'question': 'Can I refuse an order if the weather is bad?',
  'answer': 'Yes, you have the right to decline an order if you feel unsafe due to weather conditions. Your safety is a priority.'},
 {'country': 'all',
  'question': 'Can I refuse an order if the weather is bad?',
  'answer': 'Yes, you have the right to dec

In [3]:
from openai import OpenAI
import keys_secret

# OpenAI client shared by the LLM helper functions; the API key is read from the
# local `keys_secret` module rather than being written in the notebook.
openai_client = OpenAI(api_key=keys_secret.openai_api_key)

def llm_aswer(prompt):
    """Send `prompt` as a single user message to 'gpt-4o-mini' and return the reply text.

    Returns the `content` of the first choice of the chat completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai_client.chat.completions.create(
        messages=[user_message],
        model='gpt-4o-mini',
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def build_prompt(question, search_results, courier):
    """Render the support-agent prompt for one courier question.

    Args:
        question: The courier's question, inserted after "QUESTION:".
        search_results: Iterable of dicts with 'country', 'question' and
            'answer' keys; each one becomes a three-line entry in the CONTEXT
            section, followed by a blank line.
        courier: Dict providing 'first_name', 'age', 'contract_type' and
            'vehicle_type' for the courier description line.

    Returns:
        The formatted prompt with leading/trailing whitespace stripped.
    """
    # The template is spelled out line by line so that the trailing spaces and
    # the whitespace-only line it contains stay visible in the source.
    template = "\n".join([
        "You are the courier suport agent of a iDelivery company that handles food delivery in Germany, Netherlands and UK. ",
        "The couriers working for this company are employees and freelancers. ",
        "",
        "Courier {courier_first_name} is {courier_age} years old, has a {courier_contract_type} working contract and uses a {courier_vehicle_type} for delivery.",
        "    ",
        "Answer the courier's QUESTION based on the CONTEXT from the FAQ database.",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {question}",
        "",
        "CONTEXT: ",
        "{context}",
    ])

    faq_entries = [
        f"country: {hit['country']}\nquestion: {hit['question']}\nanswer: {hit['answer']}\n\n"
        for hit in search_results
    ]

    fields = {
        "question": question,
        "context": "".join(faq_entries),
        "courier_first_name": courier['first_name'],
        "courier_age": courier['age'],
        "courier_contract_type": courier['contract_type'],
        "courier_vehicle_type": courier['vehicle_type'],
    }
    # The final strip removes the blank lines left after the last context entry.
    return template.format(**fields).strip()


In [4]:
from tqdm import tqdm

# Manual progress bar sized to the number of rows; get_llm_answer advances it with pbar.update(1).
pbar = tqdm(total=len(ground_truth_df)) 

def get_llm_answer(ground_truth):
    """Produce the LLM answer for one ground-truth row.

    Reads the row's 'generated_question' and 'ground_truth_courier', retrieves
    FAQ entries for the courier's country with vector_search, builds the prompt
    with build_prompt and sends it through llm_aswer. Advances the module-level
    progress bar `pbar` by one step before returning.

    Args:
        ground_truth: One row of the ground-truth frame (as passed by
            DataFrame.apply with axis=1).

    Returns:
        The answer text returned by llm_aswer.
    """
    generated_question = ground_truth['generated_question']
    courier_profile = ground_truth['ground_truth_courier']

    faq_hits = vector_search(generated_question, courier_profile['country'])
    answer_llm = llm_aswer(build_prompt(generated_question, faq_hits, courier_profile))

    pbar.update(1)
    return answer_llm
    

# One LLM call per row; get_llm_answer ticks `pbar` as it goes.
ground_truth_df['answer_llm'] = ground_truth_df.apply(get_llm_answer, axis=1)
# Close the bar once the run is finished: a bar left open makes tqdm draw any
# later bar on additional lines instead of updating a single line in place.
pbar.close()
ground_truth_df

100%|██████████████████████████████████████████████████████████████████| 100/100 [02:30<00:00,  1.70s/it]

Unnamed: 0,ground_truth_faq_id,generated_question,ground_truth_question,ground_truth_answer,ground_truth_courier,answer_llm
48,9,Are there any exceptions to the notice period ...,What is the notice period for terminating my e...,Your notice period is specified in your employ...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","As per your employment contract, the notice pe..."
1601,506,How can I contact support if I have issues dur...,What if my phone battery dies mid-delivery?,Charge your phone as quickly as possible. If t...,"{'index': 1, 'first_name': 'Liam', 'last_name'...","If you have issues during a delivery, you shou..."
1520,459,What should I do if a customer claims their or...,How do I handle a customer who didn't receive ...,Advise the customer to contact iDelivery's cus...,"{'index': 0, 'first_name': 'Emma', 'last_name'...",If a customer claims their order hasn't arrive...
1201,325,What are my rights regarding declining an order?,Can I refuse an order if the weather is bad?,"Yes, you have the right to decline an order if...","{'index': 1, 'first_name': 'Liam', 'last_name'...","As a freelancer with iDelivery, you have the r..."
309,61,Do part-time and full-time employees have the ...,What are my rights as a part-time employee?,Part-time employees have the same rights as fu...,"{'index': 1, 'first_name': 'Liam', 'last_name'...","Yes, part-time employees have the same rights ..."
...,...,...,...,...,...,...
111,22,What types of invoices will iDelivery provide?,What is the process for submitting invoices to...,iDelivery will typically generate a weekly or ...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","As a courier with an employment contract, iDel..."
1004,253,Can I manage my time effectively while working...,Can I use a second app at the same time?,"Yes, you can be active on multiple platforms. ...","{'index': 2, 'first_name': 'Olivia', 'last_nam...","Yes, you can be active on multiple platforms w..."
904,222,Is a driver's license necessary for my positio...,Do I need a special license to be a courier?,You need a valid driver's license for your veh...,"{'index': 0, 'first_name': 'Emma', 'last_name'...",As an employee courier using a bike for delive...
1399,396,What action is required after the waiting peri...,A customer is not home. What happens to the food?,"The app will guide you through the process, wh...","{'index': 1, 'first_name': 'Liam', 'last_name'...","After the waiting period, if the customer does..."


### Evaluate RAG on generated answers

In [5]:
# Judge prompt that compares the generated answer with the original answer.
# evaluate_rag fills in {answer_orig}, {question} and {answer_llm} via str.format;
# the doubled braces around the JSON example come out as single literal braces.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

# Judge prompt that rates the generated answer against the question alone
# (placeholders: {question}, {answer_llm}); doubled braces are literal braces
# after str.format.
# NOTE(review): not used by any of the visible cells.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [6]:
from openai import OpenAI
import keys_secret

# Rebinds `openai_client` with the same arguments as the earlier cell that
# defines llm_aswer, so this cell can also run on its own.
openai_client = OpenAI(api_key=keys_secret.openai_api_key)


def llm_eval(prompt):
    """Send `prompt` as a single user message to 'gpt-3.5-turbo' and return the reply text.

    Returns the `content` of the first choice of the chat completion.
    """
    conversation = [{"role": "user", "content": prompt}]
    completion = openai_client.chat.completions.create(
        messages=conversation,
        model='gpt-3.5-turbo',
    )
    reply = completion.choices[0].message
    return reply.content

In [7]:
import json

# Manual progress bar for the evaluation pass; evaluate_rag advances it with pbar1.update(1).
pbar1 = tqdm(total=len(ground_truth_df)) 

def _parse_evaluation(result_raw):
    """Extract (relevance, explanation) from the judge's raw reply without raising on bad JSON."""
    text = result_raw or ""
    # Keep only the outermost {...} span so that code fences or extra text
    # around the JSON object do not break parsing.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return "PARSE_ERROR", result_raw
    if not isinstance(parsed, dict):
        return "PARSE_ERROR", result_raw
    return parsed.get('Relevance', "PARSE_ERROR"), parsed.get('Explanation')


def evaluate_rag(ground_truth):
    """Ask the judge model to rate one row's LLM answer against the original answer.

    Fills prompt1_template with the row's 'ground_truth_answer',
    'generated_question' and 'answer_llm', sends it through llm_eval and parses
    the JSON reply. Advances the module-level progress bar `pbar1` by one step.

    A reply that cannot be parsed as a JSON object, or that lacks the
    'Relevance' key, no longer aborts the whole run: the row is labelled
    "PARSE_ERROR" instead (with the raw reply kept as the explanation when the
    JSON itself was unreadable), so such rows stay visible in value_counts().

    Args:
        ground_truth: One row of the ground-truth frame (as passed by
            DataFrame.apply with axis=1).

    Returns:
        pd.Series indexed by 'answer_llm_eval' and 'answer_llm_eval_explanation'.
    """
    prompt = prompt1_template.format(
        answer_orig=ground_truth['ground_truth_answer'],
        question=ground_truth['generated_question'],
        answer_llm=ground_truth['answer_llm'],
    )

    result_raw = llm_eval(prompt)
    relevance, explanation = _parse_evaluation(result_raw)

    pbar1.update(1)
    return pd.Series(
        [relevance, explanation],
        index=['answer_llm_eval', 'answer_llm_eval_explanation'],
    )

# evaluate_rag returns a two-element Series per row, so apply() yields a
# two-column frame that is assigned to both new columns at once.
ground_truth_df[['answer_llm_eval', 'answer_llm_eval_explanation']] = ground_truth_df.apply(evaluate_rag, axis=1)
# Close the bar after the run; the close used to be commented out and placed
# before the apply, so the bar was never closed.
pbar1.close()
ground_truth_df


  0%|                                                                            | 0/100 [00:00<?, ?it/s][A
  1%|▋                                                                   | 1/100 [00:01<02:39,  1.61s/it][A
  2%|█▎                                                                  | 2/100 [00:02<01:45,  1.08s/it][A
  3%|██                                                                  | 3/100 [00:03<01:29,  1.08it/s][A
  4%|██▋                                                                 | 4/100 [00:04<01:30,  1.06it/s][A
  5%|███▍                                                                | 5/100 [00:05<01:48,  1.15s/it][A
  6%|████                                                                | 6/100 [00:06<01:31,  1.02it/s][A
  7%|████▊                                                               | 7/100 [00:06<01:22,  1.13it/s][A
  8%|█████▍                                                              | 8/100 [00:07<01:23,  1.11it/s][A
  9%|██████       

Unnamed: 0,ground_truth_faq_id,generated_question,ground_truth_question,ground_truth_answer,ground_truth_courier,answer_llm,answer_llm_eval,answer_llm_eval_explanation
48,9,Are there any exceptions to the notice period ...,What is the notice period for terminating my e...,Your notice period is specified in your employ...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","As per your employment contract, the notice pe...",RELEVANT,The generated answer accurately reflects the i...
1601,506,How can I contact support if I have issues dur...,What if my phone battery dies mid-delivery?,Charge your phone as quickly as possible. If t...,"{'index': 1, 'first_name': 'Liam', 'last_name'...","If you have issues during a delivery, you shou...",NON_RELEVANT,The generated answer is not relevant to the or...
1520,459,What should I do if a customer claims their or...,How do I handle a customer who didn't receive ...,Advise the customer to contact iDelivery's cus...,"{'index': 0, 'first_name': 'Emma', 'last_name'...",If a customer claims their order hasn't arrive...,RELEVANT,The generated answer closely mirrors the origi...
1201,325,What are my rights regarding declining an order?,Can I refuse an order if the weather is bad?,"Yes, you have the right to decline an order if...","{'index': 1, 'first_name': 'Liam', 'last_name'...","As a freelancer with iDelivery, you have the r...",PARTLY_RELEVANT,The generated answer provides additional infor...
309,61,Do part-time and full-time employees have the ...,What are my rights as a part-time employee?,Part-time employees have the same rights as fu...,"{'index': 1, 'first_name': 'Liam', 'last_name'...","Yes, part-time employees have the same rights ...",RELEVANT,The generated answer directly addresses the qu...
...,...,...,...,...,...,...,...,...
111,22,What types of invoices will iDelivery provide?,What is the process for submitting invoices to...,iDelivery will typically generate a weekly or ...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","As a courier with an employment contract, iDel...",RELEVANT,The generated answer accurately reflects the c...
1004,253,Can I manage my time effectively while working...,Can I use a second app at the same time?,"Yes, you can be active on multiple platforms. ...","{'index': 2, 'first_name': 'Olivia', 'last_nam...","Yes, you can be active on multiple platforms w...",RELEVANT,The generated answer closely paraphrases the o...
904,222,Is a driver's license necessary for my positio...,Do I need a special license to be a courier?,You need a valid driver's license for your veh...,"{'index': 0, 'first_name': 'Emma', 'last_name'...",As an employee courier using a bike for delive...,PARTLY_RELEVANT,The generated answer correctly mentions that a...
1399,396,What action is required after the waiting peri...,A customer is not home. What happens to the food?,"The app will guide you through the process, wh...","{'index': 1, 'first_name': 'Liam', 'last_name'...","After the waiting period, if the customer does...",PARTLY_RELEVANT,The generated answer partially relates to the ...


In [8]:
# Number of rows per relevance label assigned by the judge.
ground_truth_df['answer_llm_eval'].value_counts()

answer_llm_eval
RELEVANT           53
PARTLY_RELEVANT    39
NON_RELEVANT        8
Name: count, dtype: int64