### Load ground truth data

In [29]:
import pandas as pd

# Load the evaluation ground-truth set and keep only its first 10 rows;
# the cells below run retrieval and LLM calls once per row of this frame.
ground_truth_df = pd.read_json('evaluation_ground_truth.json').head(10)
ground_truth_df

Unnamed: 0,ground_truth_faq_id,generated_question,ground_truth_question,ground_truth_answer,ground_truth_courier
0,0,What type of employment contract do I have wit...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam..."
1,0,What benefits do I receive as an employee?,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam..."
2,0,Am I eligible for paid vacation as an employee?,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam..."
3,0,What distinguishes my contract from that of a ...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam..."
4,0,Does being an employee affect my salary struct...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam..."
5,1,Can you explain how my hourly pay is determine...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam..."
6,1,What kind of payment model do I have being an ...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam..."
7,1,Is my income predictable as an employee at iDe...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam..."
8,1,Will my hourly wage change based on delivery p...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam..."
9,1,How does my employment contract affect my pay ...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam..."


### Get LLM answers on generated questions

In [30]:
from qdrant_client import QdrantClient, models

# Client for the Qdrant instance listening on localhost:6333.
qd_client = QdrantClient("http://localhost:6333")
# NOTE(review): not referenced in the cells visible here; presumably the vector
# size of the collection for the model below — confirm.
EMBEDDING_DIMENSIONALITY = 512
# Model name attached to every query document sent by vector_search.
model_handle = "jinaai/jina-embeddings-v2-small-en"
# Collection that vector_search queries.
collection_name = "courier-faq"

def vector_search(question, country, limit=5):
    """Return the payloads of the FAQ points that best match ``question``.

    The query is sent to Qdrant as a ``models.Document`` carrying the question
    text and ``model_handle``. Results are restricted to points whose
    ``country`` payload field equals ``country`` or ``"all"``.

    Args:
        question: Free-text question to search for.
        country: Value to match in the ``country`` payload field.
        limit: Maximum number of points to request. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        list: The ``payload`` of every returned point, in the order the client
        returned them.
    """
    country_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="country",
                # "all" is accepted as well — presumably entries that are
                # valid for every country; confirm against the indexed data.
                match=models.MatchAny(any=[country, "all"]),
            )
        ]
    )

    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=country_filter,
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in query_points.points]

# Smoke test: run one search with country 'DE' and display the returned payloads.
vector_search("Can I reject orders?", 'DE')


[{'country': 'all',
  'question': 'Can I refuse an order?',
  'answer': 'Yes, you can decline an order. However, having a high acceptance rate can lead to more opportunities, and a very low rate may be reviewed by the company.'},
 {'country': 'all',
  'question': 'Can I refuse an order if the weather is bad?',
  'answer': 'Yes, you have the right to decline an order if you feel unsafe due to weather conditions. Your safety is a priority.'},
 {'country': 'all',
  'question': 'Can I refuse an order if the weather is bad?',
  'answer': 'Yes, you have the right to decline an order if you feel unsafe due to weather conditions. Your safety is a priority.'},
 {'country': 'all',
  'question': 'Can I refuse an order if the weather is bad?',
  'answer': 'Yes, you have the right to decline an order if you feel unsafe due to weather conditions. Your safety is a priority.'},
 {'country': 'all',
  'question': 'Can I refuse an order if the weather is bad?',
  'answer': 'Yes, you have the right to dec

In [31]:
from openai import OpenAI
import keys_secret

# OpenAI client; the API key is taken from the imported keys_secret module.
openai_client = OpenAI(api_key=keys_secret.openai_api_key)

def llm_aswer(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only message with role "user".
        model: Chat model name to request. Defaults to 'gpt-4o-mini', the
            value that used to be hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

def build_prompt(question, search_results, courier):
    """Render the support-agent prompt for one courier question.

    Args:
        question: The courier's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``country``, ``question`` and
            ``answer`` keys; each becomes one paragraph of the CONTEXT section.
        courier: Dict providing ``first_name``, ``age``, ``contract_type`` and
            ``vehicle_type`` for the courier description line.

    Returns:
        str: The filled-in prompt with surrounding whitespace stripped.
    """
    template = """
You are the courier suport agent of a iDelivery company that handles food delivery in Germany, Netherlands and UK. 
The couriers working for this company are employees and freelancers. 

Courier {courier_first_name} is {courier_age} years old, has a {courier_contract_type} working contract and uses a {courier_vehicle_type} for delivery.
    
Answer the courier's QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}

""".strip()

    # One "country / question / answer" paragraph per retrieved FAQ entry.
    context = "".join(
        f"country: {entry['country']}\nquestion: {entry['question']}\nanswer: {entry['answer']}\n\n"
        for entry in search_results
    )

    fields = {
        "question": question,
        "context": context,
        "courier_first_name": courier['first_name'],
        "courier_age": courier['age'],
        "courier_contract_type": courier['contract_type'],
        "courier_vehicle_type": courier['vehicle_type'],
    }

    # The final strip() removes the blank lines left after the last entry.
    return template.format(**fields).strip()


In [32]:
from tqdm import tqdm

# Manual progress bar sized to the number of ground-truth rows; get_llm_answer advances it once per row.
pbar = tqdm(total=len(ground_truth_df)) 

def get_llm_answer(ground_truth):
    """Produce the RAG answer for one ground-truth row.

    Retrieves FAQ entries for the row's generated question (filtered by the
    courier's country), builds the prompt from them and asks the LLM.

    Args:
        ground_truth: Row-like mapping with a ``generated_question`` entry and
            a ``ground_truth_courier`` mapping that includes ``country``.

    Returns:
        The text returned by ``llm_aswer`` for the built prompt.
    """
    question = ground_truth['generated_question']
    courier = ground_truth['ground_truth_courier']

    retrieved = vector_search(question, courier['country'])
    answer_llm = llm_aswer(build_prompt(question, retrieved, courier))

    # Advance the module-level progress bar by one processed row.
    pbar.update(1)
    return answer_llm
    

# Generate an answer for every row (one retrieval plus one LLM call each) and store it in a new column.
ground_truth_df['answer_llm'] = ground_truth_df.apply(get_llm_answer, axis=1)
# Close the bar only after apply() has finished updating it.
pbar.close()
ground_truth_df

100%|████████████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.66s/it]


Unnamed: 0,ground_truth_faq_id,generated_question,ground_truth_question,ground_truth_answer,ground_truth_courier,answer_llm
0,0,What type of employment contract do I have wit...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam...","As an employee with iDelivery, you have a part..."
1,0,What benefits do I receive as an employee?,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam...","As an employee at iDelivery, you receive sever..."
2,0,Am I eligible for paid vacation as an employee?,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam...","Yes, as an employee, you are eligible for paid..."
3,0,What distinguishes my contract from that of a ...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam...","As an employee, your contract differs from tha..."
4,0,Does being an employee affect my salary struct...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 2, 'first_name': 'Olivia', 'last_nam...","Yes, being an employee does affect your salary..."
5,1,Can you explain how my hourly pay is determine...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","As an employee, your hourly pay is determined ..."
6,1,What kind of payment model do I have being an ...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","As an employee, you have a part-time or full-t..."
7,1,Is my income predictable as an employee at iDe...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","Yes, your income as an employee at iDelivery i..."
8,1,Will my hourly wage change based on delivery p...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","No, your hourly wage will not change based on ..."
9,1,How does my employment contract affect my pay ...,How is my hourly pay calculated as an employee?,Your pay is based on an agreed-upon hourly wag...,"{'index': 2, 'first_name': 'Olivia', 'last_nam...","As an employee, your pay structure is influenc..."


### Evaluate RAG on generated answers

In [34]:
# Judge prompt that compares the generated answer with the original answer.
# Placeholders: answer_orig, question, answer_llm. The doubled braces keep the
# JSON skeleton literal when the template is filled with str.format.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

# Judge prompt that rates the generated answer against the question only.
# Placeholders: question, answer_llm.
# NOTE(review): not used by any cell visible here — confirm whether a later cell needs it.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [35]:
from openai import OpenAI
import keys_secret

# Rebinds openai_client with the same construction used in the answer-generation cell,
# so this cell can run without that one having been executed.
openai_client = OpenAI(api_key=keys_secret.openai_api_key)


def llm_eval(prompt, model='gpt-4o-mini'):
    """Send an evaluation ``prompt`` to the chat model and return the raw reply.

    Args:
        prompt: Full judge prompt, sent as the only message with role "user".
        model: Chat model name to use as the judge. Defaults to 'gpt-4o-mini',
            the value that used to be hard-coded.

    Returns:
        The ``content`` of the first choice's message, unparsed.
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [None]:
import json

# Fresh manual progress bar for the evaluation pass; evaluate_rag advances it once per row.
pbar = tqdm(total=len(ground_truth_df)) 

def _parse_eval_json(result_raw):
    """Parse the judge reply as JSON, tolerating text around the JSON object."""
    try:
        return json.loads(result_raw)
    except json.JSONDecodeError:
        # The model may wrap the object in a code fence or add prose despite
        # the prompt; retry on the outermost {...} span only.
        start = result_raw.find('{')
        end = result_raw.rfind('}')
        if start == -1 or end < start:
            raise
        return json.loads(result_raw[start:end + 1])


def evaluate_rag(ground_truth):
    """Ask the LLM judge to rate one generated answer.

    Fills ``prompt1_template`` from the row, sends it through ``llm_eval`` and
    parses the reply as JSON. A reply that is not pure JSON is retried on its
    outermost ``{...}`` span, so one fenced reply does not abort the whole run.

    Args:
        ground_truth: Row-like mapping with ``ground_truth_answer``,
            ``generated_question`` and ``answer_llm`` entries.

    Returns:
        pandas.Series: ``answer_llm_eval`` (the reply's "Relevance" value) and
        ``answer_llm_eval_explanation`` (its "Explanation" value).

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
        KeyError: If the parsed object lacks "Relevance" or "Explanation".
    """
    prompt = prompt1_template.format(
        answer_orig=ground_truth['ground_truth_answer'],
        question=ground_truth['generated_question'],
        answer_llm=ground_truth['answer_llm'],
    )

    result = _parse_eval_json(llm_eval(prompt))
    print(result)
    # Advance the module-level progress bar by one evaluated row.
    pbar.update(1)
    return pd.Series(
        [result['Relevance'], result['Explanation']],
        index=['answer_llm_eval', 'answer_llm_eval_explanation'],
    )

# Run the judge over every row first; evaluate_rag ticks the progress bar once per row.
ground_truth_df[['answer_llm_eval', 'answer_llm_eval_explanation']] = ground_truth_df.apply(evaluate_rag, axis=1)
# Close the bar only after the apply has finished: closing it beforehand left it stuck at 0/10.
pbar.close()
ground_truth_df

  0%|                                                                             | 0/10 [00:00<?, ?it/s]


{'Relevance': 'RELEVANT', 'Explanation': 'The generated answer directly addresses the question about the type of employment contract with iDelivery and includes the same key details found in the original answer, such as part-time or full-time status, fixed salary, paid vacation, and social security benefits.'}
{'Relevance': 'PARTLY_RELEVANT', 'Explanation': 'The generated answer addresses the benefits of being an employee at iDelivery, which aligns with the inquiry about employee benefits. However, it includes additional information not present in the original answer, such as details about continued salary payment during illness and the provision of payroll statements, which may not be directly relevant to the core question about employee benefits.'}
{'Relevance': 'RELEVANT', 'Explanation': 'The generated answer directly addresses the question about eligibility for paid vacation as an employee and accurately reflects the information provided in the original answer. It confirms that emp