In [1]:
import pandas as pd

ground_truth_df = pd.read_json('evaluation_ground_truth.json')
ground_truth_df

Unnamed: 0,ground_truth_faq_id,generated_question,ground_truth_question,ground_truth_answer,ground_truth_courier
0,0,What type of employment contract do I have wit...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 6, 'first_name': 'Jackson', 'last_na..."
1,0,Am I classified as an employee or a freelancer?,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 6, 'first_name': 'Jackson', 'last_na..."
2,0,Can you explain the benefits included in my em...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 6, 'first_name': 'Jackson', 'last_na..."
3,0,What are the differences between an employee a...,What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 6, 'first_name': 'Jackson', 'last_na..."
4,0,"As a courier in Germany, what support can I ex...",What is my contract type as an employee?,"As an employee, you will have a part-time or f...","{'index': 6, 'first_name': 'Jackson', 'last_na..."
...,...,...,...,...,...
900,654,Am I allowed to decline deliveries to public b...,Can I refuse to deliver to a public building?,"No, you must deliver to the specified address....","{'index': 1, 'first_name': 'Liam', 'last_name'..."
901,654,What is my age as listed in my courier profile?,Can I refuse to deliver to a public building?,"No, you must deliver to the specified address....","{'index': 1, 'first_name': 'Liam', 'last_name'..."
902,654,What type of contract do I have with iDelivery?,Can I refuse to deliver to a public building?,"No, you must deliver to the specified address....","{'index': 1, 'first_name': 'Liam', 'last_name'..."
903,654,Which type of vehicle do I use for deliveries?,Can I refuse to deliver to a public building?,"No, you must deliver to the specified address....","{'index': 1, 'first_name': 'Liam', 'last_name'..."


In [2]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


def vector_search(question, country, score_threshold, limit):
    # print('vector_search is called on question: '+question)
    
    return qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle 
        ),
        query_filter=models.Filter( 
            must=[
                models.FieldCondition(
                    key="country",
                    match=models.MatchAny(any=[country, "all"] )
                )
            ]
        ),
        score_threshold = score_threshold,
        limit=limit,
        with_payload=True
    )


In [3]:
from qdrant_client import QdrantClient, models

qd_client = QdrantClient("http://localhost:6333")
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"
collection_name = "courier-faq"


In [4]:
from tqdm.auto import tqdm

def evaluate(ground_truth_df, search_function):
    relevance_total = []

    for ground_truth in tqdm(ground_truth_df.to_dict('records')):
        doc_id = ground_truth['ground_truth_faq_id']

        generated_question = ground_truth['generated_question']
        country = ground_truth['ground_truth_courier']['country']
        
        result_list = search_function(generated_question, country)
        relevance = [result.id == doc_id for result in result_list.points]
        relevance_total.append(relevance)

        # print("ground_truth_faq_id: ",doc_id)
        # print("generated_question: ",generated_question)
        # print("ground_truth_question: ",ground_truth['ground_truth_question'])
        # print("country: ",country)
        # print("result_list.points: ",result_list.points)
        # print("relevance: ",relevance)
        # print()
        
        # break

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [6]:
evaluate(ground_truth_df, lambda question, country: vector_search(question, country, 0.8, 5))

  0%|          | 0/905 [00:00<?, ?it/s]

{'hit_rate': 0.8430939226519337, 'mrr': 0.7137937384898716}

In [7]:
import numpy as np

score = []
sampled_ground_truth_df = ground_truth_df.head(1000)

for score_threshold in np.arange(0.4, 1.0, 0.1):
    for limit in np.arange(5, 10, 1):
        print("Evaluating cofigs: score_threshold="+str(score_threshold)+", limit="+str(limit))
        result = evaluate(sampled_ground_truth_df, lambda question, country: vector_search(question, country, score_threshold, limit))
        print("Result: ", result)
        score.append({
            "hit_rate": result['hit_rate'],
            "mrr": result['mrr'],
            "score_threshold": score_threshold,
            "limit": limit,
        })

result = sorted(score, key=lambda item: item['mrr'], reverse=True)
print("Best params: ")
result


Evaluating cofigs: score_threshold=0.4, limit=5


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8983425414364641, 'mrr': 0.7463535911602219}
Evaluating cofigs: score_threshold=0.4, limit=6


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9082872928176795, 'mrr': 0.7480110497237579}
Evaluating cofigs: score_threshold=0.4, limit=7


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9138121546961326, 'mrr': 0.748800315706394}
Evaluating cofigs: score_threshold=0.4, limit=8


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9215469613259668, 'mrr': 0.7497671665351233}
Evaluating cofigs: score_threshold=0.4, limit=9


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9292817679558011, 'mrr': 0.7506265894939936}
Evaluating cofigs: score_threshold=0.5, limit=5


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8983425414364641, 'mrr': 0.7463535911602219}
Evaluating cofigs: score_threshold=0.5, limit=6


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9082872928176795, 'mrr': 0.7480110497237579}
Evaluating cofigs: score_threshold=0.5, limit=7


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9138121546961326, 'mrr': 0.748800315706394}
Evaluating cofigs: score_threshold=0.5, limit=8


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9215469613259668, 'mrr': 0.7497671665351233}
Evaluating cofigs: score_threshold=0.5, limit=9


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9292817679558011, 'mrr': 0.7506265894939936}
Evaluating cofigs: score_threshold=0.6, limit=5


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8983425414364641, 'mrr': 0.7463535911602219}
Evaluating cofigs: score_threshold=0.6, limit=6


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9082872928176795, 'mrr': 0.7480110497237579}
Evaluating cofigs: score_threshold=0.6, limit=7


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9138121546961326, 'mrr': 0.748800315706394}
Evaluating cofigs: score_threshold=0.6, limit=8


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9215469613259668, 'mrr': 0.7497671665351233}
Evaluating cofigs: score_threshold=0.6, limit=9


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9292817679558011, 'mrr': 0.7506265894939936}
Evaluating cofigs: score_threshold=0.7, limit=5


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8983425414364641, 'mrr': 0.7463535911602219}
Evaluating cofigs: score_threshold=0.7, limit=6


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9082872928176795, 'mrr': 0.7480110497237579}
Evaluating cofigs: score_threshold=0.7, limit=7


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9138121546961326, 'mrr': 0.748800315706394}
Evaluating cofigs: score_threshold=0.7, limit=8


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9215469613259668, 'mrr': 0.7497671665351233}
Evaluating cofigs: score_threshold=0.7, limit=9


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.9292817679558011, 'mrr': 0.7506265894939936}
Evaluating cofigs: score_threshold=0.7999999999999999, limit=5


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8430939226519337, 'mrr': 0.7137937384898716}
Evaluating cofigs: score_threshold=0.7999999999999999, limit=6


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8497237569060774, 'mrr': 0.7148987108655622}
Evaluating cofigs: score_threshold=0.7999999999999999, limit=7


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8530386740331491, 'mrr': 0.7153722704551438}
Evaluating cofigs: score_threshold=0.7999999999999999, limit=8


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8585635359116022, 'mrr': 0.7160628781899504}
Evaluating cofigs: score_threshold=0.7999999999999999, limit=9


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.8629834254143647, 'mrr': 0.7165539770235906}
Evaluating cofigs: score_threshold=0.8999999999999999, limit=5


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.1734806629834254, 'mrr': 0.16335174953959486}
Evaluating cofigs: score_threshold=0.8999999999999999, limit=6


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.1734806629834254, 'mrr': 0.16335174953959486}
Evaluating cofigs: score_threshold=0.8999999999999999, limit=7


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.1734806629834254, 'mrr': 0.16335174953959486}
Evaluating cofigs: score_threshold=0.8999999999999999, limit=8


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.1734806629834254, 'mrr': 0.16335174953959486}
Evaluating cofigs: score_threshold=0.8999999999999999, limit=9


  0%|          | 0/905 [00:00<?, ?it/s]

Result:  {'hit_rate': 0.1734806629834254, 'mrr': 0.16335174953959486}
Best params: 


[{'hit_rate': 0.9292817679558011,
  'mrr': 0.7506265894939936,
  'score_threshold': np.float64(0.4),
  'limit': np.int64(9)},
 {'hit_rate': 0.9292817679558011,
  'mrr': 0.7506265894939936,
  'score_threshold': np.float64(0.5),
  'limit': np.int64(9)},
 {'hit_rate': 0.9292817679558011,
  'mrr': 0.7506265894939936,
  'score_threshold': np.float64(0.6),
  'limit': np.int64(9)},
 {'hit_rate': 0.9292817679558011,
  'mrr': 0.7506265894939936,
  'score_threshold': np.float64(0.7),
  'limit': np.int64(9)},
 {'hit_rate': 0.9215469613259668,
  'mrr': 0.7497671665351233,
  'score_threshold': np.float64(0.4),
  'limit': np.int64(8)},
 {'hit_rate': 0.9215469613259668,
  'mrr': 0.7497671665351233,
  'score_threshold': np.float64(0.5),
  'limit': np.int64(8)},
 {'hit_rate': 0.9215469613259668,
  'mrr': 0.7497671665351233,
  'score_threshold': np.float64(0.6),
  'limit': np.int64(8)},
 {'hit_rate': 0.9215469613259668,
  'mrr': 0.7497671665351233,
  'score_threshold': np.float64(0.7),
  'limit': np.int