In [21]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [22]:
from vespa.application import Vespa

# Connection settings of the Vespa instance that every query and evaluation
# in this notebook is sent to.
VESPA_URL = "http://localhost"
VESPA_PORT = 8080

app = Vespa(url=VESPA_URL, port=VESPA_PORT)

In [23]:
from vespa.query import Query, RankProfile as Ranking, OR

# Every query model uses the same OR match phase; the models differ only in
# the rank profile applied to the matched documents.
rank_profile_by_model = {
    "or_bm25": "bm25",
    "or_bm25_bert": "bert",
    "or_bm25_bert_index_1": "bert_index_1",
}

query_models = {
    model_name: Query(match_phase=OR(), rank_profile=Ranking(name=profile_name))
    for model_name, profile_name in rank_profile_by_model.items()
}
        

In [24]:
from vespa.evaluation import MatchRatio, Recall, ReciprocalRank, NormalizedDiscountedCumulativeGain

# Metrics computed for each evaluated query; the rank-based ones are
# configured with at=10.
eval_metrics = [
    MatchRatio(),
    Recall(at=10),
    ReciprocalRank(at=10),
    NormalizedDiscountedCumulativeGain(at=10),
]

In [25]:
import json

# Load the labelled queries. Later cells read the "query_id", "query" and
# "relevant_docs" keys of each entry.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving an open handle around for the rest of the session.
with open("cord19/labelled_data.json", "r") as labelled_data_file:
    labelled_data = json.load(labelled_data_file)

In [26]:
from transformers import BertTokenizerFast
# Tokenizer used to turn query text into the token ids that are sent along
# with each query as ranking.features.query(query_token_ids).
BERT_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL_NAME)

In [28]:
from pandas import DataFrame

def _query_token_ids(query, max_length=64):
    """Tokenize ``query`` and return its token ids rendered as a string.

    The notebook-level ``tokenizer`` is called with truncation enabled,
    ``padding="max_length"`` and no special tokens; the resulting
    ``input_ids`` list is converted with ``str`` so it can be passed as the
    ``ranking.features.query(query_token_ids)`` request parameter.

    Args:
        query: Query text (converted with ``str`` before tokenization).
        max_length: ``max_length`` argument forwarded to the tokenizer.

    Returns:
        ``str`` representation of the list of token ids.
    """
    encoded = tokenizer(
        str(query),
        truncation=True,
        padding="max_length",
        max_length=max_length,
        add_special_tokens=False,
    )
    return str(encoded["input_ids"])


# The token ids depend only on the query text, so compute them once per query
# instead of once per (query model, query) pair.
query_token_ids = [
    _query_token_ids(labelled_query["query"]) for labelled_query in labelled_data
]

# Maps each query model name to a DataFrame with one evaluation record per
# labelled query.
evaluations = {}
for query_model, model in query_models.items():
    evaluation = []
    for query_data, token_ids in zip(labelled_data, query_token_ids):
        # Progress indicator: one line per evaluated query.
        print(query_data["query_id"])
        evaluation_query = app.evaluate_query(
            eval_metrics=eval_metrics,
            query_model=model,
            query_id=query_data["query_id"],
            query=query_data["query"],
            id_field="cord_uid",
            relevant_docs=query_data["relevant_docs"],
            hits=10,
            timeout="100s",
            # The parameter name is not a valid Python identifier, hence the
            # dict unpacking.
            **{"ranking.features.query(query_token_ids)": token_ids}
        )
        evaluation.append(evaluation_query)
    evaluations[query_model] = DataFrame.from_records(evaluation)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


## Recall issue

In [29]:
import pandas as pd

# Reshape the per-model evaluation frames into one long table with the
# columns (query_model, metric, value): one row per query, metric and model.
metric_frames = []
for query_model in query_models:
    for metric in eval_metrics:
        values = evaluations[query_model][metric.name + "_value"].to_list()
        long_frame = pd.DataFrame(
            data={"query_model": query_model, "metric": metric.name, "value": values}
        )
        metric_frames.append(long_frame)

metric_values = pd.concat(metric_frames, ignore_index=True)

In [12]:
# Median recall_10 value per query model.
recall_at_10 = metric_values[metric_values["metric"] == "recall_10"]
recall_at_10.groupby(["query_model", "metric"]).median()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
query_model,metric,Unnamed: 2_level_1
or_bm25,recall_10,0.007412
or_bm25_bert,recall_10,0.008076
or_bm25_bert_index_1,recall_10,0.008118


In [14]:
from pandas import merge

# Join the evaluations of the two models on the query id; the "_x" columns
# come from "or_bm25" and the "_y" columns from "or_bm25_bert_index_1".
bm25_evaluation = evaluations["or_bm25"]
bert_evaluation = evaluations["or_bm25_bert_index_1"]
recall_columns = ["query_id", "recall_10_value_x", "recall_10_value_y"]
recall_measures = bm25_evaluation.merge(bert_evaluation, on="query_id")[recall_columns]

# Show only the queries whose recall_10 value differs between the two models.
recall_differs = recall_measures["recall_10_value_x"].ne(
    recall_measures["recall_10_value_y"]
)
recall_measures[recall_differs]

Unnamed: 0,query_id,recall_10_value_x,recall_10_value_y
14,15,0.006726,0.004484
16,17,0.006974,0.008368
20,21,0.006088,0.00761
32,33,0.006515,0.009772
38,39,0.007165,0.008188
40,41,0.014045,0.016854
49,50,0.020134,0.013423


In [13]:
# Run one labelled query (list index 14) against both query models so their
# top 10 hits can be compared.
query_data = labelled_data[14]
query_text = query_data["query"]

# "or_bm25" model: top 10 hits, no extra request parameters.
result_bm25 = app.query(
    query=query_text,
    query_model=query_models["or_bm25"],
    hits=10,
)

# "or_bm25_bert_index_1" model: the query token ids are sent as an extra
# ranking feature.
bert_query_features = {
    "ranking.features.query(query_token_ids)": str(
        tokenizer(
            str(query_text),
            truncation=True,
            padding="max_length",
            max_length=64,
            add_special_tokens=False,
        )["input_ids"]
    )
}
result_bm25_bert = app.query(
    query=query_text,
    query_model=query_models["or_bm25_bert_index_1"],
    hits=10,
    timeout="100s",
    **bert_query_features
)

In [16]:
def _cord_uids(result):
    """Return the ``cord_uid`` field of every hit in ``result``, in hit order."""
    return [hit["fields"]["cord_uid"] for hit in result.hits]


bm25_bert_ids = _cord_uids(result_bm25_bert)
bm25_ids = _cord_uids(result_bm25)

# Ids returned by the BERT model that are absent from the BM25 hits.
id_in_bert_not_in_bm25 = [
    doc_id for doc_id in bm25_bert_ids if doc_id not in bm25_ids
]
id_in_bert_not_in_bm25

['ecu579el']

In [19]:
# Repeat the "or_bm25" query asking for 11 hits instead of 10 and keep only
# the document ids.
bm25_top_11 = app.query(
    query=query_data["query"],
    query_model=query_models["or_bm25"],
    hits=11,
)
result_bm25_11 = [hit["fields"]["cord_uid"] for hit in bm25_top_11.hits]

In [20]:
# Display the document ids of the 11 hits returned by the "or_bm25" model.
result_bm25_11

['zpek8i5e',
 '75u57fw1',
 'up5jpq45',
 'qmrntk43',
 'cxfzs68n',
 'y2nhss9u',
 '94puwlbm',
 'zpmdrh4q',
 'fpexj3s5',
 'axljtddn',
 'ecu579el']

## Positive and null NDCG

In [31]:
# List the distinct metric names present in the long-format metric table.
metric_values["metric"].unique()

array(['match_ratio', 'recall_10', 'reciprocal_rank_10', 'ndcg_10'],
      dtype=object)

In [34]:
from pandas import merge

# Join the evaluations of the two models on the query id; the "_x" column
# comes from "or_bm25" and the "_y" column from "or_bm25_bert_index_1".
bm25_evaluation = evaluations["or_bm25"]
bert_evaluation = evaluations["or_bm25_bert_index_1"]
ndcg_columns = ["query_id", "ndcg_10_value_x", "ndcg_10_value_y"]
ndcg_measures = bm25_evaluation.merge(bert_evaluation, on="query_id")[ndcg_columns]

In [35]:
# Display the per-query ndcg_10 values of the two merged models side by side.
ndcg_measures

Unnamed: 0,query_id,ndcg_10_value_x,ndcg_10_value_y
0,1,0.683159,0.812003
1,2,0.0,0.0
2,3,0.450853,0.619669
3,4,0.0,0.0
4,5,0.397809,0.455605
5,6,0.678762,0.901013
6,7,0.888733,0.6292
7,8,0.527845,0.947807
8,9,0.859413,0.569139
9,10,0.541696,0.88074
