In [12]:
import pandas as pd
import numpy as np
import argparse

"""
Usage:

$ python3 evaluate.py --qrel_path=touche-task2-51-100-relevance.qrels --run_path=run.txt --depth=5
>         Tag  nDCG@5
> 0  ChatNoir   0.422
"""


def load_qrels(path):
    """Read a relevance-judgment (qrels) file into a DataFrame.

    The file is parsed as headerless, single-space separated text whose first
    four columns are named ``Topic``, ``Q0``, ``ID`` and ``Score``.

    Args:
        path: Location of the qrels file (anything ``pd.read_csv`` accepts).

    Returns:
        DataFrame with columns ``Topic``, ``ID`` and ``Score``; the ``Q0``
        column is discarded and every ``Score`` equal to -2 is mapped to 0.
    """
    column_names = {0: "Topic", 1: "Q0", 2: "ID", 3: "Score"}

    judgments = pd.read_csv(path, header=None, sep=" ")
    judgments = judgments.rename(columns=column_names)
    judgments = judgments.drop(columns="Q0")

    # -2 is presumably a special judgment label (not a graded relevance level);
    # it is treated as "not relevant" -- TODO confirm against the qrels spec.
    judgments["Score"] = judgments["Score"].replace({-2: 0})
    return judgments


# Positional column index -> name for a headerless run file.
_RUN_COLUMNS = {0: "Topic", 1: "Q0", 2: "ID", 3: "Rank", 4: "Score", 5: "Tag"}


def _read_run_file(path, depth, sep):
    """Parse a run file using ``sep`` and keep the ``depth`` best-ranked rows per run and topic."""
    df = pd.read_csv(path, header=None, sep=sep).rename(_RUN_COLUMNS, axis=1)
    # A single file may contain several runs (distinguished by Tag). Truncating
    # per topic only would split the `depth` rows between the runs, so the cut
    # is made per (Topic, Tag) whenever a Tag column is available.
    group_keys = ["Topic", "Tag"] if "Tag" in df.columns else ["Topic"]
    return (
        df
        .sort_values(["Topic", "Rank"], ascending=[False, True])
        .groupby(group_keys)
        .head(depth)
        .drop(["Q0", "Score"], axis=1)
    )


def load_runs(path, depth):
    """Read a run file and keep the top ``depth`` ranked documents per topic and run.

    The file is first parsed as single-space separated; if that attempt fails
    (for instance because the expected columns are missing after parsing), it
    is parsed again as tab separated.

    Args:
        path: Location of the headerless run file with the columns
            Topic, Q0, ID, Rank, Score, Tag (in that order).
        depth: Number of best-ranked rows to keep for each topic of each run.

    Returns:
        DataFrame with columns ``Topic``, ``ID``, ``Rank`` and ``Tag``, sorted
        by ``Topic`` descending and ``Rank`` ascending.

    Raises:
        Exception: whatever the tab-separated attempt raises when both
            attempts fail.
    """
    try:
        return _read_run_file(path, depth, sep=" ")
    except Exception:
        # Deliberate best-effort fallback. Catching Exception instead of using a
        # bare `except:` lets KeyboardInterrupt / SystemExit propagate.
        return _read_run_file(path, depth, sep="\t")


def calculate_scores(runs, qrels, depth):
    """Compute nDCG@``depth`` for every (Tag, Topic) pair of the runs.

    Steps:
      1. Attach the qrels ``Score`` to each run row (left join on Topic, ID).
      2. Discard rows without a judgment. Note that this happens *before* the
         DCG is computed, so discount positions are assigned among the judged
         documents only (in ``Rank`` order), not by the original rank value.
      3. DCG: per (Topic, Tag), over the first ``depth`` remaining rows.
      4. IDCG: per Topic, over the ``depth`` highest qrels scores.
      5. nDCG = DCG / IDCG (a zero IDCG is not special-cased).

    Args:
        runs: DataFrame with at least ``Topic``, ``ID``, ``Rank`` and ``Tag``.
        qrels: DataFrame with ``Topic``, ``ID`` and ``Score``.
        depth: Cut-off k used for both DCG and IDCG.

    Returns:
        DataFrame with columns ``Tag``, ``Topic`` and ``nDCG@<depth>``, one row
        per distinct combination, with a fresh integer index.
    """

    def discounted_gain(frame, cutoff):
        # Gain at 0-based position i is discounted by log2(i + 2).
        gains = frame.Score.astype(int).head(cutoff).tolist()
        return sum([gain / np.log2(2 + position) for position, gain in enumerate(gains)])

    def rescale(value, lower, upper):
        # Min-max scaling of `value` into the [lower, upper] range.
        return np.float64(value - lower) / np.float64(upper - lower)

    def row_ndcg(row):
        return rescale(row['DCG'], 0, row['IDCG'])

    # Steps 1 + 2: judged run rows only.
    judged = runs.merge(qrels, on=["Topic", "ID"], how="left")
    judged = judged[judged.Score.notna()]

    # Step 3: DCG of each run on each topic.
    run_dcg = (
        judged
        .sort_values('Rank', ascending=True)
        .groupby(['Topic', 'Tag'])
        .apply(discounted_gain, depth)
        .reset_index()
        .rename({0: 'DCG'}, axis=1)
    )
    judged = judged.merge(run_dcg, on=['Topic', 'Tag'], how='left')

    # Step 4: ideal DCG of each topic, taken from the best-scored judgments.
    ideal_dcg = (
        qrels
        .sort_values('Score', ascending=False)
        .groupby('Topic')
        .apply(discounted_gain, depth)
        .reset_index()
        .rename({0: 'IDCG'}, axis=1)
    )
    judged = judged.merge(ideal_dcg, on=['Topic'], how='left')

    # Step 5: normalise row by row, then collapse to one row per (Tag, Topic).
    judged['NDCG'] = judged.apply(row_ndcg, axis=1)
    metric_name = "nDCG@" + str(depth)
    per_topic = judged[["Tag", "Topic", "NDCG"]].drop_duplicates()
    return per_topic.rename(columns={"NDCG": metric_name}).reset_index(drop=True)


def calculate_mean(scores, depth):
    """Average the per-topic nDCG values of each run.

    Args:
        scores: DataFrame with the columns ``Tag``, ``Topic`` and
            ``nDCG@<depth>``.
        depth: Cut-off k; only used to build the metric column name.

    Returns:
        DataFrame with one row per ``Tag`` holding the mean of the remaining
        columns, sorted by ``nDCG@<depth>`` in descending order.

    Raises:
        KeyError: if ``scores`` has no ``Topic`` or no ``nDCG@<depth>`` column.
    """
    metric = "nDCG@" + str(depth)
    return (
        scores
        # Topic is an identifier, not a measurement: remove it *before*
        # aggregating so it is never averaged and non-numeric topic ids
        # cannot make mean() fail.
        .drop("Topic", axis=1)
        .groupby("Tag")
        .mean()
        .reset_index()
        .sort_values(metric, ascending=False)
    )


def main(qrel_path, run_path, depth):
    """Evaluate a run file against a qrels file and print the mean nDCG per tag.

    Args:
        qrel_path: Path of the relevance-judgment file.
        run_path: Path of the run file to evaluate.
        depth: Cut-off k for nDCG@k.
    """
    relevance = load_qrels(qrel_path)
    ranking = load_runs(run_path, depth)
    summary = calculate_mean(calculate_scores(ranking, relevance, depth), depth)
    # Printed values are rounded to three decimals.
    print(summary.round(3))



In [25]:
# Load the relevance judgments and the run to inspect; the run is cut to its
# top 5 rows per topic by load_runs.
# NOTE(review): both paths are absolute and machine-specific -- consider a
# configurable data directory so the notebook runs elsewhere.
qrels = load_qrels('/notebook/Touche22/touche2021_val/touche-task2-51-100-relevance.qrels')
runs = load_runs('/notebook/Touche22/touche2021_val/outs/colbert_finetune.qrels', 5)

In [26]:
qrels

Unnamed: 0,Topic,ID,Score
0,54,clueweb12-0205wb-64-11095,0
1,54,clueweb12-0501wb-64-06459,1
2,54,clueweb12-0207wb-30-15337,0
3,54,clueweb12-0906wb-38-27123,0
4,54,clueweb12-0907wb-55-11510,0
...,...,...,...
2071,86,clueweb12-0008wb-85-29076,0
2072,86,clueweb12-1008wb-62-10779,0
2073,86,clueweb12-0202wb-34-19787,0
2074,86,clueweb12-0915wb-71-22856,0


In [27]:
runs

Unnamed: 0,Topic,ID,Rank,Tag
49000,100,clueweb12-1509wb-05-28610,1,colbert
49001,100,clueweb12-0808wb-28-06040,2,colbert
49002,100,clueweb12-1805wb-63-23098,3,colbert
49003,100,clueweb12-0601wb-98-29823,4,colbert
49004,100,clueweb12-0004wb-73-23813,5,colbert
...,...,...,...,...
0,51,clueweb12-0300wb-75-11201,1,colbert
1,51,clueweb12-0705wb-83-11603,2,colbert
2,51,clueweb12-1307wb-90-23633,3,colbert
3,51,clueweb12-0906wb-63-25161,4,colbert


In [28]:
# Attach the qrels Score to every run row; the left join keeps run rows that
# have no matching (Topic, ID) judgment, leaving their Score as NaN.
scores = runs.merge(
        qrels,
        on=["Topic", "ID"],
        how="left"
    )

In [29]:
scores

Unnamed: 0,Topic,ID,Rank,Tag,Score
0,100,clueweb12-1509wb-05-28610,1,colbert,1.0
1,100,clueweb12-0808wb-28-06040,2,colbert,
2,100,clueweb12-1805wb-63-23098,3,colbert,
3,100,clueweb12-0601wb-98-29823,4,colbert,
4,100,clueweb12-0004wb-73-23813,5,colbert,1.0
...,...,...,...,...,...
245,51,clueweb12-0300wb-75-11201,1,colbert,
246,51,clueweb12-0705wb-83-11603,2,colbert,
247,51,clueweb12-1307wb-90-23633,3,colbert,
248,51,clueweb12-0906wb-63-25161,4,colbert,


In [30]:
# Keep only the run rows that received a relevance judgment (non-NaN Score).
# This rebinds `scores`, so the unfiltered merge result is no longer available.
scores = scores[~scores.Score.isna()]

In [31]:
scores

Unnamed: 0,Topic,ID,Rank,Tag,Score
0,100,clueweb12-1509wb-05-28610,1,colbert,1.0
4,100,clueweb12-0004wb-73-23813,5,colbert,1.0
13,98,clueweb12-1806wb-69-18337,4,colbert,0.0
20,96,clueweb12-0109wb-16-14460,1,colbert,0.0
21,96,clueweb12-0208wb-37-04521,2,colbert,0.0
...,...,...,...,...,...
232,54,clueweb12-1700wb-33-14262,3,colbert,1.0
235,53,clueweb12-0006wb-79-22015,1,colbert,1.0
236,53,clueweb12-0008wb-31-12426,2,colbert,2.0
242,52,clueweb12-1202wb-28-18697,3,colbert,0.0


In [19]:
# Same NaN filter as applied earlier in this notebook; on an already filtered
# frame it changes nothing. NOTE(review): the execution count of this cell is
# lower than that of the cells above it, i.e. it was run out of order.
scores = scores[~scores.Score.isna()]