In [None]:
import os
import re
import shlex
from typing import Optional

In [None]:
# Show the sub-directories of the current working directory, alphabetically.
sorted(filter(os.path.isdir, os.listdir()))

In [None]:
# Name of the dataset folder; the run-output and TREC-output folders are derived from it.
dataset_name = "dataset-v2-vanilla-mfq_boost"

In [None]:
# Path to the TREC executable.
trec_executable = "../trec_eval/trec_eval"

# Relevance judgements file; alternative qrels files are kept commented out
# so they can be switched in quickly.
qrels_file_path = "../qrels/qrels.txt"
# qrels_file_path = "qrels/qrels_without_empty.txt"
# qrels_file_path = "qrels/qrels_without_empty_nodes_file.txt"

# Folder that contains the output of the runs.
run_output_folder = "/".join((dataset_name, "output"))
# Folder that will contain the TREC output.
trec_output_folder = "/".join((dataset_name, "trec"))

## TREC
Generate TREC output files

In [None]:
# Run files to evaluate. os.listdir() returns entries in arbitrary order and
# also returns sub-directories (e.g. Jupyter's .ipynb_checkpoints), so keep
# only regular files and sort them for a reproducible processing order.
output_run = sorted(
    entry
    for entry in os.listdir(run_output_folder)
    if os.path.isfile(os.path.join(run_output_folder, entry))
)

In [None]:
# Shared command prefix; the run file path and the output redirection are
# appended per file when the shell commands are built.
# NOTE(review): -q presumably requests per-query rows, -m all_trec the full
# measure set and -c averaging over all qrels queries - confirm against the
# usage text of the installed trec_eval.
trec_cmd = " ".join([trec_executable, "-q", "-m", "all_trec", "-c", qrels_file_path])

In [None]:
# Build one shell command per run file and remember the name of the TREC
# output file each command writes.
shell_commands = []
trec_output_files = []

for file in output_run:
    # The TREC output file reuses the run file name with "output" replaced by "trec".
    trec_output_file = file.replace("output", "trec")

    # Quote both paths so that names containing spaces or shell
    # metacharacters reach the shell as single arguments.
    run_path = shlex.quote(f"{run_output_folder}/{file}")
    trec_path = shlex.quote(f"{trec_output_folder}/{trec_output_file}")
    cmd = f"{trec_cmd} {run_path} > {trec_path}"

    trec_output_files.append(trec_output_file)
    shell_commands.append(cmd)

In [None]:
# The ">" redirection in each command cannot create its target directory,
# so make sure the TREC output folder exists before running anything.
os.makedirs(trec_output_folder, exist_ok=True)

for cmd in shell_commands:
    exit_status = os.system(cmd)
    # Keep going on failure (best effort), but do not hide it: a failed
    # command leaves an empty or partial TREC file behind.
    if exit_status != 0:
        print(f"[warning] command exited with status {exit_status}: {cmd}")

## Results
Extract the metrics from the TREC output

In [None]:
# TREC output files, in alphabetical order.
files = sorted(os.listdir(trec_output_folder))

In [None]:
# Fuller set, kept for reference:
# measurements = ["ndcg_cut_5", "ndcg_cut_10", "map_cut_5", "map_cut_10", "ndcg", "map" ,"set_recall"]
measurements = ["ndcg_cut_5", "ndcg_cut_10", "map_cut_5", "map_cut_10"]

def get_metrics(file_path: str, metrics: Optional[list] = None) -> dict:
    """Read the selected metric values from a TREC output file.

    A line is attributed to a metric when it starts with the metric name
    immediately followed by whitespace; the value is the last
    whitespace-separated field of that line. When a metric appears on
    several lines, the last occurrence wins.
    NOTE(review): with per-query output this relies on the summary row being
    printed after the per-query rows - confirm for the trec_eval in use.

    Args:
        file_path: Path of the TREC output file to parse.
        metrics: Metric names to extract; defaults to the module-level
            ``measurements`` list.

    Returns:
        Mapping of metric name to its value as a string. Metrics that do not
        occur in the file are absent from the mapping.
    """
    names = measurements if metrics is None else metrics

    # Compile once per call; raw strings keep "\s" a regex escape and
    # re.escape keeps metric names from being read as regex syntax.
    patterns = [(name, re.compile(rf"{re.escape(name)}\s")) for name in names]

    found = dict()
    with open(file_path) as f:
        for line in f:
            for name, pattern in patterns:
                if pattern.match(line):
                    found[name] = line.split()[-1]
    return found


In [None]:
# Metrics per model: the model name is the TREC file name without '-trec.txt'.
out = {
    trec_file.replace('-trec.txt', ''): get_metrics(f"{trec_output_folder}/{trec_file}")
    for trec_file in files
}

In [None]:
import pandas as pd

# One row per model, one column per metric: the outer keys of `out` become
# the index once the frame is transposed.
df = pd.DataFrame(out).T
df.index.name, df.columns.name = 'Model', 'Metric'

# Keep the rendered table in a variable; later cells search it row by row.
markdown_table = df.to_markdown()
print(markdown_table)

Synthetic row analysis

In [None]:
def find_row(markdown_table: str, target_name: str) -> Optional[str]:
    """Return the markdown table row whose first cell is ``target_name``.

    A row whose first cell equals ``target_name`` exactly (ignoring the
    padding before the closing ``|``) is preferred. If there is none, the
    first row whose first cell merely starts with ``target_name`` is
    returned instead.

    Args:
        markdown_table: Pipe-delimited markdown table, one row per line,
            each row starting with ``"| "``.
        target_name: Text to look for in the first cell.

    Returns:
        The matching line, or ``None`` when no row matches.
    """
    escaped_name = re.escape(target_name)
    # Raw strings keep "\|" a regex escape instead of an invalid string escape.
    exact_row = re.compile(rf"\| {escaped_name}\s*\|")
    prefix_row = re.compile(rf"\| {escaped_name}")

    first_prefix_hit = None
    for line in markdown_table.strip().split("\n"):
        if exact_row.match(line):
            return line
        # Remember only the first prefix hit; it is used when no exact row exists.
        if first_prefix_hit is None and prefix_row.match(line):
            first_prefix_hit = line
    return first_prefix_hit

In [None]:
# Row names follow "<model>-<fields>-ALL-QUERIES"; build them model by model
# so the order is CS, BM25, LMD, each with the same three field sets.
synthetic_rows = [
    f"{model}-{fields}-ALL-QUERIES"
    for model in ("CS", "BM25", "LMD")
    for fields in ("META+EXTRACTED", "META", "EXTRACTED")
]

# Print the table row of every synthetic run.
for row_name in synthetic_rows:
    print(find_row(markdown_table, row_name))