# Results

This notebook reads in (most of) the results and formats them into the tables in the paper.

When running the notebook, make sure that the paths have been updated correctly, e.g., `MODELS` in the third cell and `QPP_MODELS` in the QPP section.

In [None]:
import pandas as pd
import os
import json
from pathlib import Path
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
def read_file(fpath):
    """Return the ``"query_level"`` entry of the JSON document stored at ``fpath``.

    Raises ``KeyError`` if the document has no ``"query_level"`` key.
    """
    with open(fpath) as handle:
        payload = json.load(handle)
    return payload["query_level"]

def _strip_json_suffix(name):
    """Return ``name`` without a trailing ``.json`` extension (unchanged if absent)."""
    suffix = ".json"
    return name[: -len(suffix)] if name.endswith(suffix) else name


def read_results(path, flat=True):
    """Aggregate the per-query result files found in ``path`` into per-dataset means.

    The dataset a file belongs to is derived from its name:

    * ``dl*`` / ``dev*`` files that do not mention ``scifact``: the text before
      the first ``_`` (minus a ``.json`` extension).
    * any other file mentioning ``scifact``: ``"scifact"``.
    * ``bier_test_<name>...`` files that do not mention ``cqa``: the third
      ``_``-separated field (minus a ``.json`` extension).
    * any other file mentioning ``cqa``: pooled into the single ``"cqa"``
      dataset; query ids are prefixed with the third ``_``-separated field so
      ids from different files stay distinct.

    Remaining ``.json`` files are printed and skipped; non-JSON files are ignored.

    Args:
        path: directory containing the result files (``str`` or ``Path``).
        flat: if true, means are returned as ``{"<dataset>_<metric>": mean}``;
            otherwise as ``{dataset: {metric: mean}}``.

    Returns:
        ``(mean_results, mean_results_raw)`` where ``mean_results`` holds the
        means (rounded to 3 decimals) of the metrics listed in ``METRIC_SETS``
        and ``mean_results_raw`` maps dataset -> metric -> list of per-query values
        for every metric found in the files.

    Raises:
        AssertionError: if a reported metric does not have exactly
            ``QUERY_COUNTS[dataset]`` per-query values.
        KeyError: if a dataset with results is missing from ``METRIC_SETS`` or
            ``QUERY_COUNTS``.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    results = {}
    cqa = {}
    # Sorted so the order of the raw per-query lists does not depend on the file system.
    for fname in sorted(os.listdir(path)):
        if not fname.endswith(".json"):
            continue

        # Parenthesised explicitly: `and` binds tighter than `or`, so without the
        # parentheses the scifact exclusion would only apply to "dev" files.
        if (fname.startswith("dl") or fname.startswith("dev")) and "scifact" not in fname:
            # str.rstrip(".json") would strip any trailing '.', 'j', 's', 'o', 'n'
            # characters (e.g. "nfcorpus" -> "nfcorpu"); remove the suffix instead.
            results[_strip_json_suffix(fname.split("_")[0])] = read_file(path / fname)
        elif "scifact" in fname:
            results["scifact"] = read_file(path / fname)
        elif fname.startswith("bier_test") and "cqa" not in fname:
            results[_strip_json_suffix(fname.split("_")[2])] = read_file(path / fname)
        elif "cqa" in fname:
            cqa_split = _strip_json_suffix(fname.split("_")[2])
            queries = read_file(path / fname)
            cqa.update({cqa_split + "_" + qid: mets for qid, mets in queries.items()})
        else:
            # Unrecognised result file: report it and move on.
            print(fname)
    results["cqa"] = cqa

    mean_results_raw = defaultdict(lambda: defaultdict(list))
    mean_results = {}
    for dname, queries in results.items():
        for perfs in queries.values():
            for met, val in perfs.items():
                mean_results_raw[dname][met].append(val)

        if not flat:
            mean_results[dname] = {}

        for met, vals in mean_results_raw[dname].items():
            if met not in METRIC_SETS[dname]:
                continue
            assert len(vals) == QUERY_COUNTS[dname], (
                f"{path}: expected {QUERY_COUNTS[dname]} queries for {dname!r} "
                f"({met}), found {len(vals)}"
            )
            mean = round(np.mean(vals), 3)
            if flat:
                mean_results[f"{dname}_{met}"] = mean
            else:
                mean_results[dname][met] = mean

    return mean_results, mean_results_raw


# Metrics that are averaged per dataset; read_results skips every per-query
# metric that is not listed for the dataset here.
METRIC_SETS = {
    "dev": {"recip_rank_cut_10", "map"},
    "dl19": {"ndcg_cut_10", "map"},
    "dl20": {"ndcg_cut_10", "map"},
    "scifact": {"ndcg_cut_10"},
    "cqa": {"ndcg_cut_10"},
    "trec-covid": {"ndcg_cut_10"},
    "fiqa": {"ndcg_cut_10"},
}

# Number of queries expected per dataset; read_results asserts that each averaged
# metric has exactly this many per-query values.
QUERY_COUNTS = {
    "dev": 6980,
    "dl19": 43,
    "dl20": 54,
    "scifact": 300,
    "trec-covid": 50,
    "cqa": 13145,
    "fiqa": 648,
}


# Display name of each model -> directory that read_results scans for its result files.
# Update these paths before running the notebook.
MODELS = {
    "DPR":  Path("./gathered_results/dpr"),
    "TAS-B (0s)":  Path("./gathered_results/tasb/"), 
    "MVRL": Path("./gathered_results/mvrl"),
    "CLDRD": Path("./gathered_results/cldrd/"), 
}


# model name -> flat {"<dataset>_<metric>": rounded mean} dict, plus a "model" entry
MEAN_RESULTS = {}
# model name -> dataset -> metric -> list of per-query values
MEAN_RESULTS_RAW = {}
for model_name, res_path in MODELS.items():
    m, r = read_results(res_path)
    m["model"] = model_name  # becomes the "model" column of the results table
    MEAN_RESULTS[model_name] = m
    MEAN_RESULTS_RAW[model_name] = r

In [None]:
# Sanity check: MEAN_RESULTS holds one entry per model in MODELS.
len(MEAN_RESULTS)

In [None]:
# Column order of the results table; the names follow the "<dataset>_<metric>" keys
# produced by read_results(flat=True), preceded by the "model" label.
COL_ORDER = ['model',
 'dev_recip_rank_cut_10',
 'dev_map',
 'dl19_ndcg_cut_10',
 'dl19_map',
 'dl20_ndcg_cut_10',
 'dl20_map',
 'scifact_ndcg_cut_10',
 'fiqa_ndcg_cut_10',
 'trec-covid_ndcg_cut_10', 
 'cqa_ndcg_cut_10',
 ]


In [None]:
# One row per model, with the columns arranged as listed in COL_ORDER.
all_res_df = pd.DataFrame(MEAN_RESULTS.values())[COL_ORDER]
all_res_df

In [None]:
# Copy the results table (without the index column) to the system clipboard.
all_res_df.to_clipboard(index=False)

## QPP 

In [None]:
from scipy.stats import kendalltau, pearsonr, spearmanr
import ir_datasets

In [None]:
def _read_predictions(predicted_path, flip_sign=False):
    """Parse a whitespace-separated ``qid score`` file into ``(qid, score)`` pairs.

    Blank lines are skipped; scores are negated when ``flip_sign`` is true.
    """
    pairs = []
    with open(predicted_path) as reader:
        for line in reader:
            if not line.strip():
                # A stray empty line (e.g. at the end of the file) is not a prediction.
                continue
            qid, p = line.split()
            p = float(p)
            pairs.append((qid, -1 * p if flip_sign else p))
    return pairs


def _correlation_report(actual, pred):
    """Return Spearman, Pearson and Kendall-tau statistics and p-values as a dict."""
    # The scipy results unpack as (statistic, pvalue); unpacking avoids depending on
    # the ``.statistic`` attribute, which older scipy result objects do not provide.
    spearman, spearman_pval = spearmanr(actual, pred)
    pearson, pearson_pval = pearsonr(actual, pred)
    ktau, ktau_pval = kendalltau(actual, pred)
    return {
        "spearman": spearman,
        "spearman_pval": spearman_pval,
        "pearson": pearson,
        "pearson_pval": pearson_pval,
        "ktau": ktau,
        "ktau_pval": ktau_pval
    }


def compute_self_perf(run_path, predicted_path, flip_sign=False):
    """Correlate predicted query performance with a run's own per-query scores.

    Args:
        run_path: JSON file whose ``"query_level"`` entry maps each query id to a
            dict of metrics; the ``RUN_METRIC`` entry is used as actual performance.
        predicted_path: text file with one ``qid score`` pair per line.
        flip_sign: negate the predicted scores before correlating.

    Returns:
        dict with ``spearman``, ``pearson`` and ``ktau`` statistics and their
        ``*_pval`` p-values.

    Raises:
        KeyError: if a predicted query id is missing from the run file.
        ValueError: if a non-blank prediction line is not a ``qid score`` pair.
    """
    with open(run_path) as reader:
        query_level = json.load(reader)["query_level"]
    actual_performances = {qid: res_dict[RUN_METRIC] for qid, res_dict in query_level.items()}

    predictions = _read_predictions(predicted_path, flip_sign)
    actual = [actual_performances[qid] for qid, _score in predictions]
    pred = [score for _qid, score in predictions]
    return _correlation_report(actual, pred)


def compute_ref_perf(actual_perf_path, predicted_path, flip_sign=False):
    """Correlate predicted query performance with a reference system's scores.

    Args:
        actual_perf_path: JSON file mapping each query id to a dict of metrics;
            the ``QPP_METRIC`` entry is used as actual performance.
        predicted_path: text file with one ``qid score`` pair per line.
        flip_sign: negate the predicted scores before correlating.

    Returns:
        dict with ``spearman``, ``pearson`` and ``ktau`` statistics and their
        ``*_pval`` p-values.

    Raises:
        KeyError: if a predicted query id is missing from the reference file.
        ValueError: if a non-blank prediction line is not a ``qid score`` pair.
    """
    with open(actual_perf_path) as reader:
        act = json.load(reader)

    predictions = _read_predictions(predicted_path, flip_sign)
    actual = [act[qid][QPP_METRIC] for qid, _score in predictions]
    pred = [score for _qid, score in predictions]
    return _correlation_report(actual, pred)


**Note**

Update the variables below with the correct paths before running!

In [None]:
# QPP method name -> directory holding its prediction files; the cells below read
# "msmarco-<dataset>.txt" from it.
QPP_MODELS = {
    "MVRL": Path("./gathered_results/mvrl_qpp/")
}

# Same locations as BL_PATH below, as plain strings.
# NOTE(review): not referenced by any cell visible in this notebook - confirm before removing.
BASELINE_PATHS={
    "dl19": "../qpp_output/pre-retrieval/dl19/",
    "dl20": "../qpp_output/pre-retrieval/dl20/",
    "dev": "../qpp_output/pre-retrieval/dev/"
}

# QPP method name -> directory with the model's own run evaluations ("<dataset>.json"),
# used for the "self" reference.
SELF_PERF_MODELS = {
    "MVRL": Path("./gathered_results/mvrl_updated_runs")
}

# Reference system -> dataset -> JSON file with that system's per-query performance
# (read by compute_ref_perf).
OTHER_PERF = {
    "BM25": {
        "dl19": Path("../datasets/actual_performances/dl19_bm25.json"),
        "dl20": Path("../datasets/actual_performances/dl20_bm25.json"),
        "dev": Path("../datasets/actual_performances/dev_bm25.json")
    }, 
    "TASB": {
        "dl19": Path("../datasets/actual_performances/dl19_tasb.json"),
        "dl20": Path("../datasets/actual_performances/dl20_tasb.json"),
        "dev": Path("../datasets/actual_performances/dev_tasb.json")
    }, 
    "DPR": {
        "dl19": Path("../datasets/actual_performances/dl19_dpr.json"),
        "dl20": Path("../datasets/actual_performances/dl20_dpr.json"),
        "dev": Path("../datasets/actual_performances/dev_dpr.json"),
    }   
}

# Metric key read from the reference-performance files (compute_ref_perf) and from
# the run files' "query_level" entries (compute_self_perf), respectively.
QPP_METRIC = "ndcg@10"
RUN_METRIC = "ndcg_cut_10"

# Dataset -> directory of baseline predictions; each baseline variant is a file
# named after the variant (e.g. "IDF-avg").
BL_PATH = {
    "dl19": Path("../qpp_output/pre-retrieval/dl19/"),
    "dl20": Path("../qpp_output/pre-retrieval/dl20/"),
    "dev": Path("../qpp_output/pre-retrieval/dev/")
}

# Aggregation methods evaluated for the model-based predictors; each is substituted
# for a "{method}" placeholder in the model path, if the path contains one.
QPP_AGG_METHODS = {"norm"}
# When true, predictions of the "norm" aggregation are negated before correlating.
FLIP_NORM = True

# (family name, variants) pairs; the best variant per family is selected on dev.
QPP_BASELINES = [
    ("VAR", ('VAR-std-sum', 'VAR-std-max', 'VAR-var-sum', 'VAR-var-max', 'VAR-std-avg', 'VAR-var-avg')),
    ("PMI", ('PMI-avg', 'PMI-max', 'PMI-sum')),
    ("IDF", ('IDF-std', 'IDF-avg', 'IDF-max', 'IDF-sum')),
    ("SCQ", ('SCQ-avg','SCQ-sum', 'SCQ-max')),
]

# Correlation coefficient used to pick the best variant on the dev set.
best_corr_metric =  "spearman"

dev_all_rows = []
dev_best_rows = {}


# For every reference system, evaluate each variant of each QPP method on the dev
# set and remember the variant with the highest `best_corr_metric`.
for ap_name, ap_paths in OTHER_PERF.items():
    dset = "dev"
    ap_path = ap_paths[dset]

    # Baselines: one prediction file per variant inside the dev baseline directory.
    for qpp_method_name, qpp_bl_methods in QPP_BASELINES:
        qres = {}
        for m in qpp_bl_methods:
            ppath = BL_PATH[dset] / m
            qres[m] = compute_ref_perf(ap_path, ppath)
            qres[m].update(dataset=dset, ref=ap_name, param=m, method=qpp_method_name)
            dev_all_rows.append(qres[m])

        best_ = max(qres.values(), key=lambda row: row[best_corr_metric])
        dev_best_rows[(ap_name, qpp_method_name)] = best_

    # Model-based predictors: one prediction file per aggregation method.
    for qpp_method, path in QPP_MODELS.items():
        qres = {}
        for qpp_agg_method in QPP_AGG_METHODS:
            ppath = Path(str(path).format(method=qpp_agg_method)) / f"msmarco-{dset}.txt"
            # Not used in this cell; kept so the notebook namespace stays as before.
            m = qpp_method + f"({qpp_agg_method})"
            flip_sign = qpp_agg_method == "norm" and FLIP_NORM
            qres[qpp_agg_method] = compute_ref_perf(ap_path, ppath, flip_sign)
            qres[qpp_agg_method].update(
                dataset=dset, ref=ap_name, param=qpp_agg_method, method=qpp_method
            )
            dev_all_rows.append(qres[qpp_agg_method])

        best_ = max(qres.values(), key=lambda row: row[best_corr_metric])
        dev_best_rows[(ap_name, qpp_method)] = best_


print(dev_best_rows)

In [None]:
# Evaluate, on every dataset, the variant of each QPP method that was selected on
# the dev set (dev_best_rows), first against the reference systems and then
# against the model's own runs ("self").
qpp_records = []

for ap_name, ap_paths in OTHER_PERF.items():
    for dset, ap_path in ap_paths.items():

        ## baseline results, using the variant picked on dev for this reference
        for qpp_method, qpp_bl_methods in QPP_BASELINES:
            qpp_param = dev_best_rows[(ap_name, qpp_method)]["param"]
            ppath = BL_PATH[dset] / qpp_param
            res = compute_ref_perf(ap_path, ppath)
            res.update(dataset=dset, ref=ap_name, method=qpp_method, param=qpp_param)
            qpp_records.append(res)

        ## model results wrt ref APs
        for qpp_method, path in QPP_MODELS.items():
            qpp_agg_method = dev_best_rows[(ap_name, qpp_method)]["param"]
            ppath = Path(str(path).format(method=qpp_agg_method)) / f"msmarco-{dset}.txt"
            flip_sign = qpp_agg_method == "norm" and FLIP_NORM
            res = compute_ref_perf(ap_path, ppath, flip_sign)
            res.update(dataset=dset, ref=ap_name, method=qpp_method, param=qpp_agg_method)
            qpp_records.append(res)

## model results wrt self
# No dev selection exists for the "self" reference, so the aggregation method chosen
# for the last reference system in OTHER_PERF is reused. This was previously implicit
# (the value came from the loop variable left over by the loop above); it is now
# stated explicitly so the section does not depend on leftover loop state.
self_param_ref = list(OTHER_PERF)[-1]
for dset in ["dl19", "dl20"]:
    for qpp_method, path in QPP_MODELS.items():
        run_path = SELF_PERF_MODELS[qpp_method] / (dset + ".json")
        qpp_agg_method = dev_best_rows[(self_param_ref, qpp_method)]["param"]
        ppath = Path(str(path).format(method=qpp_agg_method)) / f"msmarco-{dset}.txt"
        flip_sign = qpp_agg_method == "norm" and FLIP_NORM
        # A fresh dict per row: nothing is written into variables of earlier cells.
        res = compute_self_perf(run_path, ppath, flip_sign)
        res.update(dataset=dset, ref="self", method=qpp_method, param=qpp_agg_method)
        qpp_records.append(res)


# Built once from the list of records; `qpp_rows` is only ever a DataFrame.
qpp_rows = pd.DataFrame(qpp_records)

In [None]:
# Long-format QPP results: one row per (reference, dataset, method) evaluation.
qpp_rows

In [None]:
def reformat_qpp(df, drop_dev=True, drop_pval=True):
    """Pivot long-format QPP rows into one wide row per ``(method, ref)`` pair.

    Each output row carries ``method``, ``ref`` and ``param`` (taken from the first
    input row of the pair) followed by ``"<dataset>-<statistic>"`` columns.

    Args:
        df: frame with ``dataset``, ``method``, ``ref``, ``param`` and the
            correlation columns.
        drop_dev: skip rows whose dataset is ``"dev"``.
        drop_pval: leave out the ``*_pval`` columns.
    """
    stat_cols = ("spearman", "spearman_pval", "pearson", "pearson_pval", "ktau", "ktau_pval")
    kept_cols = [name for name in stat_cols if not (drop_pval and "_pval" in name)]

    wide = {}
    for _idx, record in df.iterrows():
        dataset = record["dataset"]
        if drop_dev and dataset == "dev":
            continue
        key = (record["method"], record["ref"])
        if key not in wide:
            wide[key] = {"method": key[0], "ref": key[1], "param": record["param"]}
        wide[key].update({f"{dataset}-{name}": record[name] for name in kept_cols})
    return pd.DataFrame(wide.values())

def reformat_qpp2(df, drop_dev=True, drop_pval=True):
    """Pivot long-format QPP rows into one wide row per method.

    Each output row carries ``method``, a ``"param-<ref>"`` column per reference
    (the value from the last processed row of that reference) and
    ``"<ref>-<dataset>-<statistic>"`` columns.

    Args:
        df: frame with ``dataset``, ``method``, ``ref``, ``param`` and the
            correlation columns.
        drop_dev: skip rows whose dataset is ``"dev"``.
        drop_pval: leave out the ``*_pval`` columns.
    """
    stat_cols = ("spearman", "spearman_pval", "pearson", "pearson_pval", "ktau", "ktau_pval")
    kept_cols = [name for name in stat_cols if not (drop_pval and "_pval" in name)]

    by_method = {}
    for _idx, record in df.iterrows():
        dataset = record["dataset"]
        if drop_dev and dataset == "dev":
            continue
        method = record["method"]
        entry = by_method.setdefault(method, {"method": method})
        ref = record["ref"]
        entry[f"param-{ref}"] = record["param"]
        for name in kept_cols:
            entry[f"{ref}-{dataset}-{name}"] = record[name]
    return pd.DataFrame(by_method.values())


# Wide table with one row per QPP method (dev rows and p-values dropped by default).
best_qpp = reformat_qpp2(qpp_rows)
best_qpp

In [None]:
# Column order of the per-method table: for each reference system the dl19 and dl20
# correlations followed by the selected parameter.
QPP_ORDER = ["method", "BM25-dl19-spearman", "BM25-dl19-pearson", "BM25-dl19-ktau", 
             "BM25-dl20-spearman", "BM25-dl20-pearson", "BM25-dl20-ktau", "param-BM25",
            "DPR-dl19-spearman", "DPR-dl19-pearson", "DPR-dl19-ktau", 
             "DPR-dl20-spearman", "DPR-dl20-pearson", "DPR-dl20-ktau", "param-DPR",
            "TASB-dl19-spearman", "TASB-dl19-pearson", "TASB-dl19-ktau", 
             "TASB-dl20-spearman", "TASB-dl20-pearson", "TASB-dl20-ktau", "param-TASB"]

# Built as one chain so that no column is assigned on a frame obtained by selection
# (which can trigger pandas' SettingWithCopyWarning). The categorical fixes the row
# order of the methods; methods outside the list become NaN.
best_qpp = (
    best_qpp[QPP_ORDER]
    .assign(
        method=lambda frame: pd.Categorical(
            frame["method"], ["IDF", "VAR", "SCQ", "PMI", "MCDropout-DB", "MVRL"]
        )
    )
    .sort_values(by=["method"])
)
# Copy the table (without the index column) to the system clipboard.
best_qpp.to_clipboard(index=False)

In [None]:
# Per-(method, reference) table, restricted to the "self" reference.
best_qpp = reformat_qpp(qpp_rows)
QPP_ORDER = ["ref", "method", "dl19-spearman", "dl19-pearson", "dl19-ktau", "dl20-spearman", "dl20-pearson", "dl20-ktau", "param"]

# Built as one chain so that no column is assigned on a frame obtained by selection
# (which can trigger pandas' SettingWithCopyWarning). The categorical fixes the row
# order of the methods; methods outside the list become NaN.
best_qpp = (
    best_qpp[QPP_ORDER]
    .assign(
        method=lambda frame: pd.Categorical(
            frame["method"], ["IDF", "VAR", "SCQ", "PMI", "MCDropout-DB", "MVRL"]
        )
    )
    .sort_values(by=["ref", "method"])
)
best_qpp_sel = best_qpp[best_qpp["ref"].isin(["self"])]
# Copy the selection (without the index column) to the system clipboard.
best_qpp_sel.to_clipboard(index=False)