In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.patches import Patch
from collections import defaultdict
import json
import numpy as np
import statistics
from sklearn.model_selection import train_test_split, KFold
from utils import convert_run, dcg, dcg_from_run, dcg_for_query, load_umbrela_scores, make_equal_content_bins, assign_to_bin, prettify_label
import matplotlib as mpl
mpl.rcParams["font.family"] = "Times New Roman"

In [2]:
# Load the per-query table that every later cell reads through the global `data_df`.
data_df = pd.read_csv("data/full_data.tsv", sep="\t")

# Per-query ratio of the two DCG columns below (presumably best-achievable DCG over
# DCG after reranking the full list -- confirm against the data dictionary).
# NOTE(review): no guard for a zero denominator here; such rows become inf/NaN.
data_df = data_df.assign(
    max_vs_fully_reranked=lambda frame: (
        frame['max_dcg_value_mono_1000'] / frame['mono_1000_reranked_all_dcg_value']
    )
)

# Dataset-wide averages of the ratio and of the depth column.
max_vs_fully_reranked_ratio_avg = data_df['max_vs_fully_reranked'].mean()
optimal_depth_avg = data_df['max_dcg_value_mono_1000_depth'].mean()

In [3]:
def equal_content_prediction_ten_fold_validation(initial_metric_col, fully_reranked_col, retriever, metric, 
                                                 reranker, rerank_depth_array, no_plot=False):
    """Predict per-query rerank depths with 10-fold cross-validation over bins of a predictor column.

    For every fold, the training queries are bucketed by ``initial_metric_col``
    using bin edges from ``make_equal_content_bins(..., n_bins=10)``. Within each
    training bin the values of ``max_{metric}_value_{reranker}_depth`` are
    collected, and for each percentile in 90..100 the (truncated-to-int)
    percentile of those depths becomes the predicted depth for test queries that
    fall into the same bin. The metric value actually obtained at that depth is
    read from the per-query/per-depth lookup CSV.

    Side effects:
        * Reads the module-level ``data_df`` (it is not passed in).
        * Reads ``data/all_queries_all_depth-{retriever}-{reranker}-{metric}.csv``.
        * Writes every per-query prediction row to
          ``data/equal_content_10_fold-{retriever}-{reranker}-{metric}-{initial_metric_col}.csv``.

    Args:
        initial_metric_col: Column of ``data_df`` used to bin queries.
        fully_reranked_col: Column of ``data_df`` used as the denominator of the
            "reranked all" ratio.
        retriever: Retriever name; only used to build the CSV file names.
        metric: Metric name; used in column names and CSV file names.
        reranker: Reranker name; used in column names and CSV file names.
        rerank_depth_array: Depths at which the "rerank every query to the same
            depth" baseline is evaluated. Each depth must exist in the lookup CSV.
        no_plot: Unused by this function; kept so existing callers keep working.

    Returns:
        A tuple ``(fixed_depth_ratio, grouped_ratio)`` where ``fixed_depth_ratio``
        maps each depth in ``rerank_depth_array`` to the mean of
        ``metric_value_at_depth / fully_reranked_col`` over all queries, and
        ``grouped_ratio`` is a DataFrame with one row per percentile (columns
        ``Percentile``, ``avg_predicted_depth``,
        ``avg_predicted_vs_fully_reranked_ratio``, ``avg_predicted_vs_max_ratio``)
        sorted by ``avg_predicted_depth``.

    Raises:
        KeyError: If a required column is missing, or a (query_id, depth) pair /
            fixed depth is absent from the lookup table.
    """
    max_value_col = f"max_{metric}_value_{reranker}"
    max_depth_col = f"max_{metric}_value_{reranker}_depth"

    main_df = data_df[["query_id", initial_metric_col, fully_reranked_col,
                       max_value_col, max_depth_col]].copy()

    metric_value_lookup_df = pd.read_csv(f"data/all_queries_all_depth-{retriever}-{reranker}-{metric}.csv")
    metric_value_lookup_df_indexed = metric_value_lookup_df.set_index(["query_id", "depth"])

    def bin_sort_key(label):
        # Bin labels are ordered by the number that precedes '~'.
        return float(label.split('~')[0])

    percentiles = list(range(90, 101))
    data = []

    # Fixed seed, so the fold assignment is reproducible between runs.
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    for index, (train_idx, test_idx) in enumerate(kf.split(main_df)):
        train_df = main_df.iloc[train_idx].copy()
        test_df = main_df.iloc[test_idx].copy()

        # Bin edges come from the training fold only, so the test fold never leaks into them.
        bin_edges = make_equal_content_bins(train_df[initial_metric_col], n_bins=10)

        # Label both folds with the same edges; categories are the labels observed in that fold.
        for fold_df in (train_df, test_df):
            labels = fold_df[initial_metric_col].apply(lambda x: assign_to_bin(x, bin_edges))
            fold_df['metric_bin'] = pd.Categorical(
                labels,
                ordered=True,
                categories=sorted(set(labels), key=bin_sort_key)
            )

        # training bin -> sorted list of the depths stored in max_depth_col
        train_initial_metric_to_optimal_depths = (
            train_df.groupby("metric_bin", observed=True)[max_depth_col]
              .apply(lambda x: sorted(x.tolist()))
              .to_dict()
        )

        # The test rows of a bin do not depend on the percentile, so select them once.
        # Test queries whose bin has no training data are never visited and produce no row.
        test_rows_by_bin = {
            metric_bin: test_df[test_df["metric_bin"] == metric_bin]
            for metric_bin in train_initial_metric_to_optimal_depths
        }

        for p in percentiles:
            for metric_bin, depths in train_initial_metric_to_optimal_depths.items():
                # The pth percentile of the training depths (truncated to int) is the
                # predicted depth for every test query in this bin.
                depth_p = int(np.percentile(depths, p))

                for _, row in test_rows_by_bin[metric_bin].iterrows():
                    # The lookup table is already specific to `metric` (see its file name),
                    # so the lookup is done for every metric. Previously it was guarded by
                    # `metric == "dcg"`, which left the value unbound for any other metric.
                    metric_at_depth_p = metric_value_lookup_df_indexed.loc[
                        (row['query_id'], depth_p), "metric_value_at_depth"
                    ]
                    fully_reranked_value = row[fully_reranked_col]
                    max_value = row[max_value_col]

                    data.append([index, p, row['query_id'], depth_p,
                                 fully_reranked_value,
                                 row[initial_metric_col],
                                 max_value,
                                 row[max_depth_col],
                                 metric_bin,
                                 metric_at_depth_p,
                                 # A zero denominator is reported as ratio 0 rather than inf/NaN.
                                 0 if fully_reranked_value == 0 else metric_at_depth_p / fully_reranked_value,
                                 0 if max_value == 0 else metric_at_depth_p / max_value,
                                ])

    df = pd.DataFrame(data, columns=['Fold #', 'Percentile', 'Query ID', 'Predicted Depth', 'Reranked All Metric Value', 
                                     'Initial Metric Value', 'Maximum Metric Value', 'Depth of Maximum Metric Value',
                                     'Bin',
                                     'Metric Value At Predicted Depth', 'Ratio (Metric at Depth / Reranked All)',
                                     'Ratio (Metric at Depth / Maximum Metric Value)'])

    df.to_csv(f'data/equal_content_10_fold-{retriever}-{reranker}-{metric}-{initial_metric_col}.csv', index=False)

    # Baseline: rerank every query to the same fixed depth.
    # NOTE(review): unlike the per-query ratios above, this division has no zero guard,
    # so a zero in `fully_reranked_col` yields inf/NaN here. Left unchanged to keep the
    # returned numbers identical; confirm which convention is intended.
    all_same_depth_avg_predicted_vs_fully_reranked_ratio = {}

    for d in rerank_depth_array:
        # metric values of all queries at depth d
        depth_df = metric_value_lookup_df_indexed.xs(d, level="depth")[
            ["metric_value_at_depth"]
        ].reset_index()

        merged = depth_df.merge(main_df, on="query_id", how="inner")
        all_same_depth_avg_predicted_vs_fully_reranked_ratio[d] = (
            merged["metric_value_at_depth"] / merged[fully_reranked_col]
        ).mean()

    # Average the per-query predictions for each percentile (pooled over all folds).
    grouped_ratio = df.groupby("Percentile").agg(
        avg_predicted_depth=("Predicted Depth", "mean"),
        avg_predicted_vs_fully_reranked_ratio=("Ratio (Metric at Depth / Reranked All)", "mean"),
        avg_predicted_vs_max_ratio=("Ratio (Metric at Depth / Maximum Metric Value)", "mean")
    ).reset_index()
    grouped_ratio = grouped_ratio.sort_values("avg_predicted_depth")

    return all_same_depth_avg_predicted_vs_fully_reranked_ratio, grouped_ratio

In [4]:
# Obtain all QPP's data
# `reranker` and `metric` are the single source of truth for this cell: they are
# passed to the validation function and used to build the fully-reranked column
# name, so changing them here actually changes the run.
reranker = "mono_1000"
metric = "dcg"
qpp_metrics = ['NQC', 'UEF', 'RSD', 'OddsRatio', 'WIG', 'Scaled NQC', 'QV-NQC Variant', 'DenseQPP', 'SMV', 'Sigma_max', 'Sigma_x']
qpp_rows = []

for qpp_metric in qpp_metrics:
    # Only the per-percentile summary is needed; the fixed-depth baseline is discarded.
    _, grouped_ratio = equal_content_prediction_ten_fold_validation(
        qpp_metric, f"{reranker}_reranked_all_{metric}_value", "bm25", metric, reranker,
        [0, 1, 2, 5, 10, 20, 30, 50, 100, 250, 500, 800, 1000],
        no_plot=True)

    mask = grouped_ratio["avg_predicted_vs_fully_reranked_ratio"] >= 1
    if mask.any(): # Get the smallest threshold such that ratio >= 1
        result_row = grouped_ratio.loc[mask].sort_values("Percentile").iloc[0]
    else: # Return the threshold that yields the maximum ratio if it never reaches 1
        result_row = grouped_ratio.loc[
            grouped_ratio["avg_predicted_vs_fully_reranked_ratio"].idxmax()
        ]

    qpp_rows.append({
        "QPP Measure": qpp_metric,
        # Percentile (90..100) expressed as a fraction.
        "Threshold": result_row['Percentile'] / 100,
        "Average DCG@10 Ratio": result_row["avg_predicted_vs_fully_reranked_ratio"],
        "Average Predicted Depths": result_row["avg_predicted_depth"],
    })

qpp_df = pd.DataFrame(qpp_rows)

  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)


In [5]:
# Emit the QPP summary as a LaTeX table (3 decimals, no index column); print() is
# used on purpose so the raw LaTeX source can be copied from the cell output.
# NOTE(review): the saved output below lists the columns in a different order than
# `qpp_rows` builds them, so it looks stale -- re-run this cell to refresh it.
print(qpp_df.to_latex(index=False, float_format="%.3f"))

\begin{tabular}{lrrr}
\toprule
QPP Measure & Average Predicted Depths & Average DCG@10 Ratio & Threshold \\
\midrule
NQC & 846.439 & 1.000 & 0.980 \\
UEF & 736.390 & 1.000 & 0.960 \\
RSD & 533.900 & 1.000 & 0.900 \\
OddsRatio & 923.637 & 1.000 & 1.000 \\
WIG & 961.294 & 0.999 & 1.000 \\
Scaled NQC & 846.439 & 1.000 & 0.980 \\
QV-NQC Variant & 846.439 & 1.000 & 0.980 \\
DenseQPP & 992.380 & 0.999 & 1.000 \\
SMV & 764.217 & 1.001 & 0.970 \\
Sigma_max & 894.311 & 0.999 & 0.990 \\
Sigma_x & 834.596 & 0.999 & 0.980 \\
\bottomrule
\end{tabular}

