In [None]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import os
import json
from matplotlib import pyplot as plt
import matplotlib as mpl


# Organizing results

In [None]:
# Root directory that holds one sub-directory per experiment family.
base_path = ""

# Experiment families that `preprocess` labels with the pointwise loss.
paths_pointwise = [
    base_path + sub_dir
    for sub_dir in ("gm_sample_queries/", "gm_sample_docs/", "dm/")
]
# Experiment family that `preprocess` labels with the listwise (LambdaLoss) loss.
paths_lambda = [base_path + "gm_sample_queries_lambda_loss/"]

In [None]:
def preprocess(paths):
    """Collect the configuration and summary metrics of every run into one frame.

    Each directory in ``paths`` is scanned for run directories. A run
    contributes one row when it is a directory containing ``summary.csv``;
    stray files and runs that have not written a summary yet are skipped.

    Args:
        paths: Iterable of directories whose children are run directories.

    Returns:
        DataFrame with one row per run, sorted by ``distillation``,
        ``dataset``, ``rand_init`` and ``z_rate`` and de-duplicated on
        ``(distillation, dataset, rand_init, z_rate, alg, seed)``.

    Raises:
        NotImplementedError: If a config names an unknown experiment.
        KeyError: If a config names a dataset ID missing from ``ds_map``.
        FileNotFoundError: If a run has ``summary.csv`` but no ``config.json``.
    """
    ds_map = {
        "ISTELLAS": "Istella-S",
        "MSLR30K": "MSLR-WEB30k",
        "YAHOO": "Yahoo!Webscope"
    }

    # Experiment name -> (distillation, alg, loss) labels.
    exp_map = {
        "gm_sample_queries": ("Q", "GM", "pointwise"),
        "gm_sample_docs": ("D", "GM", "pointwise"),
        "dm": ("L", "DM", "pointwise"),
        "gm_sample_queries_lambda_loss": ("Q", "GM", "listwise"),
        "gm_sample_docs_lambda_loss": ("D", "GM", "listwise"),
    }

    records = []
    for path_dir in paths:
        # Sorted so that the run kept by ``drop_duplicates`` below does not
        # depend on the arbitrary order returned by ``os.listdir``.
        for entry in sorted(os.listdir(path_dir)):
            dir_adr = os.path.join(path_dir, entry)
            summary_path = os.path.join(dir_adr, "summary.csv")
            # Skip stray files and runs without a summary instead of crashing.
            if not os.path.isdir(dir_adr) or not os.path.isfile(summary_path):
                continue

            with open(os.path.join(dir_adr, "config.json")) as config_file:
                config = json.load(config_file)

            exp_name = config["Experiment"]["Name"]
            if exp_name not in exp_map:
                raise NotImplementedError("Invalid exp_name {}".format(exp_name))
            distillation, alg, loss = exp_map[exp_name]

            distilled_cfg = config["Datasets"]["Distilled"]
            smr_df = pd.read_csv(summary_path)

            # ``.item()`` requires every summary column to hold exactly one value.
            records.append({
                "dataset": ds_map[config["Datasets"]["Full"]["ID"]],
                "distillation": distillation,
                "alg": alg,
                "loss": loss,
                "z_rate": distilled_cfg["CompressionRatio"],
                "rand_init": distilled_cfg["RandInit"],
                "batch_size": config["Trainer"]["BatchSize"],
                "seed": config["General"]["Seed"],
                "ndcg_final": smr_df["NDCG@10-FinalEval-DistillInfoTest-Mean"].item(),
                "ndcg_init": smr_df["NDCG@10-InitEval-DistillInfoTest-Mean"].item(),
                "arp_final": smr_df["ARP-FinalEval-DistillInfoTest-Mean"].item(),
                "arp_init": smr_df["ARP-InitEval-DistillInfoTest-Mean"].item(),
                "ndcg_full": smr_df["NDCG@10-InitEval-FullInfoTest"].item(),
                "arp_full": smr_df["ARP-InitEval-FullInfoTest"].item(),
                # Size of the distilled set relative to the full set.
                "r_rate": smr_df["total_qd_vecs_ds"].item() / smr_df["total_qd_vecs_full"].item(),
                "best_step": smr_df["best_step"].item(),
                "dist_valid_ndcg": smr_df["best_valid_ndcg10"].item(),
                "init_valid_ndcg": smr_df["NDCG@10-InitEval-DistilledInfoValidationBest"].item(),
                "exp_path": dir_adr,
            })

    df = pd.DataFrame(records)
    df = df.sort_values(by=["distillation", "dataset", "rand_init", "z_rate"])
    df = df.drop_duplicates(["distillation", "dataset", "rand_init", "z_rate", "alg", "seed"])

    return df


In [None]:
# Load all pointwise-loss runs and print the number of runs per configuration.
df_pointwise = preprocess(paths_pointwise)
print(
    df_pointwise
    .groupby(["distillation", "dataset", "rand_init"])
    .size()
)

In [None]:
# Load all LambdaLoss runs and print the number of runs per configuration.
df_lambda = preprocess(paths_lambda)
print(df_lambda.groupby(["distillation", "dataset", "rand_init"]).size())

# Keep only the batch-size-64 runs on MSLR-WEB30k (every run on the other
# datasets) and only query-level distillation. The filter is applied to
# `df_lambda` itself: the previous version referenced an undefined `df`,
# which fails on a fresh kernel.
cond1 = (df_lambda["dataset"] == "MSLR-WEB30k") & (df_lambda["batch_size"] == 64)
cond2 = (df_lambda["dataset"] != "MSLR-WEB30k")
df_lambda = df_lambda[cond1 | cond2]
df_lambda = df_lambda[df_lambda["distillation"] == "Q"]
print(df_lambda.groupby(["distillation", "dataset", "rand_init"]).size())
df_lambda[df_lambda["dataset"] == "MSLR-WEB30k"]

# Convergence Speed

### MSE Loss

In [None]:
# `best_step` summary per dataset, initialisation and distillation type.
(
    df_pointwise
    .groupby(["dataset", "rand_init", "distillation"])["best_step"]
    .describe()
)

In [None]:
# `best_step` summary per distillation type and initialisation, rounded to one decimal.
(
    df_pointwise
    .groupby(["distillation", "rand_init"])["best_step"]
    .describe()
    .round(1)
)

In [None]:
# `best_step` summary per initialisation.
(
    df_pointwise
    .groupby(["rand_init"])["best_step"]
    .describe()
)

In [None]:
# `best_step` summary per distillation type.
(
    df_pointwise
    .groupby(["distillation"])["best_step"]
    .describe()
)

In [None]:
# `best_step` summary per dataset.
(
    df_pointwise
    .groupby(["dataset"])["best_step"]
    .describe()
)

In [None]:
# `best_step` summary per dataset and initialisation.
(
    df_pointwise
    .groupby(["dataset", "rand_init"])["best_step"]
    .describe()
)

### LambdaLoss

In [None]:
# `best_step` summary per dataset, initialisation and distillation type.
(
    df_lambda
    .groupby(["dataset", "rand_init", "distillation"])["best_step"]
    .describe()
)

In [None]:
# `best_step` summary per distillation type and initialisation, rounded to one decimal.
(
    df_lambda
    .groupby(["distillation", "rand_init"])["best_step"]
    .describe()
    .round(1)
)

In [None]:
# `best_step` summary per initialisation.
(
    df_lambda
    .groupby(["rand_init"])["best_step"]
    .describe()
)

In [None]:
# `best_step` summary per distillation type.
(
    df_lambda
    .groupby(["distillation"])["best_step"]
    .describe()
)

In [None]:
# `best_step` summary per dataset.
(
    df_lambda
    .groupby(["dataset"])["best_step"]
    .describe()
)

In [None]:
# `best_step` summary per dataset and initialisation.
(
    df_lambda
    .groupby(["dataset", "rand_init"])["best_step"]
    .describe()
)

# Figures

In [None]:
def full_ndcg_arp(df, metric):
    """Attach the mean and a mean ± 1.645·std band of ``<metric>_full`` to every row.

    Args:
        df: Frame containing a ``<metric>_full`` column.
        metric: Column prefix, e.g. ``"ndcg"`` or ``"arp"``.

    Returns:
        A copy of ``df`` with three extra constant columns:
        ``<metric>_full_mean``, ``<metric>_full_min`` and ``<metric>_full_max``.
        The standard deviation is the one returned by ``np.std`` on the column.
    """
    alpha = 1.645
    full_values = df[metric + "_full"]
    center = np.mean(full_values)
    half_width = alpha * np.std(full_values)

    # Work on a copy: mutating the object handed over by ``groupby(...).apply``
    # is not supported by pandas, and callers keep their frame untouched.
    df = df.copy()
    df[metric + "_full_mean"] = center
    df[metric + "_full_min"] = center - half_width
    df[metric + "_full_max"] = center + half_width
    return df

In [None]:
def max_min_res(df, metric):
    """Attach mean and mean ± 1.645·std bands of the final and initial metric.

    Args:
        df: Frame containing ``<metric>_final`` and ``<metric>_init`` columns.
        metric: Column prefix, e.g. ``"ndcg"`` or ``"arp"``.

    Returns:
        A copy of ``df`` with, for each of ``final`` and ``init``, the constant
        columns ``<metric>_<stage>_min``, ``<metric>_<stage>_max`` and
        ``<metric>_<stage>_mean``. The standard deviation is the one returned
        by ``np.std`` on the column.
    """
    alpha = 1.645
    source = df
    # Work on a copy: mutating the object handed over by ``groupby(...).apply``
    # is not supported by pandas, and callers keep their frame untouched.
    df = df.copy()

    for stage in ("final", "init"):
        prefix = "{}_{}".format(metric, stage)
        values = source[prefix]
        center = np.mean(values)
        half_width = alpha * np.std(values)
        df[prefix + "_min"] = center - half_width
        df[prefix + "_max"] = center + half_width
        df[prefix + "_mean"] = center

    return df


def df_plots(df, metric):
    """Draw and save the size-vs-quality figure for one (distillation, dataset) group.

    The runs of every ``(z_rate, rand_init)`` combination are summarised with
    ``max_min_res`` and one row per combination is kept. Against ``r_rate``
    the figure shows, each with its min/max band:

    * ``rand_init == True``: final mean ("<alg>-<distillation>-R") and
      initial mean ("RV"),
    * ``rand_init == False``: final mean ("<alg>-<distillation>-NR") and
      initial mean ("<distillation>-S"),
    * the ``<metric>_full_*`` reference ("Full").

    The figure is written to ``./wandb/distillation_figs/`` and shown.

    Args:
        df: Rows of a single dataset and distillation type. Must already
            contain the ``<metric>_full_mean/min/max`` columns.
        metric: ``"ndcg"`` or ``"arp"``.

    Returns:
        The summarised frame (one row per ``(z_rate, rand_init)``) that was plotted.

    Raises:
        NotImplementedError: If ``metric`` is neither ``"ndcg"`` nor ``"arp"``.
    """
    # NOTE: these rcParams changes are global and persist after the call.
    mpl.rcParams['axes.titleweight'] = 'bold'
    mpl.rcParams['axes.labelweight'] = 'bold'
    fontsize = 20
    ms = 18
    lw = 2
    alg = df["alg"].head(1).item()
    dtype = df["distillation"].head(1).item()
    Gold = '#FFD700'
    Blue = '#1f77b4'
    Maroon = '#800000'

    fig = plt.figure(figsize=(10, 12))
    ax = fig.add_subplot(111)

    # Summarise each group explicitly instead of ``groupby(...).apply``: with
    # pandas >= 2.0 a row-preserving apply adds the group keys as index levels,
    # which makes the ``sort_values`` below fail on ambiguous labels.
    df = pd.concat([
        max_min_res(group.copy(), metric)
        for _, group in df.groupby(["z_rate", "rand_init"])
    ])
    df = df.drop_duplicates(["dataset", "distillation", "z_rate", "rand_init"])
    df = df.sort_values(["dataset", "distillation", "rand_init", "z_rate"])

    # --- runs with rand_init == True ---
    df_temp = df[df["rand_init"] == True]

    did = df_temp["dataset"].head(1).item()
    distillation_type = df_temp["distillation"].head(1).item()
    loss = df_temp["loss"].head(1).item()

    x_values = df_temp["r_rate"]

    ax.plot(x_values, df_temp[metric+"_final_mean"], '*--', label="{}-{}-R".format(alg, dtype), markersize=ms, linewidth=lw, color=Gold)
    ax.plot(x_values, df_temp[metric+"_init_mean"], 'p--', label="RV", markersize=ms, linewidth=lw, color=Maroon)
    ax.fill_between(x_values, df_temp[metric+"_final_min"], df_temp[metric+"_final_max"], color=Gold, alpha=0.2)
    ax.fill_between(x_values, df_temp[metric+"_init_min"], df_temp[metric+"_init_max"], color=Maroon, alpha=0.2)

    # --- runs with rand_init == False ---
    df_temp = df[df["rand_init"] == False]

    x_values = df_temp["r_rate"]

    ax.plot(x_values, df_temp[metric+"_final_mean"], 'o--', label="{}-{}-NR".format(alg, dtype), markersize=ms, linewidth=lw, color=Blue)
    ax.plot(x_values, df_temp[metric+"_init_mean"], 'g^--', label="{}-S".format(dtype), markersize=ms, linewidth=lw)
    ax.fill_between(x_values, df_temp[metric+"_init_min"], df_temp[metric+"_init_max"], color='g', alpha=0.2)
    ax.fill_between(x_values, df_temp[metric+"_final_min"], df_temp[metric+"_final_max"], color=Blue, alpha=0.2)

    # Full-data reference, drawn over the x positions of the rand_init == False runs.
    ax.plot(x_values, df_temp[metric+"_full_mean"], '--', label="Full", linewidth=lw*2, color='Purple')
    ax.fill_between(x_values, df_temp[metric+"_full_min"], df_temp[metric+"_full_max"], color='Purple', alpha=0.2)

    if metric == "ndcg":
        y_l = "NDCG@10"
        y_ticks = np.arange(0, 0.85, 0.1)
    elif metric == "arp":
        y_l = "ARP"
        y_ticks = np.arange(0, 80, 10)
    else:
        raise NotImplementedError

    plt.title(did, fontsize=fontsize)
    plt.xlabel("Relative dataset size to the full dataset", fontsize=fontsize)
    plt.ylabel("Average {}".format(y_l), fontsize=fontsize)

    # Tick at every plotted relative size, labelled as a percentage.
    x_ticks = x_values
    custom_x_ticks = x_ticks*100
    custom_x_ticks = custom_x_ticks.round(2).apply(lambda x: str(x)+'%')
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(custom_x_ticks, rotation=22.5)
    ax.set_yticks(y_ticks)
    plt.xticks(fontsize=fontsize)
    plt.yticks(fontsize=fontsize)

    plt.legend(fontsize=17, loc='lower left')

    plt.grid()

    # Create the output directory on first use so savefig cannot fail on a fresh checkout.
    os.makedirs("./wandb/distillation_figs", exist_ok=True)
    plt.savefig("./wandb/distillation_figs/{}-{}-{}-{}-{}.pdf".format(metric, did,alg,distillation_type, loss), transparent=True)
    plt.show()
    # Release the figure; this function is called once per group and metric.
    plt.close(fig)

    return df


### MSE Loss

In [None]:
# Attach the per-dataset full-data reference statistics for both metrics.
# Groups are processed explicitly instead of through ``groupby(...).apply``:
# with pandas >= 2.0 a row-preserving apply adds "dataset" as an index level,
# which makes the next ``groupby(["dataset"])`` ambiguous.
df_pointwise = pd.concat(
    [full_ndcg_arp(group.copy(), "ndcg") for _, group in df_pointwise.groupby(["dataset"])]
)
df_pointwise = pd.concat(
    [full_ndcg_arp(group.copy(), "arp") for _, group in df_pointwise.groupby(["dataset"])]
)

In [None]:
# One figure per (distillation, dataset) pair and metric. A plain loop is used
# because `df_plots` is called for its side effects and reads the grouping
# columns, which ``groupby(...).apply`` stops passing in newer pandas versions.
for plot_metric in ("ndcg", "arp"):
    for plot_key, plot_rows in df_pointwise.groupby(["distillation", "dataset"]):
        df_plots(plot_rows, plot_metric)

### LambdaLoss

In [None]:
# Attach the per-dataset full-data reference statistics for both metrics.
# Groups are processed explicitly instead of through ``groupby(...).apply``:
# with pandas >= 2.0 a row-preserving apply adds "dataset" as an index level,
# which makes the next ``groupby(["dataset"])`` ambiguous.
df_lambda = pd.concat(
    [full_ndcg_arp(group.copy(), "ndcg") for _, group in df_lambda.groupby(["dataset"])]
)
df_lambda = pd.concat(
    [full_ndcg_arp(group.copy(), "arp") for _, group in df_lambda.groupby(["dataset"])]
)

In [None]:
# One figure per (distillation, dataset) pair and metric. A plain loop is used
# because `df_plots` is called for its side effects and reads the grouping
# columns, which ``groupby(...).apply`` stops passing in newer pandas versions.
for plot_metric in ("ndcg", "arp"):
    for plot_key, plot_rows in df_lambda.groupby(["distillation", "dataset"]):
        df_plots(plot_rows, plot_metric)

### MSE and LambdaLoss

In [None]:
def df_pl_plots(df, metric):
    """Draw and save the pointwise-vs-listwise figure for one (distillation, dataset) group.

    The runs of every ``(z_rate, rand_init, loss)`` combination are summarised
    with ``max_min_res`` and one row per combination is kept. Against
    ``r_rate`` the figure shows, each with its min/max band:

    * ``rand_init == True``: final mean for the pointwise ("-R-P") and
      listwise ("-R-L") loss,
    * ``rand_init == False``: initial mean ("-S-P", "-S-L") and the
      ``<metric>_full_*`` reference ("Full-P", "Full-L") for each loss.

    The figure is written to ``./wandb/distillation_figs/`` and shown.

    Args:
        df: Rows of a single dataset and distillation type covering both
            losses. Must already contain the ``<metric>_full_mean/min/max``
            columns.
        metric: ``"ndcg"`` or ``"arp"``.

    Returns:
        The summarised frame (one row per ``(z_rate, rand_init, loss)``).

    Raises:
        NotImplementedError: If ``metric`` is neither ``"ndcg"`` nor ``"arp"``.
        ValueError: If one of the four (rand_init, loss) subsets has no rows.
    """
    # NOTE: these rcParams changes are global and persist after the call.
    mpl.rcParams['axes.titleweight'] = 'bold'
    mpl.rcParams['axes.labelweight'] = 'bold'
    fontsize = 20
    ms = 18
    lw = 2
    alg = df["alg"].head(1).item()
    dtype = df["distillation"].head(1).item()
    Gold = '#FFD700'

    fig = plt.figure(figsize=(10, 12))
    ax = fig.add_subplot(111)

    # Summarise each group explicitly instead of ``groupby(...).apply``: with
    # pandas >= 2.0 a row-preserving apply adds the group keys as index levels,
    # which makes the ``sort_values`` below fail on ambiguous labels.
    df = pd.concat([
        max_min_res(group.copy(), metric)
        for _, group in df.groupby(["z_rate", "rand_init", "loss"])
    ])
    df = df.drop_duplicates(["dataset", "distillation", "z_rate", "rand_init", "loss"])
    df = df.sort_values(["dataset", "distillation", "rand_init", "z_rate"])

    def select(rand_init, loss):
        """Return the summarised rows of one (rand_init, loss) combination."""
        subset = df[(df["rand_init"] == rand_init) & (df["loss"] == loss)]
        if subset.empty:
            # Fail loudly rather than silently drawing an incomplete figure.
            raise ValueError("no rows with rand_init={} and loss={}".format(rand_init, loss))
        return subset

    df_temp = df[df["rand_init"] == True]
    did = df_temp["dataset"].head(1).item()
    distillation_type = df_temp["distillation"].head(1).item()

    # --- rand_init == True, pointwise loss ---
    df_temp = select(True, "pointwise")
    x_values = df_temp["r_rate"]
    ax.plot(x_values, df_temp[metric+"_final_mean"], '*--', label="{}-{}-R-P".format(alg, dtype), markersize=ms, linewidth=lw, color=Gold)
    ax.fill_between(x_values, df_temp[metric+"_final_min"], df_temp[metric+"_final_max"], color=Gold, alpha=0.2)

    # --- rand_init == True, listwise loss ---
    df_temp = select(True, "listwise")
    x_values = df_temp["r_rate"]
    ax.plot(x_values, df_temp[metric+"_final_mean"], 'p--', label="{}-{}-R-L".format(alg, dtype), markersize=ms, linewidth=lw, color='#004488')
    ax.fill_between(x_values, df_temp[metric+"_final_min"], df_temp[metric+"_final_max"], color='#004488', alpha=0.2) # Blue

    # --- rand_init == False, pointwise loss: initial performance and full-data reference ---
    df_temp = select(False, "pointwise")
    x_values = df_temp["r_rate"]
    ax.plot(x_values, df_temp[metric+"_init_mean"], 'g^--', label="{}-S-P".format(dtype), markersize=ms, linewidth=lw)
    ax.fill_between(x_values, df_temp[metric+"_init_min"], df_temp[metric+"_init_max"], color='g', alpha=0.2)

    ax.plot(x_values, df_temp[metric+"_full_mean"], '--', label="Full-P", linewidth=lw*2, color='Purple')
    ax.fill_between(x_values, df_temp[metric+"_full_min"], df_temp[metric+"_full_max"], color='Purple', alpha=0.2)

    # --- rand_init == False, listwise loss: initial performance and full-data reference ---
    df_temp = select(False, "listwise")
    x_values = df_temp["r_rate"]
    ax.plot(x_values, df_temp[metric+"_init_mean"], 'o--', label="{}-S-L".format(dtype), markersize=ms, linewidth=lw, color='#999933') #olive
    ax.fill_between(x_values, df_temp[metric+"_init_min"], df_temp[metric+"_init_max"], color='#999933', alpha=0.2)

    ax.plot(x_values, df_temp[metric+"_full_mean"], '--', label="Full-L", linewidth=lw*2, color='#663333') # dark red
    ax.fill_between(x_values, df_temp[metric+"_full_min"], df_temp[metric+"_full_max"], color='#663333', alpha=0.2)

    if metric == "ndcg":
        y_l = "NDCG@10"
        y_ticks = np.arange(0, 0.85, 0.1)
    elif metric == "arp":
        y_l = "ARP"
        y_ticks = np.arange(0, 80, 10)
    else:
        raise NotImplementedError

    plt.title(did, fontsize=fontsize)
    plt.xlabel("Relative dataset size to the full dataset", fontsize=fontsize)
    plt.ylabel("Average {}".format(y_l), fontsize=fontsize)

    # Tick at every relative size of the last plotted subset, labelled as a percentage.
    x_ticks = x_values
    custom_x_ticks = x_ticks*100
    custom_x_ticks = custom_x_ticks.round(2).apply(lambda x: str(x)+'%')
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(custom_x_ticks, rotation=22.5)
    ax.set_yticks(y_ticks)
    plt.xticks(fontsize=fontsize)
    plt.yticks(fontsize=fontsize)

    plt.legend(fontsize=17, loc='lower left')

    plt.grid()

    # Create the output directory on first use so savefig cannot fail on a fresh checkout.
    os.makedirs("./wandb/distillation_figs", exist_ok=True)
    plt.savefig("./wandb/distillation_figs/{}-{}-{}-{}-both-loss.pdf".format(metric, did,alg,distillation_type), transparent=True)
    plt.show()
    # Release the figure; this function is called once per group and metric.
    plt.close(fig)

    return df

In [None]:
# Combine both losses, restricted to query-level distillation. `ignore_index`
# avoids duplicate row labels coming from the two source frames.
df_pl = pd.concat([df_pointwise, df_lambda], ignore_index=True)
df_pl = df_pl[df_pl["distillation"] == "Q"]
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(df_pl.groupby(["distillation", "dataset", "rand_init"]).size())

# One figure per (distillation, dataset) pair and metric. A plain loop is used
# because `df_pl_plots` is called for its side effects and reads the grouping
# columns, which ``groupby(...).apply`` stops passing in newer pandas versions.
for plot_metric in ("ndcg", "arp"):
    for plot_key, plot_rows in df_pl.groupby(["distillation", "dataset"]):
        df_pl_plots(plot_rows, plot_metric)

# Overall Comparison

In [None]:
# Preview the first five pointwise rows.
df_pointwise.head(n=5)

In [None]:
# Number of pointwise runs per configuration, then a working copy for the overall table.
print(
    df_pointwise
    .groupby(["distillation", "dataset", "rand_init"])
    .size()
)
dfo = df_pointwise.copy(deep=True)

In [None]:
# Start from a fresh copy of the pointwise results. Below, runs are selected by
# their validation NDCG and the matching `ndcg_final` / `ndcg_init` value is reported.
dfo = df_pointwise.copy(deep=True)

def aggregate(dfo):
    """Attach the scores and paths of the best runs in ``dfo`` to every row.

    Two runs are selected by their validation score:

    * among ``rand_init == True`` rows, the one with the highest
      ``dist_valid_ndcg``; its ``ndcg_final`` becomes ``dp`` and its
      ``exp_path`` becomes ``path_distill``,
    * among ``rand_init == False`` rows, the one with the highest
      ``init_valid_ndcg``; its ``ndcg_init`` becomes ``sp`` and its
      ``exp_path`` becomes ``path_sample``.

    Args:
        dfo: Rows to select from; must contain both ``rand_init`` values.

    Returns:
        A copy of ``dfo`` with the constant columns ``dp``, ``path_distill``,
        ``sp`` and ``path_sample`` added.

    Raises:
        ValueError: If either ``rand_init`` subset is empty (from ``idxmax``).
    """
    # Work on a copy: mutating the object handed over by ``groupby(...).apply``
    # is not supported by pandas, and callers keep their frame untouched.
    dfo = dfo.copy()

    best_distilled = dfo.query("rand_init == True")["dist_valid_ndcg"].idxmax()
    dfo["dp"] = dfo.loc[best_distilled, "ndcg_final"]
    dfo["path_distill"] = dfo.loc[best_distilled, "exp_path"]

    best_sampled = dfo.query("rand_init == False")["init_valid_ndcg"].idxmax()
    dfo["sp"] = dfo.loc[best_sampled, "ndcg_init"]
    dfo["path_sample"] = dfo.loc[best_sampled, "exp_path"]

    return dfo


def full_max_agg(dfo):
    """Attach the highest ``ndcg_full`` in ``dfo`` and its run path to every row.

    Args:
        dfo: Non-empty frame with ``ndcg_full`` and ``exp_path`` columns.

    Returns:
        A copy of ``dfo`` with the constant columns ``full`` (the maximum
        ``ndcg_full``) and ``path_full`` (the ``exp_path`` of that row) added.

    Raises:
        ValueError: If ``dfo`` is empty (from ``idxmax``).
    """
    # Work on a copy: mutating the object handed over by ``groupby(...).apply``
    # is not supported by pandas, and callers keep their frame untouched.
    dfo = dfo.copy()

    best_full = dfo["ndcg_full"].idxmax()
    dfo["full"] = dfo.loc[best_full, "ndcg_full"]
    dfo["path_full"] = dfo.loc[best_full, "exp_path"]
    return dfo


# Groups are processed explicitly instead of through ``groupby(...).apply``:
# with pandas >= 2.0 a row-preserving apply adds "dataset" as an index level,
# which makes the second grouping ambiguous.
dfo = pd.concat(
    [full_max_agg(group.copy()) for _, group in dfo.groupby(by=["dataset"])]
)
dfo = pd.concat(
    [aggregate(group.copy()) for _, group in dfo.groupby(by=["distillation", "dataset"])]
)
# The aggregated columns are constant within a group, so one row per group is enough.
dfo = dfo.drop_duplicates(["distillation", "dataset", "alg"])
dfo = dfo.sort_values(by=["dataset", "rand_init", "distillation", "alg"])
dfo = dfo[["distillation", "dataset", "alg", "dp", "sp", "full", "ndcg_full_mean", "path_distill", "path_sample", "path_full"]].round(3)
print(dfo.groupby(["distillation", "dataset"]).size())
dfo

# Significance testing

In [None]:
from scipy import stats
import statsmodels.stats.multicomp as mc

In [None]:
def sig_test(df):
    """Print pairwise t-tests between the selected models of one dataset.

    For every row of ``df`` the per-sample NDCG@10 columns are read from the
    ``metrics.csv`` files under ``path_distill``, ``path_sample`` and
    ``path_full``. All series are then compared pairwise with
    ``scipy.stats.ttest_ind`` through statsmodels' ``MultiComparison``
    (``method="bonf"``, ``alpha=0.0001``) and the result table is printed.

    Args:
        df: Rows of a single dataset with ``dataset``, ``distillation``,
            ``path_distill``, ``path_sample`` and ``path_full`` columns.

    Returns:
        None.
    """
    dataset_name = df["dataset"].head(1).item()

    samples = {}
    for _, row in df.iterrows():
        tag = row["distillation"]

        # NDCG of the distilled model
        distilled = pd.read_csv(os.path.join(row["path_distill"], "metrics.csv"))
        samples["dist_" + tag] = distilled["NDCG@10-FinalEval-DistillInfoTest"]

        # NDCG of the sampling baseline
        sampled = pd.read_csv(os.path.join(row["path_sample"], "metrics.csv"))
        samples["samp_" + tag] = sampled["NDCG@10-InitEval-DistillInfoTest"]

        # NOTE(review): this column is spelled "FullInfo-Test" while summary.csv
        # uses "FullInfoTest" -- confirm against the metrics.csv header.
        full = pd.read_csv(os.path.join(row["path_full"], "metrics.csv"))
        samples["full"] = full["NDCG@10-InitEval-FullInfo-Test"]

    wide = pd.DataFrame.from_dict(samples)
    long = wide.melt(value_vars=wide.columns, var_name="model", value_name="metric")

    comparison = mc.MultiComparison(long["metric"], long["model"])
    tbl, a1, a2 = comparison.allpairtest(stats.ttest_ind, method= "bonf", alpha=0.0001)
    print(dataset_name, "\n", tbl, "\n---------------------------------------\n")


# Run the significance tests once per dataset. A plain loop is used because
# `sig_test` only prints and reads the "dataset" grouping column, which
# ``groupby(...).apply`` stops passing in newer pandas versions.
for sig_key, sig_rows in dfo.groupby(by=["dataset"]):
    sig_test(sig_rows)

# Dataset Statistics

In [None]:
from pytorchltr.datasets import MSLR30K, IstellaS
from datasets.yahoo import Yahoo
import numpy as np

In [None]:
import os

# Root directory containing the MSLR30K/, ISTELLAS/ and YAHOO/ folders.
location = ""

# Key used in this notebook -> split name passed to the dataset loaders.
_split_names = {"train": "train", "valid": "vali", "test": "test"}

mslr = {
    key: MSLR30K(
        location=location+"MSLR30K/", split=split_name, fold=1, normalize=True,
        filter_queries=True, download=False, validate_checksums=False,
    )
    for key, split_name in _split_names.items()
}

istella = {
    key: IstellaS(
        location=location+"ISTELLAS/", split=split_name, normalize=True,
        filter_queries=True, download=False, validate_checksums=False,
    )
    for key, split_name in _split_names.items()
}

yahoo = {
    key: Yahoo(
        location=location+"YAHOO/", split=split_name, normalize=False,
        filter_queries=True,
    )
    for key, split_name in _split_names.items()
}

datasets = {"MSLR30K": mslr, "IstellaS": istella, "Yahoo": yahoo}

splits = ["train", "valid", "test"]


In [None]:
# Per dataset: total number of documents in each split, plus the average
# number of documents and of non-zero-relevance documents per training item.
did_dict = defaultdict(list)

for dataset_name in datasets.keys():
    split_stats = {}
    for split in splits:
        n_docs = []
        n_non_zero_docs = []
        for item in datasets[dataset_name][split]:
            n_docs.append(item.n.item())
            # Same count as torch.count_nonzero(item.relevance), written so that
            # the cell does not depend on a `torch` import.
            n_non_zero_docs.append(int((item.relevance != 0).sum()))
        split_stats[split] = np.sum(n_docs)
        if split == "train":
            split_stats["avg_docs"] = np.mean(n_docs)
            split_stats["avg_docs_non_zero"] = np.mean(n_non_zero_docs)
    did_dict["dataset"].append(dataset_name)
    did_dict["train"].append(split_stats["train"])
    did_dict["valid"].append(split_stats["valid"])
    did_dict["test"].append(split_stats["test"])
    did_dict["avg_docs"].append(split_stats["avg_docs"])
    did_dict["avg_non_zero"].append(split_stats["avg_docs_non_zero"])


df = pd.DataFrame.from_dict(did_dict)
df
        
        