In [1]:
import json
import math
from collections import defaultdict
from itertools import product

import pandas as pd
import roach
import torch

from relbench.base import TaskType
from relbench.datasets import get_dataset_names
from relbench.tasks import get_task, get_task_names

In [2]:
# Collect every result store recorded under this roach project. Judging by the
# `__roach__` metadata displayed further down, each store presumably corresponds
# to one logged script run -- TODO confirm against the roach documentation.
all_stores = roach.scan("relbench/2024-07-05")

In [3]:
# Sanity check: how many stores the scan returned.
len(all_stores)

742

In [4]:
# Inspect the roach metadata of the last store; the cells below filter stores on
# its "caller_file" field.
all_stores[-1]["__roach__"]

{'project': 'relbench/2024-07-05',
 'timestamp': 1720608396648790026,
 'caller_file': 'idgnn_link.py',
 'done': True}

In [5]:
def wrap(name):
    r"""Return ``name`` enclosed in a LaTeX ``\texttt{...}`` command.

    ``name`` must be a string; it is inserted verbatim (no LaTeX escaping).
    """
    return "".join((r"\texttt{", name, "}"))

In [6]:
# Display labels for the split names used as the innermost row index of the tables.
txt = dict(val="Val", test="Test")

# regression

In [7]:
# Metric key read from each store, and the direction used when deciding which
# table entries to print in bold (False: smaller values win).
metric, higher_is_better = "mae", False

In [8]:
def _stores_for(script, dataset, task, seed=None):
    """Select the entries of ``all_stores`` logged by ``script`` for one task.

    A store is kept when its ``__roach__`` caller file is ``{script}.py`` and its
    ``args`` match ``dataset`` and ``task``; when ``seed`` is given,
    ``args["seed"]`` has to match as well. The order of ``all_stores`` is kept.
    """
    return [
        store
        for store in all_stores
        if store["__roach__"]["caller_file"] == f"{script}.py"
        and store["args"]["dataset"] == dataset
        and store["args"]["task"] == task
        and (seed is None or store["args"]["seed"] == seed)
    ]


def _latest_value(stores, *keys):
    """Follow ``keys`` into the last element of ``stores``.

    Returns NaN when ``stores`` is empty (no matching run was found), so that a
    missing run shows up as NaN in the tables instead of aborting the cell.
    """
    try:
        node = stores[-1]  # last match in scan order -- presumably the newest run
        for key in keys:
            node = node[key]
    except IndexError:
        return float("nan")
    return node


# Names of the baselines stored inside a single baseline_node.py store.
baseline_names = (
    "global_zero",
    "global_mean",
    "global_median",
    "entity_mean",
    "entity_median",
)

# One record per (dataset, task, script-or-baseline, split) for regression tasks.
table_data = []
for dataset in get_dataset_names():
    for task in get_task_names(dataset):
        task_obj = get_task(dataset, task)
        if task_obj.task_type.value != TaskType.REGRESSION.value:
            continue
        for script in ("gnn_node", "lightgbm_node", "hybrid_node", "baseline_node"):
            for split in ("val", "test"):
                if script != "baseline_node":
                    # Seeded scripts: aggregate the metric over seeds 0..4. A seed
                    # without a run contributes NaN, which propagates to mean/std.
                    per_seed = [
                        _latest_value(
                            _stores_for(script, dataset, task, seed), split, metric
                        )
                        for seed in range(5)
                    ]
                    scores = torch.tensor(per_seed)
                    table_data.append(
                        {
                            "dataset": dataset,
                            "task": task,
                            "script": script,
                            "split": split,
                            "mean": scores.mean().item(),
                            "std": scores.std().item(),
                        }
                    )
                    continue

                # Baselines: a single store holds every baseline, so each one
                # becomes its own record with a std of 0.0.
                baseline_stores = _stores_for(script, dataset, task)
                for baseline in baseline_names:
                    table_data.append(
                        {
                            "dataset": dataset,
                            "task": task,
                            "script": baseline,
                            "split": split,
                            "mean": _latest_value(
                                baseline_stores, baseline, split, metric
                            ),
                            "std": 0.0,
                        }
                    )

## main paper table

In [21]:
# Build the main-paper LaTeX table: one column per script, one row per
# (dataset, task, split) plus an "average" row per split. The standard deviation
# is deliberately ignored here, both for printing and for deciding what is bold.

# Columns in display order. "relative" is not a model: it is the relative
# improvement of gnn_node over lightgbm_node, derived inside the loop below.
main_scripts = [
    "global_zero",
    "global_mean",
    "global_median",
    "entity_mean",
    "entity_median",
    "lightgbm_node",
    "gnn_node",
    "relative",
]

# This cell appends the derived "relative" rows to `table_data`. Drop the rows
# left behind by a previous execution first (in place, so the list object is
# preserved) so that re-running the cell does not pile up duplicates.
table_data[:] = [rec for rec in table_data if rec["script"] != "relative"]

tex_tab = defaultdict(dict)
for dataset in get_dataset_names():
    for task in get_task_names(dataset):
        task_obj = get_task(dataset, task)
        if task_obj.task_type.value != TaskType.REGRESSION.value:
            continue
        for split in ["val", "test"]:
            local_data = [
                rec
                for rec in table_data
                if rec["dataset"] == dataset
                and rec["task"] == task
                and rec["split"] == split
            ]

            # First record per script. An explicit lookup replaces the previous
            # `for ... break` search, which silently fell through to the last
            # record of `local_data` when a script had no entry.
            by_script = {}
            for rec in local_data:
                by_script.setdefault(rec["script"], rec)
            missing = [
                name
                for name in main_scripts
                if name != "relative" and name not in by_script
            ]
            if missing:
                raise KeyError(
                    f"no {split} record for {missing} on {dataset}/{task}; "
                    "re-run the cell that builds table_data"
                )

            for script in main_scripts:
                if script == "relative":
                    rdl_mean = by_script["gnn_node"]["mean"]
                    dt_mean = by_script["lightgbm_node"]["mean"]

                    # Signed so that a positive value always means gnn_node is
                    # better than lightgbm_node.
                    if higher_is_better:
                        val = (rdl_mean - dt_mean) / dt_mean
                    else:
                        val = (dt_mean - rdl_mean) / dt_mean

                    table_data.append(
                        {
                            "dataset": dataset,
                            "task": task,
                            "script": "relative",
                            "split": split,
                            "mean": val,
                            "std": float("nan"),
                        }
                    )

                    # The space flag reserves the sign column for non-negative values.
                    tex_val = r"$" + f"{val * 100: .2f}" + r"$ \%"
                else:
                    mean = by_script[script]["mean"]

                    # An entry is bold unless some competitor is strictly better.
                    # hybrid_node and the derived "relative" rows do not compete.
                    rivals = [
                        rec["mean"]
                        for rec in local_data
                        if rec["script"] not in ("hybrid_node", "relative")
                    ]
                    if higher_is_better:
                        is_best = not any(mean < other for other in rivals)
                    else:
                        is_best = not any(mean > other for other in rivals)

                    opt_bm_open = r"\bm{" if is_best else ""
                    opt_bm_close = r"}" if is_best else ""
                    tex_val = r"$" + opt_bm_open + f"{mean:.3f}" + opt_bm_close + r"$"

                tex_tab[script][(wrap(dataset), wrap(task), txt[split])] = tex_val

# "average" rows: plain mean of each column over all regression tasks, per split.
for script in main_scripts:
    for split in ["val", "test"]:
        vals = [
            rec["mean"]
            for rec in table_data
            if rec["script"] == script and rec["split"] == split
        ]
        mean = sum(vals) / len(vals)

        # In the average rows only the gnn_node column is bold, unconditionally.
        is_best = script == "gnn_node"
        opt_bm_open = r"\bm{" if is_best else ""
        opt_bm_close = r"}" if is_best else ""
        if script == "relative":
            tex_val = (
                r"$" + opt_bm_open + f"{mean * 100:.2f}" + opt_bm_close + r"$ \%"
            )
        else:
            tex_val = r"$" + opt_bm_open + f"{mean:.3f}" + opt_bm_close + r"$"

        tex_tab[script]["average", "", txt[split]] = tex_val


tex_df = pd.DataFrame(tex_tab)
tex_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,global_zero,global_mean,global_median,entity_mean,entity_median,lightgbm_node,gnn_node,relative
\texttt{rel-amazon},\texttt{user-ltv},Val,$14.141$,$20.740$,$14.141$,$17.685$,$15.978$,$14.141$,$\bm{12.132}$,$ 14.21$ \%
\texttt{rel-amazon},\texttt{user-ltv},Test,$16.783$,$22.121$,$16.783$,$19.055$,$17.423$,$16.783$,$\bm{14.313}$,$ 14.72$ \%
\texttt{rel-amazon},\texttt{item-ltv},Val,$72.096$,$78.110$,$59.471$,$80.466$,$68.922$,$55.741$,$\bm{45.140}$,$ 19.02$ \%
\texttt{rel-amazon},\texttt{item-ltv},Test,$77.126$,$81.852$,$64.234$,$78.423$,$66.436$,$60.569$,$\bm{50.053}$,$ 17.36$ \%
\texttt{rel-avito},\texttt{ad-ctr},Val,$0.048$,$0.048$,$0.040$,$0.044$,$0.044$,$0.037$,$\bm{0.037}$,$ 2.21$ \%
\texttt{rel-avito},\texttt{ad-ctr},Test,$0.052$,$0.051$,$0.043$,$0.046$,$0.046$,$\bm{0.041}$,$0.041$,$-0.18$ \%
\texttt{rel-event},\texttt{user-attendance},Val,$0.262$,$0.457$,$0.262$,$0.296$,$0.268$,$0.262$,$\bm{0.258}$,$ 1.43$ \%
\texttt{rel-event},\texttt{user-attendance},Test,$0.264$,$0.470$,$0.264$,$0.304$,$0.269$,$0.264$,$\bm{0.261}$,$ 0.84$ \%
\texttt{rel-f1},\texttt{driver-position},Val,$11.083$,$4.334$,$4.136$,$7.181$,$7.114$,$3.450$,$\bm{3.193}$,$ 7.44$ \%
\texttt{rel-f1},\texttt{driver-position},Test,$11.926$,$4.513$,$4.399$,$8.501$,$8.519$,$4.170$,$\bm{4.022}$,$ 3.56$ \%


In [25]:
# Literal text substitutions applied, in this order, to the LaTeX emitted by
# pandas for the main table (11 columns including the three index columns).
latex_fixups = [
    # [t] -> [c] positioning option on every \multirow
    (r"\multirow[t]", r"\multirow[c]"),
    # \cline -> \cmidrule
    (r"\cline", r"\cmidrule"),
    # collapse the doubled rule that results between datasets
    (r"\cmidrule{1-11} \cmidrule{2-11}", r"\cmidrule{1-11}"),
    # merge the two label cells of the "average" rows into one centred cell
    (
        r"\multirow[c]{2}{*}{average} & \multirow[c]{2}{*}{}",
        r"\multicolumn{2}{c}{\multirow[c]{2}{*}{Average}}",
    ),
]

tex = tex_df.to_latex()
for needle, substitute in latex_fixups:
    tex = tex.replace(needle, substitute)
print(tex)

\begin{tabular}{lllllllllll}
\toprule
 &  &  & global_zero & global_mean & global_median & entity_mean & entity_median & lightgbm_node & gnn_node & relative \\
\midrule
\multirow[c]{4}{*}{\texttt{rel-amazon}} & \multirow[c]{2}{*}{\texttt{user-ltv}} & Val & $14.141$ & $20.740$ & $14.141$ & $17.685$ & $15.978$ & $14.141$ & $\bm{12.132}$ & $ 14.21$ \% \\
 &  & Test & $16.783$ & $22.121$ & $16.783$ & $19.055$ & $17.423$ & $16.783$ & $\bm{14.313}$ & $ 14.72$ \% \\
\cmidrule{2-11}
 & \multirow[c]{2}{*}{\texttt{item-ltv}} & Val & $72.096$ & $78.110$ & $59.471$ & $80.466$ & $68.922$ & $55.741$ & $\bm{45.140}$ & $ 19.02$ \% \\
 &  & Test & $77.126$ & $81.852$ & $64.234$ & $78.423$ & $66.436$ & $60.569$ & $\bm{50.053}$ & $ 17.36$ \% \\
\cmidrule{1-11}
\multirow[c]{2}{*}{\texttt{rel-avito}} & \multirow[c]{2}{*}{\texttt{ad-ctr}} & Val & $0.048$ & $0.048$ & $0.040$ & $0.044$ & $0.044$ & $0.037$ & $\bm{0.037}$ & $ 2.21$ \% \\
 &  & Test & $0.052$ & $0.051$ & $0.043$ & $0.046$ & $0.046$ & $\bm{0.041}

## appendix table

In [13]:
# Build the appendix LaTeX table: same rows as the main table, but the seeded
# scripts also print their standard deviation, and the bold marker takes the
# standard deviation into account (overlapping intervals both count as best).

# Columns in display order.
appendix_scripts = [
    "global_zero",
    "global_mean",
    "global_median",
    "entity_mean",
    "entity_median",
    "lightgbm_node",
    "gnn_node",
]
# Scripts whose records aggregate several seeds and therefore print "mean +- std".
seeded_scripts = ("lightgbm_node", "gnn_node")
# Records that never compete for the bold marker. "relative" rows are derived
# percentages that the main-table cell appends to `table_data`; they used to be
# ignored only because their NaN std made every comparison False.
non_competitors = ("hybrid_node", "relative")

tex_tab = defaultdict(dict)
for dataset in get_dataset_names():
    for task in get_task_names(dataset):
        task_obj = get_task(dataset, task)
        if task_obj.task_type.value != TaskType.REGRESSION.value:
            continue
        for split in ["val", "test"]:
            local_data = [
                rec
                for rec in table_data
                if rec["dataset"] == dataset
                and rec["task"] == task
                and rec["split"] == split
            ]

            # First record per script. An explicit lookup replaces the previous
            # `for ... break` search, which silently fell through to the last
            # record of `local_data` when a script had no entry.
            by_script = {}
            for rec in local_data:
                by_script.setdefault(rec["script"], rec)
            missing = [name for name in appendix_scripts if name not in by_script]
            if missing:
                raise KeyError(
                    f"no {split} record for {missing} on {dataset}/{task}; "
                    "re-run the cell that builds table_data"
                )

            for script in appendix_scripts:
                mean = by_script[script]["mean"]
                std = by_script[script]["std"]

                # Bold unless some competitor is better by more than the two
                # standard deviations combined.
                is_best = True
                for comp_rec in local_data:
                    if comp_rec["script"] in non_competitors:
                        continue
                    comp_mean = comp_rec["mean"]
                    comp_std = comp_rec["std"]
                    if higher_is_better:
                        if mean + std < comp_mean - comp_std:
                            is_best = False
                    else:
                        if mean - std > comp_mean + comp_std:
                            is_best = False

                opt_bm_open = r"\bm{" if is_best else ""
                opt_bm_close = r"}" if is_best else ""
                if script in seeded_scripts:
                    tex_val = (
                        r"$"
                        + opt_bm_open
                        + f"{mean:.3f}"
                        + opt_bm_close
                        + r"_{\pm "
                        + f"{std:.3f}"
                        + r"}$"
                    )
                else:
                    tex_val = r"$" + opt_bm_open + f"{mean:.3f}" + opt_bm_close + r"$"

                tex_tab[script][(wrap(dataset), wrap(task), txt[split])] = tex_val

tex_df = pd.DataFrame(tex_tab)
tex_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,global_zero,global_mean,global_median,entity_mean,entity_median,lightgbm_node,gnn_node
\texttt{rel-amazon},\texttt{user-ltv},Val,$14.141$,$20.740$,$14.141$,$17.685$,$15.978$,$14.141_{\pm 0.000}$,$\bm{12.132}_{\pm 0.007}$
\texttt{rel-amazon},\texttt{user-ltv},Test,$16.783$,$22.121$,$16.783$,$19.055$,$17.423$,$16.783_{\pm 0.000}$,$\bm{14.313}_{\pm 0.013}$
\texttt{rel-amazon},\texttt{item-ltv},Val,$72.096$,$78.110$,$59.471$,$80.466$,$68.922$,$55.741_{\pm 0.049}$,$\bm{45.140}_{\pm 0.068}$
\texttt{rel-amazon},\texttt{item-ltv},Test,$77.126$,$81.852$,$64.234$,$78.423$,$66.436$,$60.569_{\pm 0.047}$,$\bm{50.053}_{\pm 0.163}$
\texttt{rel-avito},\texttt{ad-ctr},Val,$0.048$,$0.048$,$0.040$,$0.044$,$0.044$,$0.037_{\pm 0.000}$,$\bm{0.037}_{\pm 0.000}$
\texttt{rel-avito},\texttt{ad-ctr},Test,$0.052$,$0.051$,$0.043$,$0.046$,$0.046$,$\bm{0.041}_{\pm 0.000}$,$\bm{0.041}_{\pm 0.001}$
\texttt{rel-event},\texttt{user-attendance},Val,$0.262$,$0.457$,$0.262$,$0.296$,$0.268$,$0.262_{\pm 0.000}$,$\bm{0.258}_{\pm 0.002}$
\texttt{rel-event},\texttt{user-attendance},Test,$0.264$,$0.470$,$0.264$,$0.304$,$0.269$,$0.264_{\pm 0.000}$,$\bm{0.261}_{\pm 0.002}$
\texttt{rel-f1},\texttt{driver-position},Val,$11.083$,$4.334$,$4.136$,$7.181$,$7.114$,$3.450_{\pm 0.030}$,$\bm{3.193}_{\pm 0.024}$
\texttt{rel-f1},\texttt{driver-position},Test,$11.926$,$4.513$,$4.399$,$8.501$,$8.519$,$\bm{4.170}_{\pm 0.137}$,$\bm{4.022}_{\pm 0.119}$


In [15]:
# Literal text substitutions applied, in this order, to the LaTeX emitted by
# pandas for the appendix table (10 columns including the three index columns).
latex_fixups = [
    # [t] -> [c] positioning option on every \multirow
    (r"\multirow[t]", r"\multirow[c]"),
    # \cline -> \cmidrule
    (r"\cline", r"\cmidrule"),
    # collapse the doubled rule that results between datasets
    (r"\cmidrule{1-10} \cmidrule{2-10}", r"\cmidrule{1-10}"),
    # merge the two label cells of any "average" rows into one centred cell
    (
        r"\multirow[c]{2}{*}{average} & \multirow[c]{2}{*}{}",
        r"\multicolumn{2}{c}{\multirow[c]{2}{*}{Average}}",
    ),
]

tex = tex_df.to_latex()
for needle, substitute in latex_fixups:
    tex = tex.replace(needle, substitute)
print(tex)

\begin{tabular}{llllllllll}
\toprule
 &  &  & global_zero & global_mean & global_median & entity_mean & entity_median & lightgbm_node & gnn_node \\
\midrule
\multirow[c]{4}{*}{\texttt{rel-amazon}} & \multirow[c]{2}{*}{\texttt{user-ltv}} & Val & $14.141$ & $20.740$ & $14.141$ & $17.685$ & $15.978$ & $14.141_{\pm 0.000}$ & $\bm{12.132}_{\pm 0.007}$ \\
 &  & Test & $16.783$ & $22.121$ & $16.783$ & $19.055$ & $17.423$ & $16.783_{\pm 0.000}$ & $\bm{14.313}_{\pm 0.013}$ \\
\cmidrule{2-10}
 & \multirow[c]{2}{*}{\texttt{item-ltv}} & Val & $72.096$ & $78.110$ & $59.471$ & $80.466$ & $68.922$ & $55.741_{\pm 0.049}$ & $\bm{45.140}_{\pm 0.068}$ \\
 &  & Test & $77.126$ & $81.852$ & $64.234$ & $78.423$ & $66.436$ & $60.569_{\pm 0.047}$ & $\bm{50.053}_{\pm 0.163}$ \\
\cmidrule{1-10}
\multirow[c]{2}{*}{\texttt{rel-avito}} & \multirow[c]{2}{*}{\texttt{ad-ctr}} & Val & $0.048$ & $0.048$ & $0.040$ & $0.044$ & $0.044$ & $0.037_{\pm 0.000}$ & $\bm{0.037}_{\pm 0.000}$ \\
 &  & Test & $0.052$ & $0.051$ & $0

## leaderboard submission

In [16]:
def _json_number(value):
    """Return ``value`` unchanged when it is finite, otherwise ``None``.

    ``json.dumps`` would print NaN/Infinity as bare ``NaN``/``Infinity`` tokens,
    which are not valid JSON; ``None`` is serialized as ``null`` instead.
    """
    return value if math.isfinite(value) else None


# Leaderboard payload: script -> split -> "dataset/task" -> [mean, std].
lb_sub = defaultdict(lambda: defaultdict(dict))
for rec in table_data:
    entry_key = f"{rec['dataset']}/{rec['task']}"
    # Missing runs (NaN mean) and derived "relative" rows (NaN std) would
    # otherwise make the printed document unparseable by strict JSON readers.
    lb_sub[rec["script"]][rec["split"]][entry_key] = [
        _json_number(rec["mean"]),
        _json_number(rec["std"]),
    ]

# allow_nan=False guarantees that no non-finite value slips through unnoticed.
print(json.dumps(lb_sub, indent=2, allow_nan=False))

{
  "gnn_node": {
    "val": {
      "rel-amazon/user-ltv": [
        12.13155218257926,
        0.007139421124063155
      ],
      "rel-amazon/item-ltv": [
        45.14017719206194,
        0.06788705138267491
      ],
      "rel-avito/ad-ctr": [
        0.03652401605725081,
        0.0004016788031990846
      ],
      "rel-event/user-attendance": [
        0.25805687517005443,
        0.0024245484839017957
      ],
      "rel-f1/driver-position": [
        3.1930855864815975,
        0.023542208202951602
      ],
      "rel-hm/item-sales": [
        0.06469797821756289,
        0.0003100070533139141
      ],
      "rel-stack/post-votes": [
        0.059103957575344236,
        3.429541515110885e-05
      ],
      "rel-trial/study-adverse": [
        46.290131133687254,
        0.30417190159629065
      ],
      "rel-trial/site-success": [
        0.40093374169559537,
        0.008656987485762853
      ]
    },
    "test": {
      "rel-amazon/user-ltv": [
        14.313016270113087,