* Input: embeddings, responses and history files under the `io/` directory.
* Output: graphs of embedding performance against the number of responses.

In [None]:
"hu"

In [None]:
import json
from pathlib import Path
from ast import literal_eval
from typing import Tuple, Union, List
import numpy as np
import pandas as pd

import stats
import sys

In [None]:
# Experiment parameters shared by the cells below.
n = 50  # passed as `n=` to `_X_test` when the evaluation queries are built
noise = "human"  # passed as `noise=` to `_X_test`
# Parent of the `embeddings/`, `responses/` and `history/` directories read below.
DIR = Path("io/2021-01-22/")

In [None]:
# Where the fitted embeddings (*.msgpack) and raw responses (*.csv) are read from.
EM_DIR = DIR / "embeddings"
RESPONSES_DIR = DIR / "responses"
# Local checkout of salmon; its `examples` directory is appended to `sys.path`,
# presumably so that the later `from run import _X_test` resolves — TODO confirm.
# NOTE(review): this location is specific to one machine's home directory layout.
SALMON_DIR = Path.home() / "Developer" / "stsievert" / "salmon"
sys.path.append(str(SALMON_DIR / "examples"))

In [None]:
def _literal_eval(x):
    """Parse ``x`` as a Python literal, falling back to ``x`` itself.

    Parameters
    ----------
    x : str
        Text such as ``"10"``, ``"0.25"``, ``"True"`` or ``"RR"``.

    Returns
    -------
    object
        The value produced by ``ast.literal_eval`` when ``x`` is a valid
        literal; otherwise ``x`` unchanged.
    """
    try:
        return literal_eval(x)
    except (ValueError, SyntaxError):
        # ValueError: parses, but is not a literal (e.g. a bare name like "RR").
        # SyntaxError: does not parse at all (e.g. "", "a b" or "01").
        return x

def _get_dict(s: str) -> dict:
    """Decode a ``key=value-key=value`` string into a dictionary.

    Parameters
    ----------
    s : str
        Items separated by ``-``; each item is ``key=value``.

    Returns
    -------
    dict
        Maps each key to its value after passing it through
        ``_literal_eval``.

    Raises
    ------
    ValueError
        If an item contains no ``=``. Because items are split on ``-``,
        this includes values that themselves contain a hyphen.
    """
    # Split on the first "=" only, so a value may itself contain "=".
    pairs = (item.split("=", 1) for item in s.split("-"))
    return {key: _literal_eval(value) for key, value in pairs}

def _ident(d: dict) -> str:
    """Encode ``d`` as ``key=value`` items, sorted by key and joined by ``-``."""
    return "-".join(f"{key}={value}" for key, value in sorted(d.items()))

In [None]:
import msgpack
def _get_config(name):
    """Return the configuration encoded in the file name ``name``.

    Every occurrence of ``.msgpack``, ``.csv`` and ``responses*`` is removed
    from ``name`` (in that order) and the remainder is decoded with
    ``_get_dict``.
    """
    stem = name
    for fragment in (".msgpack", ".csv", "responses*"):
        stem = stem.replace(fragment, "")
    return _get_dict(stem)

def _get(file: Path, history=False) -> Union[Tuple[np.ndarray, dict], List[dict]]:
    """Load one msgpack file holding an embedding and its bookkeeping.

    Parameters
    ----------
    file : Path
        File to read; it is opened in binary mode and decoded with msgpack.
    history : bool, optional
        When true, return the ``"history"`` entry of the file instead of
        the embedding and metadata.

    Returns
    -------
    The ``"history"`` entry when ``history`` is true. Otherwise the pair
    ``(embedding, meta)``, where ``meta`` also carries every item of the
    ``"performance"`` entry under a ``perf__``-prefixed key.
    """
    with open(file, "rb") as fh:
        payload = msgpack.load(fh)
    # All three sections are removed up front, so a file missing any of them
    # raises KeyError even when only the history is requested.
    embedding = payload.pop("embedding")
    meta = payload.pop("meta")
    performance = payload.pop("performance")
    if history:
        return payload["history"]
    meta.update({f"perf__{key}": value for key, value in performance.items()})
    return embedding, meta

# Read every saved embedding found directly under EM_DIR; the two printed
# counts are the number of files found and the number of files loaded.
embedding_paths = EM_DIR.glob("*.msgpack")
files = list(embedding_paths)
print(len(files))
data = [_get(path) for path in files]
print(len(data))

In [None]:
# Locate the responses file whose name contains "alg=RandomSampling"; exactly
# one such file is expected.
keys = [k for k in RESPONSES_DIR.glob("*.csv") if "alg=RandomSampling" in str(k)]
assert len(keys) == 1, len(keys)
random_responses = pd.read_csv(keys[0])
from sklearn.model_selection import train_test_split
# Hold out 20% of those responses with a fixed seed.
_, df_test = train_test_split(random_responses, test_size=0.2, random_state=42)
print(keys)
X_test = df_test[["head", "winner", "loser"]].to_numpy()

# NOTE(review): the X_test built above is replaced right here by the value
# returned from `_X_test`, so the held-out split does not feed into `perf` —
# confirm that this is intended.
# `run` is presumably the module in salmon's `examples` directory — TODO confirm.
from run import _X_test
X_test = _X_test(n=n, num=20_000, noise=noise)

# One record per loaded embedding: its metadata merged with whatever
# `stats.collect` returns for that embedding on X_test.
perf = [{**meta, **stats.collect(embedding, X_test)} for embedding, meta in data]
len(perf)

In [None]:
import datasets

In [None]:
# Offline results: one row per record in `perf`.
df = pd.DataFrame(perf)

# History file of the online run that is plotted alongside the offline
# results. Other runs, kept for reference:
# fname = "history*R=10-alg=RR-d=2-dataset=strange_fruit-init=True-max_queries=20000-n=100-n_users=4-noise=constant-random_state=42-reaction_time=0.25-response_time=1.0.csv"
# fname = "history*R=10-alg=RR-d=2-dataset=strange_fruit-init=True-max_queries=20100-n=30-n_users=6-noise=constant-random_state=42-reaction_time=0.25-response_time=1.0.csv"
fname = "history*R=10-alg=RR-d=2-dataset=alien_eggs-init=True-max_queries=30100-n=50-n_users=6-noise=human-random_state=42-reaction_time=0.25-response_time=1.0.csv"
online = pd.read_csv(DIR / "history" / fname)
online["alg"] = online["perf__ident"] = "RR (online)"

# The first two "-"-separated fields of `ident` are used as the sampling and
# embedding labels; the online rows get the fixed label "online" for both.
df["sampling"] = df.ident.apply(lambda s: s.split("-")[0])
df["embedding"] = df.ident.apply(lambda s: s.split("-")[1])
online["sampling"] = "online"
online["embedding"] = "online"
# Both frames carry their own 0..N index. ignore_index=True renumbers the
# combined rows so every label is unique and label-based lookups
# (`.loc`, `idxmin`) on `df` or its subsets identify exactly one row.
df = pd.concat((df, online), ignore_index=True)
print("df.shape =", df.shape)

df.head()

In [None]:
# Inspection only: list the distinct values of the `alg` column.
df.alg.unique()

In [None]:
# Inspection only: list the columns available for plotting.
df.columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overview figure: one panel per metric in Y, each plotted against the number
# of responses in thousands, with hue set by `sampling` and line style set by
# `embedding`.
df["n_responses/1000"] = df.n_responses / 1000
Y = ["accuracy", "nn_acc", "embedding_rel_error", "nn_diff_mean"]
# Y = ["accuracy", "perf__loss_test", "embedding_rel_error", "nn_diff_mean"]

w = 4  # the 2x2 figure is created with figsize (2w, 2w)
fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(2 * w, 2.0 * w))
for k, (y, ax) in enumerate(zip(Y, axs.flatten())):
    ax = sns.lineplot(
        x="n_responses/1000",
        y=y,
        hue="sampling",
        style="embedding",
#         marker="o",
#         data=df,
        data=df,#[df.embedding.isin(["online", "CKL"])],
        ax=ax,
    )
    _ = ax.set_xlabel("Num. responses (thousands)")
    ax.grid(alpha=0.6)
    # Only the fourth panel (k == 3) keeps a legend, placed to the right of
    # its axes; the legends of the other panels are removed.
    if k != 3:
        ax.legend_.remove()
    else:
        ax.legend(loc=(1.05, 0))
    _ = ax.set_title(y)
    
#     ax.set_xscale("log")
    # "accu" matches "accuracy" but not "nn_acc", which the elif handles.
    if "accu" in y:
        _ = ax.set_ylim(0.50, None)
    elif "nn_acc" in y:
        _ = ax.set_ylim(0.0, 1.05)
#     ax.set_xlim(0.2, 20)


In [None]:
import matplotlib.ticker as ticker
# Rows used by this and the following plots: the online run plus the rows
# whose `embedding` label is "CKL". `error` is the complement of accuracy.
show = df[df.embedding.isin(["online", "CKL"])].copy()
show["error"] = 1 - show["accuracy"]

# Accuracy against the number of responses (thousands) on a base-2 log x-axis.
w = 3
fig, axs = plt.subplots(figsize=(1 * w, 1.0 * w))
y = "accuracy"
ax = sns.lineplot(
    x="n_responses/1000",
    y=y,
    hue="sampling",
    style="embedding",
#     ci=None,
#     estimator="max",
#     marker="o",
    data=show,
)
ax.set_xlabel("Num. responses (thousands)")
ax.grid(alpha=0.6)
# Show the y tick values as whole percentages.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{100 * y:0.0f}%"))
# NOTE(review): this locator is replaced by the LogLocator set further down.
ax.xaxis.set_major_locator(plt.MultipleLocator(10))
# ax.set_xlim(-2, 32)
# ax.set_xlim(25, 35)
# ax.set_xscale("log", base=2)

_ = ax.set_ylim(0.70, None)
_ = ax.set_title("Hold out performance")

# The lower x-limit is set to a positive value before switching to log scale.
ax.set_xlim(0.5, None)
ax.set_xscale("log", base=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=15))

# ax.yaxis.set_major_locator(ticker.MultipleLocator(0.02))
# ax.xaxis.set_major_locator(plt.MultipleLocator(2))
# ax.set_xlim(11, 27)

# Place the legend to the right of the axes.
ax.legend(loc=(1.05, 0))

In [None]:
# Relative embedding error against the number of responses (thousands), on
# linear axes, for the rows selected in `show`.
w = 3.0
fig, axs = plt.subplots(figsize=(1.5 * w, 1.0 * w))
y = "embedding_rel_error"
ax = sns.lineplot(
    x="n_responses/1000",
    y=y,
    hue="sampling",
    style="embedding",
#     ci=None,
#     estimator="max",
#     marker="o",
    data=show,
)
ax.grid(alpha=0.5)
# Major ticks every 0.1 on y and every 5 (thousand responses) on x.
ax.yaxis.set_major_locator(plt.MultipleLocator(0.1))
ax.xaxis.set_major_locator(plt.MultipleLocator(5))
# ax.xaxis.set_major_locator(plt.FixedLocator([0, 4, 10, 20, 30, 40, 50, 60]))
# ax.set_ylim(0.2, 1)
# ax.set_xscale("log")
# ax.set_xlim(0.5, 4)

# ax.set_xscale("log", base=2)
# ax.set_xlim(0.5, None)
ax.set_ylabel("Relative error")
ax.set_ylim(0, 1)
ax.set_title("Embedding error\n(after projection)")
ax.set_xlabel("Observed responses (thousands)")
# Place the legend to the right of the axes.
ax.legend(loc=(1.05, 0))

In [None]:
# Hold-out error (1 - accuracy) against the number of responses (thousands)
# on a base-2 log x-axis, for the rows selected in `show`. `w`, `show` and
# `ticker` come from earlier cells.
fig, axs = plt.subplots(figsize=(1 * w, 1.0 * w))
y = "error"
ax = sns.lineplot(
    x="n_responses/1000",
    y=y,
    hue="sampling",
    style="embedding",
#     ci=None,
#     estimator="max",
#     marker="o",
    data=show,
)
ax.set_xlabel("Num. responses (thousands)")
ax.grid(alpha=0.6)
# Show the y tick values as whole percentages.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{100 * y:0.0f}%"))
ax.xaxis.set_major_locator(plt.MultipleLocator(10))
ax.set_xlim(0, 30)
ax.set_ylim(0.13, 0.26)
# ax.set_xlim(25, 35)
# ax.set_xscale("log", base=2)

# _ = ax.set_ylim(0.70, None)
_ = ax.set_title("Hold out performance")

# This statement was indented although no block encloses it, which made the
# whole cell fail with an IndentationError; it belongs at top level.
ax.set_xscale("log", base=2)
# Replaces the MultipleLocator set above now that the axis is logarithmic.
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=15))
# Only the lower limit is changed here; passing None leaves the upper one as is.
ax.set_xlim(0.3, None)
# Place the legend to the right of the axes.
ax.legend(loc=(1.05, 0))

In [None]:
# Same hold-out error plot on a linear x-axis, restricted to the first
# 4 thousand responses.
fig, axs = plt.subplots(figsize=(1 * w, 1.0 * w))
y = "error"
ax = sns.lineplot(
    x="n_responses/1000",
    y=y,
    hue="sampling",
    style="embedding",
#     ci=None,
#     estimator="max",
#     marker="o",
    data=show,
)
ax.set_xlabel("Num. responses (thousands)")
ax.grid(alpha=0.6)
# Show the y tick values as whole percentages.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{100 * y:0.0f}%"))
ax.set_xlim(0, 4)
ax.xaxis.set_major_locator(plt.MultipleLocator(0.5))
ax.set_ylim(0.13, 0.26)
# ax.set_xlim(25, 35)
# ax.set_xscale("log", base=2)

# _ = ax.set_ylim(0.70, None)
_ = ax.set_title("Hold out performance")

# ax.set_xscale("log", base=2)
# ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=15))
# ax.set_xlim(0.3, None)
# Place the legend to the right of the axes.
ax.legend(loc=(1.05, 0))

In [None]:
# For each sampling scheme, print the row whose accuracy is closest to 0.82.
# `show` is a subset of a frame built by concatenating two frames that each
# had their own 0..N index, so its labels can repeat. The lookup therefore
# runs on a copy with a fresh, unique index so that the label returned by
# `idxmin` selects exactly one row.
_show = show.reset_index(drop=True)
_idx = _show.pivot_table(
    index="sampling",
    values="accuracy",
    aggfunc=lambda x: np.abs(x - 0.82).idxmin(),
)
# Map each sampling scheme to the integer label of its closest row.
idx = {k: int(v) for k, v in dict(_idx["accuracy"]).items()}
for i in idx.values():
    s = _show.loc[i, ["accuracy", "sampling", "n_responses"]]
    print(s)

In [None]:
# Inspection only: list the columns available for the plots below.
df.columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 2x2 grid of train/test loss and score against the raw number of responses.
# NOTE(review): the first assignment to Y is replaced by the second one on the
# next line, so only the `perf__*` columns are plotted.
Y = ["accuracy", "nn_acc", "embedding_rel_error", "nn_diff_mean"]
Y = ["perf__loss_train", "perf__loss_test", "perf__score_train", "perf__score_test"]

w = 4  # the 2x2 figure is created with figsize (2w, 2w)
fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(2 * w, 2.0 * w))
for y, ax in zip(Y, axs.flatten()):
    ax = sns.lineplot(
        x="n_responses",
        y=y,
        hue="sampling",
        style="embedding",
        data=df,
        ax=ax,
    )
    ax.grid(alpha=0.6)
    # The score panels share fixed y-limits of 0.5 to 1.
    if "score" in y:
        _ = ax.set_ylim(0.5, 1)
#     ax.set_xscale("log")
#     _ = ax.set_title(y)
# axs[0][1].set_ylim(0, 2)

In [None]:
# Test loss against train loss, one line per value of `alg`, with a marker at
# every plotted point.
w = 2
fig, ax = plt.subplots(figsize=(2 * w, 2.0 * w))
ax = sns.lineplot(
    x="perf__loss_train",
    y="perf__loss_test",
    hue="alg",
    data=df,
    marker="o",
    ax=ax,
)
ax.grid(alpha=0.6)
