In [13]:
from typing import Union, Dict, Any
import pandas as pd
from zipfile import ZipFile
from pathlib import Path
import msgpack
import numpy as np

In [14]:
def _get_ident(row: Union[pd.Series, Dict[str, Any]]) -> Dict[str, Any]:
    return {k: row[k] for k in ["n_train", "sampling", "noise_model", "seed", "dataset"]}

def _matches(meta, ident) -> bool:
    basic = all(meta[k] == ident[k] for k in ["n_train", "sampling", "n"])
    nm = (meta["est__noise_model"] == ident["noise_model"])
    seed = (meta["seed"] == ident["seed"]) or (meta["seed"] and ident["seed"] and np.allclose(meta["seed"], ident["seed"]))
    return basic and nm and seed

In [26]:
# Runs selected for plotting. Each active entry is matched against stored run
# metadata by `_matches` (keys n_train, sampling, n, noise_model, seed); `acc`
# is later used to sort the panels and label them. Only one group should be
# uncommented at a time: the next cell asserts every entry shares the same `n`.
idents = [
    # good viz for n=90
#     {'n_train': 990, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 7.0, 'dataset': 'simulation', 'acc': 0.76, 'n': 90},
#     {'n_train': 990, 'sampling': 'active', 'noise_model': 'CKL', 'seed': None, 'dataset': 'simulation', 'acc': 0.78, 'n': 90},
#     {'n_train': 1890, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 8.0, 'dataset': 'simulation', 'acc': 0.8, 'n': 90},
#     {'n_train': 5400, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 8.0, 'dataset': 'simulation', 'acc': 0.82, 'n': 90},
#     {'n_train': 9000, 'sampling': 'active', 'noise_model': 'CKL', 'seed': None, 'dataset': 'simulation', 'acc': 0.84, 'n': 90},

    # good for n=30 (the currently active selection)
    {'n_train': 780, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 10.0, 'dataset': 'human', 'acc': 0.801, 'n': 30},
    {'n_train': 2100, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 3.0, 'dataset': 'human', 'acc': 0.825, 'n': 30},
    {'n_train': 3600, 'sampling': 'active', 'noise_model': 'CKL', 'seed': None, 'dataset': 'human', 'acc': 0.845, 'n': 30},
    
    # good for n=180
#     {'n_train': 4680, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 9.0, 'dataset': 'simulation', 'acc': 0.8, 'actual_acc': 0.8003672345200449, 'n': 180},
#     {'n_train': 5580, 'sampling': 'active', 'noise_model': 'CKL', 'seed': None, 'dataset': 'simulation', 'acc': 0.82, 'actual_acc': 0.8197490564113027, 'n': 180},
#     {'n_train': 10800, 'sampling': 'active', 'noise_model': 'CKL', 'seed': None, 'dataset': 'simulation', 'acc': 0.83, 'actual_acc': 0.8300520248903397, 'n': 180},

    
    # good for n=90 (alternative selection, random sampling only)
    # {'n_train': 1890, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 8.0, 'dataset': 'simulation', 'acc': 0.8, 'actual_acc': 0.8002378121284186, 'n': 90},
    # {'n_train': 2790, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 6.0, 'dataset': 'simulation', 'acc': 0.815, 'actual_acc': 0.8149821640903686, 'n': 90},
    # {'n_train': 18000, 'sampling': 'random', 'noise_model': 'CKL', 'seed': 7.0, 'dataset': 'simulation', 'acc': 0.83, 'actual_acc': 0.8304399524375743, 'n': 90},
]

In [27]:
# Sub-directory of io/ that the zip archives are read from
# (presumably the date of the experiment run; confirm against the io/ layout).
ID = "2021-04-16"

In [28]:
# All selected runs must agree on `n`; take it from the first entry.
n = idents[0]["n"]
assert {ident["n"] for ident in idents} == {n}

In [29]:
from collections import defaultdict

def _get_zip(zf, filter=None):
    global raw
    data = []
    for f in zf.filelist:
        assert ".msgpack" in f.filename
        ir = zf.read(f)
        raw = msgpack.loads(ir)
        assert set(raw.keys()) == {"embedding", "history", "perf", "params", "meta"}
        _ = raw.pop("history")
        assert "history" not in raw
        n = raw["params"]["n"]
        if filter is None or filter(raw):
            data.append(raw)
    return data

data = []

# One predicate per archive; `_get_zip` applies it to each decoded record.
filters = {
    "random-n=[90, 180, 300].zip": (
        lambda rec: rec["meta"]["sampling"] == "random" and rec["meta"]["n"] in [90, 180, 300]
    ),
    "n=30.zip": (lambda rec: rec["meta"]["n"] == 30),
    "embeddings.zip": (lambda rec: rec["meta"]["sampling"] == "active"),
}

# Archives are visited in this fixed order; for each kept record, one entry is
# added per ident that it matches.
for fname in ("embeddings.zip", "n=30.zip", "random-n=[90, 180, 300].zip"):
    with ZipFile(f"io/{ID}/{fname}") as zf:
        print(f"{len(zf.filelist)} embeddings")
        _data = _get_zip(zf, filter=filters[fname])
        data.extend(
            {"embedding": rec["embedding"], "ident": ident}
            for rec in _data
            for ident in idents
            if _matches(rec["meta"], ident)
        )

115 embeddings
254 embeddings
690 embeddings


In [30]:
# Exactly one stored embedding is expected per requested ident.
print(len(data), len(idents), sep="\n")
assert len(idents) == len(data)

3
3


In [31]:
# Order the selected runs by ascending accuracy.
data = sorted(data, key=lambda entry: entry["ident"]["acc"])

In [32]:
from copy import deepcopy
def _cook(data):
    assert set(data.keys()) == {"embedding", "ident"}
    em = np.asarray(data["embedding"])
    ident = data["ident"]
    df = pd.DataFrame({str(k): em[:, k] for k in range(em.shape[1])})
    for k in ["0", "1"]:
        df[k] -= df[k].min()
        df[k] /= df[k].max()
        df[k] *= 2
        df[k] -= 1
    df["smoothness"] = np.linspace(0, 1, num=len(df)) #* -1
    for k, v in ident.items():
        df[k] = v
    df["col"] = df['acc'].apply(lambda x: f"{100 * x}%")
    return df
    
    
# Work on a copy of the selected entries, cook each one, and stack the results.
raw = deepcopy(data)
steaks = list(map(_cook, raw))
df = pd.concat(steaks)

In [33]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,0,1,smoothness,n_train,sampling,noise_model,seed,dataset,acc,n,col
0,0.509599,0.899917,0.0,780,random,CKL,10,human,0.801,30,80.10000000000001%
1,1.0,0.753093,0.034483,780,random,CKL,10,human,0.801,30,80.10000000000001%
2,0.244107,0.806243,0.068966,780,random,CKL,10,human,0.801,30,80.10000000000001%
3,0.872134,1.0,0.103448,780,random,CKL,10,human,0.801,30,80.10000000000001%
4,0.910908,0.744976,0.137931,780,random,CKL,10,human,0.801,30,80.10000000000001%


In [34]:
# Distinct accuracies, in order of first appearance in `df`.
accs = list(df["acc"].unique())

In [35]:
import altair as alt

def get_plot(df, w=150, title="", color=False):
    """Build a square Altair scatter chart of column ``"0"`` against ``"1"``.

    Parameters
    ----------
    df : DataFrame
        Data to plot; needs columns ``"0"`` and ``"1"``, plus ``"smoothness"``
        when ``color`` is true.
    w : int
        Used for both the width and the height of the chart.
    title : str
        Chart title.
    color : bool
        When true, points are coloured by ``"smoothness"`` using the
        ``"turbo"`` scheme. Other schemes tried here were viridis, cividis,
        redyellowblue (good on a dark background) and lightmulti.

    Returns
    -------
    The configured chart (not made interactive).
    """
    encodings = {
        "x": alt.X("0", axis=alt.Axis(labels=True), title="x"),
        "y": alt.Y("1", axis=alt.Axis(labels=True), title="y"),
    }
    if color:
        encodings["color"] = alt.Color(
            "smoothness", scale=alt.Scale(scheme="turbo"), title="Smoothness"
        )

    points = alt.Chart(df).mark_circle(size=100, line=True, opacity=0.9)
    encoded = points.encode(**encodings)
    return encoded.properties(width=w, height=w, title=title)

color = False


def _panel(idx):
    """Chart for the rows whose accuracy equals the ``idx``-th entry of ``accs``."""
    acc = accs[idx]
    return get_plot(df[df.acc == acc], title=f"Test Accuracy: {acc*100:0.1f}%", color=color)


# One panel per selected run, laid out side by side.
c1 = _panel(0)
c2 = _panel(1)
c3 = _panel(2)
# A fourth panel is left disabled: only three runs are selected.
c = c1 | c2 | c3
c
# c.properties(background="#000")