In [1]:
# Unpack the cached frequency table inside input-data/
# (unzip -u: only extract files that are new or updated, -q: quiet).
!cd input-data; unzip -uq freqs.msgpack.zip; cd ..

In [2]:
# Unpack the raw Zappos responses inside input-data/
# (unzip -u: only extract files that are new or updated, -q: quiet).
!cd input-data; unzip -uq zappos.csv.zip; cd ..

In [3]:
import pandas as pd
from pathlib import Path
import requests
import yaml
import json
import numpy as np
import itertools
from time import time, sleep

In [4]:
from numba import jit
import joblib
from joblib import Parallel, delayed
from typing import Tuple, Dict

@jit(nopython=True, nogil=True)
def __get_freqs(h: int, o1: int, o2: int, answers: np.ndarray) -> Tuple[int, int]:
    """Count how often each of ``o1`` / ``o2`` won when compared under head ``h``.

    Parameters
    ----------
    h, o1, o2 : int
        Head item and the two items being compared.
    answers : np.ndarray
        2-D integer array indexed as ``answers[:, 0]`` (head),
        ``answers[:, 1]`` (winner) and ``answers[:, 2]`` (loser).

    Returns
    -------
    (o1_wins, o2_wins) : Tuple[int, int]
        Number of matching rows whose winner column equals ``o1`` and ``o2``
        respectively; ``(0, 0)`` when the pair was never shown under ``h``.
    """
    winner = answers[:, 1]
    loser = answers[:, 2]
    head = answers[:, 0] == h
    # The pair may have been recorded in either (winner, loser) order.
    ans1 = (winner == o1) & (loser == o2)
    ans2 = (winner == o2) & (loser == o1)
    bool_idx = head & (ans1 | ans2)
    # Count straight from the masks instead of materialising the matching rows;
    # an empty selection naturally sums to 0.
    o1_wins = (bool_idx & (winner == o1)).sum()
    o2_wins = (bool_idx & (winner == o2)).sum()
    return o1_wins, o2_wins

Query = Tuple[int, int, int]
Freq = Tuple[int, int]

def _get_freqs(n: int, train_ans: pd.DataFrame) -> Dict[Query, Freq]:
    def _inner_loop(h, arms, train_ans):
        print(h)
        prob = {
            (h, o1, o2): __get_freqs(h, o1, o2, train_ans)
            for o1 in arms - {h}
            for o2 in arms - {h, o1}
        }
        return prob

    arms = set(range(n))
    responses = [_inner_loop(h, arms, train_ans) for h in arms]
    out = {}
    for r in responses:
        out.update(r)
    return out

In [5]:
from sklearn.model_selection import train_test_split
# Load the three triplet columns and relabel them head / winner / loser.
# NOTE(review): the relabelling is positional, so it relies on the column order
# that read_csv returns -- confirm "b" really is the winner and "c" the loser.
responses = pd.read_csv("input-data/zappos.csv", usecols=["head", "b", "c"])
responses.columns = ["head", "winner", "loser"]

# Number of distinct head items; later used as the number of targets.
# NOTE(review): assumes item ids are exactly 0..N-1 and that every item
# appears at least once as a head -- confirm against the data.
N = responses["head"].nunique()

# Seeded 80/20 split, converted to plain arrays whose columns are
# [head, winner, loser].
train, test = train_test_split(responses, random_state=42, test_size=0.2)
train_ans = train.to_numpy()
test_ans = test.to_numpy()

In [6]:
%%time
import pickle
from joblib import Parallel, delayed
import msgpack

# Manual toggle: change to True to rebuild the frequency table from train_ans
# and overwrite the cached file; otherwise the cached file is loaded.
# NOTE(review): _get_freqs keys its result by (head, o1, o2) tuples -- confirm
# such a table round-trips through msgpack in the form the cached file uses.
if False:
    # about 52 minutes
    freqs = _get_freqs(N, train_ans)
    with open("input-data/freqs.msgpack", "wb") as f:
        msgpack.dump(freqs, f)
else:
    # Fast path: read the precomputed table from disk.
    with open("input-data/freqs.msgpack", "rb") as f:
        freqs = msgpack.load(f)

CPU times: user 484 ms, sys: 47.6 ms, total: 532 ms
Wall time: 536 ms


In [7]:
# sampling = "adaptive"
sampling = "random"
alg = "TSTE"
random_state = 1

# Experiment definition sent to the server: one target per item id (as a
# string), the value of "d", and a single sampler registered under `alg`.
config = {
    "targets": list(map(str, range(N))),
    "d": 2,
    "samplers": {
        alg: dict(
            optimizer="Embedding",
            optimizer__lr=0.1,
            random_state=random_state,
            sampling=sampling,
        ),
    },
}
# Echo the configuration without the long target list.
{key: value for key, value in config.items() if key != "targets"}

{'d': 2,
 'samplers': {'TSTE': {'optimizer': 'Embedding',
   'optimizer__lr': 0.1,
   'random_state': 1,
   'sampling': 'random'}}}

In [6]:
# Base URL of the server; every request in this notebook goes to port 8421.
base = "http://127.0.0.1"
# Reset the server first (presumably clears any previous experiment -- confirm),
# then upload the experiment configuration. Both calls must answer 200.
# NOTE(review): credentials are hard-coded; acceptable only for a local
# throw-away server.
r = requests.get(f"{base}:8421/reset?force=1", auth=("foo", "bar"))
print("done1")
assert r.status_code == 200
# The config is sent as the ASCII bytes of the dict's Python repr.
r = requests.post(f"{base}:8421/init_exp", data={"exp": bytes(str(config), "ascii")}, auth=("foo", "bar"))
assert r.status_code == 200
print("done2")
# Short pause before the following cells talk to the server
# (presumably lets the experiment finish starting up -- confirm).
sleep(1)

done1
done2


In [7]:
# Show the body of the most recent response (the init_exp POST).
r.text

'<html><body>\n        <br><br>\n        <p>\n        Now, Salmon presents the following interfaces:\n        </p>\n        <p><ul style="text-align: center;">\n        <li><a href="/">Query page</a>. Send this page to crowdsourcing participants.</li>\n        <li><a href="/dashboard">Dashboard</a>. Use this page to monitor experimental progress.</li>\n        </ul></p>\n        </body></html>\n'

In [8]:
from time import time, sleep

targets = config["targets"]
n = len(targets)
d = 2  # same value as config["d"], repeated here by hand

# Rough answer budget 10 * n * d * log2(n); printed for reference only.
num_ans = 10 * n * d * np.log2(n)
print(num_ans)

# Container for per-answer records (re-initialised by the simulation cell).
data = []

10895.964591434093


In [9]:
def _answer(q, freqs):
    h, o1, o2 = q
    ret = None
    key = bytes(f'{h}-{o1}-{o2}', "ascii")
    if key not in freqs:
        return None
    o1_wins, o2_wins = freqs[key]
    return o1_wins, o2_wins

def answer_with(q, freqs, rng):
    """Simulate one human answer to the triplet ``q``.

    The winner is drawn with ``rng.choice`` from a list that contains ``o1``
    once per recorded ``o1`` win and ``o2`` once per recorded ``o2`` win, so
    each item is picked in proportion to how often it won.

    Parameters
    ----------
    q : tuple
        ``(head, o1, o2)``.
    freqs : mapping
        Frequency table passed through to ``_answer``.
    rng : object with a ``choice(sequence)`` method
        Source of randomness, e.g. ``np.random.RandomState``.

    Returns
    -------
    int or None
        The chosen winner, or ``None`` when the triplet is missing from the
        table or has no recorded wins for either item.
    """
    counts = _answer(q, freqs)
    if counts is None:
        # Unknown triplet: report "no answer" instead of failing on unpacking.
        return None
    _, o1, o2 = q
    o1_wins, o2_wins = counts
    answers = [o1] * o1_wins + [o2] * o2_wins
    if not answers:
        return None
    return int(rng.choice(answers))

In [10]:
import sys
from pathlib import Path
# Make gram_utils importable: it is looked up two directories above the
# current working directory, under salmon/triplets/algs/adaptive/search.
project_dir = Path(".").absolute().parent.parent
search = project_dir / "salmon" / "triplets" / "algs" / "adaptive" / "search"
sys.path.append(str(search))
import gram_utils

def score(embedding, queries):
    """Fraction of triplets in ``queries`` that ``embedding`` gets right.

    ``queries`` is an integer array whose columns are
    ``[head, winner, loser]``. A triplet counts as correct when the
    head-to-winner distance is no larger than the head-to-loser distance,
    with distances taken from ``gram_utils``.
    """
    pairwise = gram_utils.distances(gram_utils.gram_matrix(embedding))
    heads = queries[:, 0]
    to_winner = pairwise[heads, queries[:, 1]]
    to_loser = pairwise[heads, queries[:, 2]]
    return (to_winner <= to_loser).mean()

In [None]:
rng = np.random.RandomState(random_state)

# Simulated crowd session: for each step fetch a query, answer it from the
# frequency table, post the answer, then fetch the current model and score it
# on the held-out triplets. One record per answered query goes into `data`.
data = []
for k in range(12_000):
    datum = {"client_num_ans": k + 1, "loop_start_time": time()}
    if k % 100 == 0:
        # Progress marker plus a checkpoint of everything recorded so far.
        print(datum["client_num_ans"])
        df = pd.DataFrame(data)
        df.to_parquet(f"{sampling}-data.parquet")

    # 1. Ask the server for the next triplet.
    _start = time()
    q = requests.get(f"{base}:8421/query")
    assert q.status_code == 200
    q = q.json()
    datum["get_query_time"] = time() - _start
    datum.update(q)

    # 2. Draw a simulated answer; triplets without usable counts are skipped
    #    entirely (nothing is posted and no record is kept).
    winner = answer_with((q["head"], q["left"], q["right"]), freqs, rng)
    if winner is None:
        continue
    datum["winner"] = winner

    # 3. Post the answer back to the server.
    answer = {"winner": winner, "puid": "0", "response_time": 0, **q}
    sleep(0.25)  # "human response time"
    _start = time()
    a = requests.post(f"{base}:8421/answer", data=json.dumps(answer))
    datum["post_answer_time"] = time() - _start
    assert a.status_code == 200

    # 4. Fetch the current model and evaluate it on the held-out answers.
    _start = time()
    e = requests.get(f"{base}:8421/model/{alg}").json()
    em = np.array(e.pop("embedding"))
    datum.update(e)

    _score = score(em, test_ans)
    datum.update({"accuracy": _score})

    datum["embedding_max"] = np.abs(em).max()
    # NOTE(review): this timer also covers the scoring above, not just the
    # model request -- confirm that is the intended meaning of the column.
    datum["get_model_time"] = time() - _start
    data.append(datum)

    if k % 50 == 0:
        df = pd.DataFrame(data)
        df.to_parquet(f"{sampling}-data.parquet")

# Final flush: the in-loop checkpoints only fire on some iterations, so
# without this the records gathered after the last checkpoint are never saved.
df = pd.DataFrame(data)
df.to_parquet(f"{sampling}-data.parquet")

1
