In [1]:
from collections import defaultdict
import time
from pathlib import Path

import roach

from relbench.datasets import get_dataset, get_dataset_names
from relbench.tasks import get_task, get_task_names

In [2]:
# Move any existing relbench cache aside under a timestamped name rather than
# deleting it, so the jobs below start from an empty cache but nothing is lost.
home = Path.home()
ts = int(time.time())
old_cache = Path(f"{home}/.cache/relbench")
try:
    old_cache.rename(f"{home}/.cache/relbench.{ts}")
except FileNotFoundError:
    # No cache directory exists yet, so there is nothing to back up.
    pass

In [35]:
# Same backup-by-rename as for the main cache, applied to the upload staging
# directory (~/.cache/relbench_upload) that the zip jobs below write into.
home = Path.home()
ts = int(time.time())
old_upload = Path(f"{home}/.cache/relbench_upload")
try:
    old_upload.rename(f"{home}/.cache/relbench_upload.{ts}")
except FileNotFoundError:
    # No staging directory exists yet, so there is nothing to back up.
    pass

In [2]:
# Name of the roach queue that every roach.submit() call in this notebook targets.
queue = "relbench/2024-07-09_cpu"

In [3]:
# Maps a task key (e.g. "make_db/<dataset>") to the value roach.submit() returned
# for that job. Keys that were never submitted fall back to "~/README.md", so the
# `test -f` check that join() builds for them refers to a file expected to exist.
# NOTE(review): because of this default, a mistyped key is silently treated as an
# already-satisfied dependency instead of raising KeyError.
task_files = defaultdict(lambda: "~/README.md") # just something that we know is a file

In [4]:
def join(task_keys):
    """Build the shell condition that gates a job on earlier jobs.

    Each key is looked up in the module-level ``task_files`` mapping. The
    stored path has ``/ready/`` replaced by ``/done/`` and is wrapped in a
    ``test -f`` check; the checks are chained with ``&&`` so the condition
    only holds once every one of those files exists.
    (Presumably roach moves a job file from ``ready`` to ``done`` when the
    job finishes -- confirm against roach.)

    Args:
        task_keys: Iterable of keys into ``task_files``,
            e.g. ``["make_db/rel-hm"]``.

    Returns:
        A shell expression string. For an empty ``task_keys`` this is
        ``"true"``, the same always-satisfied condition the notebook uses
        for jobs without prerequisites.
    """
    tests = [
        f"test -f {task_files[task_key].replace('/ready/', '/done/')}"
        for task_key in task_keys
    ]
    # An empty "&&" chain would be the empty string, which is not a usable
    # shell condition; fall back to the explicit no-op condition instead.
    return " && ".join(tests) if tests else "true"

In [6]:
# Fetch the raw Kaggle archive for rel-event and move it into data/rel-event.
steps = [
    "kaggle competitions download -c event-recommendation-engine-challenge",
    "mkdir -p data/rel-event",
    "mv event-recommendation-engine-challenge.zip data/rel-event",
]
cmd = " && ".join(steps)  # each step runs only if the previous one succeeded
task_files["download/rel-event"] = roach.submit(queue, cmd)

In [7]:
# Fetch the raw Kaggle archive for rel-hm and move it into data/hm-recommendation.
steps = [
    "kaggle competitions download -c h-and-m-personalized-fashion-recommendations",
    "mkdir -p data/hm-recommendation",
    "mv h-and-m-personalized-fashion-recommendations.zip data/hm-recommendation",
]
cmd = " && ".join(steps)  # each step runs only if the previous one succeeded
task_files["download/rel-hm"] = roach.submit(queue, cmd)

In [5]:
# Submit one job per dataset that calls get_dataset(..., download=False).get_db().
for dataset_name in get_dataset_names():
    py_cmd = (
        "from relbench.datasets import get_dataset; "
        f'get_dataset("{dataset_name}", download=False).get_db()'
    )
    # py_cmd is wrapped in single quotes for the shell below, so a single quote
    # inside it would end the quoting early and corrupt the command.
    assert "'" not in py_cmd
    cmd = f"python -c '{py_cmd}'"
    if dataset_name in ["rel-hm", "rel-event"]:
        # These two datasets have a Kaggle download job submitted for them in
        # this notebook; wait for it before building the database.
        requires = join([f"download/{dataset_name}"])
    else:
        requires = "true"  # no prerequisite job
    task_files[f"make_db/{dataset_name}"] = roach.submit(queue, cmd, requires)

In [6]:
# Submit one job per (dataset, task) that requests the train/val/test tables.
# Each job waits for the matching make_db job.
for dataset_name in get_dataset_names():
    for task_name in get_task_names(dataset_name):
        py_cmd = (
            "from relbench.tasks import get_task; "
            f'task = get_task("{dataset_name}", "{task_name}", download=False); '
            'task.get_table("train"); '
            'task.get_table("val"); '
            'task.get_table("test")'
        )
        # py_cmd is wrapped in single quotes for the shell below, so a single
        # quote inside it would end the quoting early and corrupt the command.
        assert "'" not in py_cmd
        cmd = f"python -c '{py_cmd}'"
        requires = join([f"make_db/{dataset_name}"])
        task_files[f"make_tables/{dataset_name}/{task_name}"] = roach.submit(queue, cmd, requires)

In [7]:
# For every dataset, zip its cached `db` directory and move the archive into the
# upload staging area. Each job waits for the dataset's make_db job.
for dataset_name in get_dataset_names():
    upload_dir = f"~/.cache/relbench_upload/{dataset_name}"
    steps = [
        f"cd ~/.cache/relbench/{dataset_name}",
        "zip -r db db",
        f"mkdir -p {upload_dir}",
        f"mv db.zip {upload_dir}",
    ]
    cmd = " && ".join(steps)
    requires = join([f"make_db/{dataset_name}"])
    task_files[f"zip_db/{dataset_name}"] = roach.submit(queue, cmd, requires)

In [8]:
# For every (dataset, task), zip the cached task directory and move the archive
# into the upload staging area. Each job waits for the matching make_tables job.
for dataset_name in get_dataset_names():
    tasks_dir = f"~/.cache/relbench/{dataset_name}/tasks"
    upload_dir = f"~/.cache/relbench_upload/{dataset_name}/tasks"
    for task_name in get_task_names(dataset_name):
        steps = [
            f"cd {tasks_dir}",
            f"zip -r {task_name} {task_name}",
            f"mkdir -p {upload_dir}",
            f"mv {task_name}.zip {upload_dir}",
        ]
        cmd = " && ".join(steps)
        requires = join([f"make_tables/{dataset_name}/{task_name}"])
        task_files[f"zip_task/{dataset_name}/{task_name}"] = roach.submit(queue, cmd, requires)

In [9]:
# After every zip_db job is done, run utils.db_hashes over the upload staging
# directory (presumably it writes the database hashes to out_file -- see utils).
join_list = [f"zip_db/{dataset_name}" for dataset_name in get_dataset_names()]
requires = join(join_list)
py_cmd = "".join(
    [
        "import utils; ",
        'utils.db_hashes(in_dir="~/.cache/relbench_upload", ',
        'out_file="/lfs/local/0/ranjanr/relbench/relbench/datasets/hashes.json")',
    ]
)
# The command is wrapped in single quotes for the shell, so it must not contain any.
assert "'" not in py_cmd
cmd = f"python -c '{py_cmd}'"
roach.submit(queue, cmd, requires)

'/lfs/local/0/ranjanr/queues/relbench/2024-07-09_cpu/ready/1721947570435203965'

In [10]:
# After every zip_task job is done, run utils.task_hashes over the upload staging
# directory (presumably it writes the task hashes to out_file -- see utils).
join_list = [
    f"zip_task/{dataset_name}/{task_name}"
    for dataset_name in get_dataset_names()
    for task_name in get_task_names(dataset_name)
]
requires = join(join_list)
py_cmd = "".join(
    [
        "import utils; ",
        'utils.task_hashes(in_dir="~/.cache/relbench_upload", ',
        'out_file="/lfs/local/0/ranjanr/relbench/relbench/tasks/hashes.json")',
    ]
)
# The command is wrapped in single quotes for the shell, so it must not contain any.
assert "'" not in py_cmd
cmd = f"python -c '{py_cmd}'"
roach.submit(queue, cmd, requires)

'/lfs/local/0/ranjanr/queues/relbench/2024-07-09_cpu/ready/1721947578706268914'

# upload

In [29]:
%%bash
# Recursively copy everything under the local upload staging directory
# (~/.cache/relbench_upload) to /lfs/0/staging/download on relbench.stanford.edu.
# NOTE(review): the output saved with this cell is from a different scp command
# (ampere4 -> relbench) that failed with "Permission denied"; re-run to refresh it.
scp -r ~/.cache/relbench_upload/* ranjanr@relbench.stanford.edu:/lfs/0/staging/download

ranjanr@relbench.stanford.edu: Permission denied (publickey,gssapi-keyex,gssapi-with-mic,password).
Permission denied, please try again.
Permission denied, please try again.
ranjanr@ampere4.stanford.edu: Permission denied (publickey,gssapi-keyex,gssapi-with-mic,password).


CalledProcessError: Command 'b'scp -r ranjanr@ampere4.stanford.edu:~/.cache/relbench/**.zip ranjanr@relbench.stanford.edu:/lfs/0/staging/download\n'' returned non-zero exit status 1.