In [1]:
import time
from pathlib import Path

import roach

from relbench.datasets import get_dataset, get_dataset_names
from relbench.tasks import get_task, get_task_names

In [2]:
# Move any existing relbench cache out of the way instead of deleting it:
# the directory is renamed to relbench.<unix-seconds>, so later cells start
# from an empty cache while the old contents stay recoverable.
home = Path.home()
ts = int(time.time())  # whole seconds; used as the backup suffix
current_cache = home / ".cache" / "relbench"
backup_cache = home / ".cache" / f"relbench.{ts}"
try:
    current_cache.rename(backup_cache)
except FileNotFoundError:
    # Nothing at the cache path, so there is nothing to back up.
    pass

In [3]:
# Name of the roach queue; passed as the first argument of every
# roach.submit call in this notebook.
queue = "relbench_cpu"

In [4]:
# Registry of submitted jobs: maps a job key such as "download/rel-event",
# "make_db/<dataset>" or "make_tables/<dataset>/<task>" to the value returned
# by roach.submit for that job. join() reads it to build dependency checks.
task_files = {}

In [5]:
def join(task_keys):
    """Build a shell condition that succeeds once all the given jobs are done.

    Each key is looked up in the module-level ``task_files`` mapping. The
    ``/ready/`` part of the stored path is replaced with ``/done/`` and a
    ``test -f`` check is emitted for the resulting path; the checks are
    chained with ``&&``.
    NOTE(review): this presumes roach moves a job file from ``ready/`` to
    ``done/`` when the job finishes -- confirm against roach.

    Args:
        task_keys: Iterable of keys previously stored in ``task_files``.

    Returns:
        A shell command string. When ``task_keys`` is empty the always-true
        command ``"true"`` is returned, matching the "no prerequisite"
        convention used elsewhere in this notebook.

    Raises:
        KeyError: If a key has not been registered in ``task_files``.
    """
    import shlex  # local import so this cell does not depend on the import cell

    # Quote each path so that spaces or shell metacharacters in the queue
    # directory cannot split or alter the generated condition.
    tests = [
        "test -f " + shlex.quote(task_files[task_key].replace("/ready/", "/done/"))
        for task_key in task_keys
    ]
    if not tests:
        return "true"
    return " && ".join(tests)

In [6]:
# Queue a job that downloads the Kaggle archive for rel-event and moves it
# into data/rel-event (relative to wherever the job runs). The returned job
# path is registered so the rel-event database build can wait on it.
# NOTE(review): presumably needs the kaggle CLI and credentials on the worker -- confirm.
fetch_step = "kaggle competitions download -c event-recommendation-engine-challenge && "
mkdir_step = "mkdir -p data/rel-event && "
move_step = "mv event-recommendation-engine-challenge.zip data/rel-event"
cmd = fetch_step + mkdir_step + move_step
task_files["download/rel-event"] = roach.submit(queue, cmd)

In [7]:
# Queue a job that downloads the Kaggle archive for rel-hm and moves it into
# data/hm-recommendation (relative to wherever the job runs). The returned
# job path is registered so the rel-hm database build can wait on it.
# NOTE(review): presumably needs the kaggle CLI and credentials on the worker -- confirm.
fetch_step = "kaggle competitions download -c h-and-m-personalized-fashion-recommendations && "
mkdir_step = "mkdir -p data/hm-recommendation && "
move_step = "mv h-and-m-personalized-fashion-recommendations.zip data/hm-recommendation"
cmd = fetch_step + mkdir_step + move_step
task_files["download/rel-hm"] = roach.submit(queue, cmd)

In [8]:
# Queue one job per dataset that calls get_dataset(..., download=False).get_db()
# in a fresh Python process, and register it under "make_db/<dataset>".
# NOTE(review): download=False presumably forces a local build instead of
# fetching a prebuilt copy -- confirm against relbench.
for dataset_name in get_dataset_names():
    import_stmt = "from relbench.datasets import get_dataset; "
    build_stmt = f'get_dataset("{dataset_name}", download=False).get_db()'
    py_cmd = import_stmt + build_stmt
    cmd = f"python -c '{py_cmd}'"
    # rel-hm and rel-event wait for their download jobs; every other dataset
    # gets the always-succeeding shell command "true" as its prerequisite.
    requires = (
        join([f"download/{dataset_name}"])
        if dataset_name in ["rel-hm", "rel-event"]
        else "true"
    )
    task_files[f"make_db/{dataset_name}"] = roach.submit(queue, cmd, requires)

In [9]:
# Queue one job per (dataset, task) pair that loads the task with
# download=False and requests its train, val and test tables. Each job waits
# on the dataset's make_db job and is registered under
# "make_tables/<dataset>/<task>".
for dataset_name in get_dataset_names():
    for task_name in get_task_names(dataset_name):
        load_task = (
            "from relbench.tasks import get_task; "
            f'task = get_task("{dataset_name}", "{task_name}", download=False); '
        )
        build_splits = (
            'task.get_table("train"); '
            'task.get_table("val"); '
            'task.get_table("test")'
        )
        py_cmd = load_task + build_splits
        cmd = f"python -c '{py_cmd}'"
        requires = join([f"make_db/{dataset_name}"])
        job_file = roach.submit(queue, cmd, requires)
        task_files[f"make_tables/{dataset_name}/{task_name}"] = job_file

In [19]:
# Queue one job per dataset that changes into ~/.cache/relbench/<dataset>
# and runs `zip -r db db`. Each job waits on the dataset's make_db job; the
# submitted job path is not recorded.
for dataset_name in get_dataset_names():
    requires = join([f"make_db/{dataset_name}"])
    enter_dir = f"cd ~/.cache/relbench/{dataset_name} && "
    make_zip = "zip -r db db"
    cmd = enter_dir + make_zip
    roach.submit(queue, cmd, requires)

In [20]:
# Queue one job per (dataset, task) pair that changes into
# ~/.cache/relbench/<dataset>/tasks and runs `zip -r <task> <task>`. Each job
# waits on the matching make_tables job; the submitted job path is not recorded.
for dataset_name in get_dataset_names():
    for task_name in get_task_names(dataset_name):
        requires = join([f"make_tables/{dataset_name}/{task_name}"])
        enter_dir = f"cd ~/.cache/relbench/{dataset_name}/tasks && "
        make_zip = f"zip -r {task_name} {task_name}"
        cmd = enter_dir + make_zip
        roach.submit(queue, cmd, requires)

In [26]:
# Queue a single job that runs utils.db_hashes() once every make_db job is
# done. The bare final expression makes the cell display the submitted job path.
# NOTE(review): `utils` is presumably a module importable from the worker's
# working directory -- confirm.
join_list = [f"make_db/{dataset_name}" for dataset_name in get_dataset_names()]
requires = join(join_list)
cmd = "python -c 'import utils; utils.db_hashes()'"
roach.submit(queue, cmd, requires)

'/lfs/local/0/ranjanr/queues/relbench_cpu/ready/1720227536067304680'

In [28]:
# Queue a single job that runs utils.task_hashes() once every make_tables job
# is done. The bare final expression makes the cell display the submitted job path.
# NOTE(review): `utils` is presumably a module importable from the worker's
# working directory -- confirm.
join_list = [
    f"make_tables/{dataset_name}/{task_name}"
    for dataset_name in get_dataset_names()
    for task_name in get_task_names(dataset_name)
]
requires = join(join_list)
cmd = "python -c 'import utils; utils.task_hashes()'"
roach.submit(queue, cmd, requires)

'/lfs/local/0/ranjanr/queues/relbench_cpu/ready/1720227591302366062'