# DreamBank

This notebook extracts raw HTML dream reports and metadata from [DreamBank](https://dreambank.net).

In [1]:
import json
import os
import shutil

from bs4 import BeautifulSoup
import pandas as pd
import pooch
from tqdm import tqdm

In [2]:
# JSON file whose entries are passed to pooch as the known hash for each downloaded file.
REGISTRY_PATH = "./registry.json"
# Downloads are stored in a "dreambank" folder inside pooch's OS cache directory.
CACHE_DIR = pooch.os_cache("pooch") / "dreambank"
with open(REGISTRY_PATH, mode="r", encoding="utf-8") as f:
    registry = json.loads(f.read())

In [3]:
def compose_url(fname: str) -> str:
    """Translate a registry filename into the DreamBank URL it is fetched from.

    ``"grid.cgi"`` maps to the grid page. Every other name must have the form
    ``"<dataset>/<component>.html"`` where ``<component>`` is ``dreams``,
    ``info`` or ``moreinfo``.

    Raises
    ------
    AssertionError
        If ``fname`` is not ``"grid.cgi"`` and either does not contain exactly
        one ``/`` or does not end in ``.html``.
    ValueError
        If the component is not one of the three recognised names.
    """
    if fname == "grid.cgi":
        return "https://dreambank.net/grid.cgi"
    assert fname.count("/") == 1 and fname.endswith(".html"), f"Unexpected filename format: {fname}"
    # Drop the ".html" suffix, then split into the two path parts.
    dataset, component = fname[: -len(".html")].split("/")
    url_by_component = {
        "dreams": f"https://dreambank.net/random_sample.cgi?series={dataset}",
        "info": f"https://dreambank.net/more_info.cgi?series={dataset}",
        "moreinfo": f"https://dreambank.net/more_info.cgi?series={dataset}&further=1",
    }
    if component not in url_by_component:
        raise ValueError(f"Unknown component: {component}")
    return url_by_component[component]

In [4]:
# Advertise only gzip/deflate so the server is not invited to reply with brotli
# (presumably the download stack used here cannot decode brotli -- TODO confirm).
no_brotli_downloader = pooch.HTTPDownloader(
    headers={"Accept-Encoding": "gzip, deflate"},
)

In [5]:
def retrieve_with_retry(**kwargs) -> tuple[str, str]:
    """Wrap ``pooch.retrieve`` and retry without ``known_hash`` on a hash mismatch.

    All keyword arguments are forwarded to ``pooch.retrieve``. If that call
    raises the hash-mismatch ``ValueError``, the download is repeated with
    ``known_hash=None`` so the file currently served is accepted.

    Before retrying, a copy already stored at ``path/fname`` is deleted. With
    ``known_hash=None`` pooch does not check whether a newer version exists
    and simply returns a file that is already present, so without the
    deletion a stale cached copy (and its hash) would be returned instead of
    the fresh download.

    Returns
    -------
    tuple[str, str]
        The path returned by ``pooch.retrieve`` and the file's MD5 digest
        prefixed with ``"md5:"``.

    Raises
    ------
    ValueError
        Any ``ValueError`` from ``pooch.retrieve`` that is not a hash mismatch.
    """
    try:
        fname = pooch.retrieve(**kwargs)
    except ValueError as e:
        if "hash of downloaded file" not in str(e):
            raise
        # The first attempt reached the server, so dropping the outdated local
        # copy is safe: the retry below downloads the current file again.
        cache_dir, cached_name = kwargs.get("path"), kwargs.get("fname")
        if cache_dir is not None and cached_name is not None:
            stale_path = os.path.expanduser(os.path.join(cache_dir, cached_name))
            if os.path.isfile(stale_path):
                os.remove(stale_path)
        fname = pooch.retrieve(**{**kwargs, "known_hash": None})
    hash_ = "md5:" + pooch.file_hash(fname, "md5")
    return fname, hash_

## Grid page

The grid file is a DreamBank page that includes a table of all the datasets available in DreamBank and also longer text descriptions of them. This file is used to create a registry, ensuring that all available datasets are included. It also provides easy access to all the dataset IDs, which are used for subsequent web scraping.

In [6]:
GRID_FNAME = "grid.cgi"
# Fetch the grid page, then record whatever hash the retrieved copy has.
grid_request = dict(
    url=compose_url(GRID_FNAME),
    known_hash=registry[GRID_FNAME],
    fname=GRID_FNAME,
    path=CACHE_DIR,
    downloader=no_brotli_downloader,
)
fname, hash_ = retrieve_with_retry(**grid_request)
registry[GRID_FNAME] = hash_

In [7]:
# Parse the downloaded grid page and collect the value of every checkbox input;
# these values are used as the dataset IDs in the rest of the notebook.
with open(fname, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")
checkboxes = soup.find_all("input", type="checkbox")
datasets = [box.get("value") for box in checkboxes]
datasets.sort()
assert len(datasets) == len(set(datasets)), "Unexpected duplicate values found in `datasets`"

In [8]:
# The dataset prefixes of the registry keys (everything before the "/") must be
# exactly the dataset IDs scraped from the grid page -- nothing missing, nothing extra.
registry_datasets = {key.split("/")[0] for key in registry if key != "grid.cgi"}
symm_diff = set(datasets).symmetric_difference(registry_datasets)
assert not symm_diff, f"Datasets in one but not the other: {symm_diff}"

In [9]:
# Make sure the cache directory holds nothing but the grid page and entries
# for datasets that are in the current dataset list.
for cached_fname in os.listdir(CACHE_DIR):
    if cached_fname != "grid.cgi":
        dataset = cached_fname.partition("/")[0]
        assert dataset in datasets, f"Found unexpected dataset in cache: {dataset}"

## Datasets and metadata

In [10]:
# Download the three pages of every dataset and refresh their registry hashes.
for dataset in (pbar := tqdm(datasets)):
    pbar.set_description(f"Retrieving dataset: {dataset}")
    for component in ["info", "moreinfo", "dreams"]:
        # Loop-specific names: the notebook-level `fname` holds the path of the
        # downloaded grid page, and overwriting it here would make the
        # grid-parsing cell open the wrong file when it is re-run.
        registry_key = f"{dataset}/{component}.html"
        _cached_path, fresh_hash = retrieve_with_retry(
            url=compose_url(registry_key),
            known_hash=registry[registry_key],
            fname=registry_key,
            path=CACHE_DIR,
            downloader=no_brotli_downloader,
        )
        registry[registry_key] = fresh_hash

Retrieving dataset: zurich-m.de: 100%|██████████| 94/94 [00:00<00:00, 368.80it/s]        


## Export

* Rewrite the registry JSON file in case anything got updated.
* Compress the cache directory into a `.tar.xz` archive.

In [11]:
# Write the registry back to disk so any hashes refreshed above are persisted.
with open(REGISTRY_PATH, mode="w", encoding="utf-8") as f:
    f.write(json.dumps(registry, indent=4, ensure_ascii=True))

In [12]:
# Bundle the cache directory into one xz-compressed tarball for easier distribution
ARCHIVE_PATH = "./output/dreambank"
archive_dir = os.path.dirname(ARCHIVE_PATH)
os.makedirs(archive_dir, exist_ok=True)
outpath = shutil.make_archive(base_name=ARCHIVE_PATH, format="xztar", root_dir=CACHE_DIR)

# Print details that will get manually copied to README.md
archive_name = os.path.basename(outpath)
size_mb = os.path.getsize(outpath) / 1e6
print(f"file: {archive_name}")
print(f"size: {size_mb} MB")
for alg in ("md5", "sha256"):
    digest = pooch.file_hash(outpath, alg=alg)
    print(f"{alg}: {digest}")

file: dreambank.tar.xz
size: 8.321316 MB
md5: 6ab629e9c13251d228db7ec1a93ffeb6
sha256: e32156310682bce27a52cfd5e84ee4ebf90a638e6c2832d19fa4fb71555729e2
