## DreamBank

Convert DreamBank's raw HTML files into tabular format.

In [1]:
from datetime import datetime, timezone
import os
import re
from requests.exceptions import HTTPError
import tarfile

from bs4 import BeautifulSoup
import pandas as pd
import pooch
import tqdm

In [2]:
# Location and checksum of the raw DreamBank HTML archive.
RAW_URL = "https://github.com/krank-sources/dreambank/releases/download/v1/dreambank.tar.xz"
RAW_HASH = "md5:6ab629e9c13251d228db7ec1a93ffeb6"
try:
    # pooch downloads the archive (or reuses its cached copy) and verifies the hash.
    archive_fname = pooch.retrieve(RAW_URL, RAW_HASH, progressbar=True)
except HTTPError as e:
    if str(e).startswith("404 Client Error: Not Found for url"):
        # The release asset is not published: fall back to a locally built archive.
        archive_fname = "./raw/output/dreambank.tar.xz"
    else:
        # Any other HTTP failure must not be swallowed; otherwise `archive_fname`
        # stays undefined and a later cell fails with an unrelated NameError.
        raise

Downloading data from 'https://github.com/krank-sources/dreambank/releases/download/v1/dreambank.tar.xz' to file 'C:\Users\remra\AppData\Local\pooch\pooch\Cache\1078757df23e3b53c13d2bcbb0de123c-dreambank.tar.xz'.


In [3]:
# Every directory in the archive (other than the root ".") is one dataset;
# keep only the final path component as the dataset name.
with tarfile.open(archive_fname, "r:xz") as archive:
    datasets = [
        os.path.basename(entry.name)
        for entry in archive.getmembers()
        if entry.isdir() and entry.name != "."
    ]

# Non-English datasets carry a language suffix (e.g. ".de"), so any name
# containing a dot is dropped.
for dotted_name in [name for name in datasets if "." in name]:
    datasets.remove(dotted_name)
    print(f"Dropping non-English dataset: {dotted_name}")

Dropping non-English dataset: german-f.de
Dropping non-English dataset: german-m.de
Dropping non-English dataset: vonuslar.de
Dropping non-English dataset: zurich-f.de
Dropping non-English dataset: zurich-m.de


## Define functions for HTML extraction

In [4]:
def extract_info_file_content(dataset: str) -> bytes:
    """Return the raw bytes of a dataset's ``info.html`` page.

    The page is read from the member ``./{dataset}/info.html`` of the xz-compressed
    tar archive located at the module-level ``archive_fname``.
    """
    member_path = f"./{dataset}/info.html"
    with tarfile.open(archive_fname, "r:xz") as archive, archive.extractfile(member_path) as handle:
        return handle.read()


def extract_dream_file_content(dataset: str) -> bytes:
    """Return the raw bytes of a dataset's ``dreams.html`` page.

    The page is read from the member ``./{dataset}/dreams.html`` of the xz-compressed
    tar archive located at the module-level ``archive_fname``.
    """
    member_path = f"./{dataset}/dreams.html"
    with tarfile.open(archive_fname, "r:xz") as archive, archive.extractfile(member_path) as handle:
        return handle.read()

In [5]:
def extract_dreams_from_html(dataset: str) -> pd.DataFrame:
    """Turn a dataset's DreamBank dreams page into a one-column DataFrame.

    Each qualifying ``<span>`` on the page yields one row. The returned frame has a
    single ``dream`` column of pandas ``string`` dtype holding the span text with its
    leading token and trailing line removed.
    """
    html = extract_dream_file_content(dataset)
    soup = BeautifulSoup(html, "html.parser", from_encoding="ISO-8859-1")

    def _clean(span) -> str:
        raw = span.get_text(separator=" ", strip=True)
        # Discard the first space-delimited token (the dream number).
        without_number = raw.split(" ", 1)[1]
        # Discard whatever follows the last newline (the word count).
        return without_number.rsplit("\n", 1)[0]

    # Spans with the "comment" class are skipped: their content already appears
    # inside the regular dream spans as bracketed text. Styled spans are skipped too.
    texts = [
        _clean(span)
        for span in soup.find_all(
            "span", style=False, class_=lambda css_class: css_class != "comment"
        )
    ]

    # NOTE: A stricter parsing variant is currently disabled. It captured the dream
    # number (sometimes a string such as "111a") and an optional dreamer-provided date
    # with r"^#(\S+) ((\(\S*\)) )?", stripped the word count with
    # r"[ \n]?\([0-9]+ words\)$", rejected duplicate dream numbers, and checked that
    # the number of extracted dreams equalled both counts stated after the page's
    # <h4> heading (total dreams in the dataset and dreams displayed on the page).
    dreams = pd.DataFrame(texts, columns=["dream"])
    return dreams.astype({"dream": "string"})


def extract_info_from_html(dataset: str) -> dict:
    """Parse DreamBank HTML info page for a given dataset into a dictionary.

    All values are returned as strings exactly as scraped from the page; nothing
    is converted to a numeric type.

    * long_name (str): The dataset title (text following "Dream series:").
    * n_dreams (str): The total number of dreams (text following "Number of dreams:").
    * timeframe (str): Provided year or timeframe (text following "Year:").
    * sex (str): The provided sex of the dreamer (text following "Sex of the dreamer(s):").
    * description (str): A long-form description of the dataset.

    Raises:
        AssertionError: If the description cannot be located in the page text.
    """
    content = extract_info_file_content(dataset)
    soup = BeautifulSoup(content, "html.parser", from_encoding="ISO-8859-1")
    body = soup.find("body")

    def _labelled_value(label: str) -> str:
        """Return the stripped text of the element parsed right after `label`."""
        return body.find(string=label).next.get_text(strip=True)

    long_name = _labelled_value("Dream series:")
    n_dreams = _labelled_value("Number of dreams:")
    timeframe = _labelled_value("Year:")
    sex = _labelled_value("Sex of the dreamer(s):")

    # The description is the text between the "Sex of the dreamer(s)" line and the
    # "[Back to search form]" footer, minus an optional "further analyses" line.
    # `sex` is scraped text, so it is escaped before being embedded in the pattern;
    # otherwise characters such as "(" or "+" would be read as regex syntax.
    pattern = (
        r".*Sex of the dreamer\(s\): "
        + re.escape(sex)
        + r"\n\n\n?(.*?)\s+(For the further analyses, click here.\n)?\[Back to search form\]\s+$"
    )
    match_ = re.match(pattern, body.get_text(), flags=re.DOTALL)
    if match_ is None:
        # Raised explicitly (rather than via `assert`) so the check survives `python -O`.
        raise AssertionError(f"Error parsing info description for dataset {dataset}.")
    description = match_.group(1)
    info = {
        "long_name": long_name,
        "n_dreams": n_dreams,
        "timeframe": timeframe,
        "sex": sex,
        "description": description,
    }
    return info

## Process each dataset

In [6]:
# Parse the info page and the dreams page of every retained dataset, keyed by
# dataset name. The progress bar description shows the dataset being processed.
extracted_data = {}
pbar = tqdm.tqdm(datasets, ncols=90)
for dataset in pbar:
    pbar.set_description(f"Processing dataset {dataset}")
    extracted_data[dataset] = dict(
        info=extract_info_from_html(dataset),
        dreams=extract_dreams_from_html(dataset),
    )

Processing dataset west_coast_teens: 100%|████████████████| 89/89 [03:10<00:00,  2.14s/it]


In [7]:
# One row per dataset: the info dicts become rows, and the dataset name is moved
# from the index into a regular "dataset" column.
info_by_dataset = {name: parsed["info"] for name, parsed in extracted_data.items()}
all_info = pd.DataFrame.from_dict(info_by_dataset, orient="index")
all_info = all_info.rename_axis("dataset").reset_index(drop=False)

# One row per dream: stack the per-dataset frames under a "dataset" key, discard the
# inner per-dataset row index, and move the dataset name into a regular column.
dreams_by_dataset = {name: parsed["dreams"] for name, parsed in extracted_data.items()}
all_dreams = pd.concat(dreams_by_dataset, names=["dataset"])
all_dreams = all_dreams.droplevel(1).reset_index(drop=False)

## Export

In [10]:
# Output location and file names.
OUTDIR = "./output"
DREAMS_FNAME = "dreams.csv"
INFO_FNAME = "info.csv"
os.makedirs(OUTDIR, exist_ok=True)
dreams_outpath = f"{OUTDIR}/{DREAMS_FNAME}"
info_outpath = f"{OUTDIR}/{INFO_FNAME}"

# CSV settings shared by both exports.
TO_CSV_KWARGS = dict(
    index=False,
    sep=",",
    mode="x",  # Exclusive creation: fails if the file exists. Use `w` to overwrite.
    encoding="utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    lineterminator="\n",
    quoting=2,  # 2 = csv.QUOTE_NONNUMERIC
    quotechar='"',
    doublequote=True,
)
for frame, outpath in ((all_dreams, dreams_outpath), (all_info, info_outpath)):
    frame.to_csv(outpath, **TO_CSV_KWARGS)

In [11]:
# Print name, size, checksums and UTC modification time of each exported file,
# followed by a blank line.
for fn in (dreams_outpath, info_outpath):
    report_lines = [
        f"file: {os.path.basename(fn)}",
        f"size: {os.path.getsize(fn) / 1e6} MB",
        f"md5: {pooch.file_hash(fn, alg='md5')}",
        f"sha256: {pooch.file_hash(fn, alg='sha256')}",
        f"timestamp: {datetime.fromtimestamp(os.path.getmtime(fn), tz=timezone.utc).isoformat(timespec='seconds')}",
    ]
    print("\n".join(report_lines), end="\n\n")

file: dreams.csv
size: 33.72761 MB
md5: 79c40db24e7343cbab2ba88040fbd6da
sha256: bef1b3121d402e59eb2480176048c127d13e46a3529b850ad407cf3b1d3674aa
timestamp: 2025-12-29T01:47:19+00:00

file: info.csv
size: 0.059037 MB
md5: e169da104778fc00536f5f55b68ddabf
sha256: 30a7fc3ead9a3fc02fa9dd0c8898f1aadb16e40250ee23212ac94132a663e4f5
timestamp: 2025-12-29T01:47:19+00:00

