# urbina1975

See [GitHub Issue](https://github.com/remrama/dreambank/issues/5).

In [1]:
from datetime import datetime, timezone
import os

import pandas as pd
import pooch

In [2]:
# DreamBank dataset identifiers that make up this collection.
# The trailing letter encodes the dreamer's sex (f = female, m = male).
DATASETS = [
    "peru-f",
    "peru-m",
]

## Load

In [3]:
# Fetch the two DreamBank tables from a pinned GitHub release.
# pooch caches each download and checks it against the given md5 hash.
DREAMBANK_VERSION = "v1.0.0-alpha5"
release_url = f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}"

datasets_kwargs = dict(
    url=f"{release_url}/datasets.csv.xz",
    known_hash="md5:1475582e2daa1da53920df50cb9fc98e",
)
dreams_kwargs = dict(
    url=f"{release_url}/dreams.csv.xz",
    known_hash="md5:2dcab92f9d9515df174388babb5c9e5a",
    progressbar=True,
)

datasets_fname = pooch.retrieve(**datasets_kwargs)
dreams_fname = pooch.retrieve(**dreams_kwargs)

# While developing a new release, point at the local build output instead
# by uncommenting the two lines below.
# datasets_fname = "../output/datasets.csv.xz"
# dreams_fname = "../output/dreams.csv.xz"

In [4]:
# Read both release tables into memory with pandas' default CSV settings.
# NOTE(review): `datasets` is not referenced again in the visible cells.
datasets, dreams = (
    pd.read_csv(table_path) for table_path in (datasets_fname, dreams_fname)
)

## Process

In [5]:
# Keep only the rows belonging to the datasets listed in DATASETS.
# The copy makes later column assignments act on an independent frame.
dreams = dreams.loc[lambda frame: frame["dataset"].isin(DATASETS)].copy()

In [6]:
# Sanity-check the raw subset before deriving any columns from it.

# None of the selected rows is expected to carry a metadata value.
assert dreams["metadata"].isna().all(), "unexpected non-null metadata"

# IDs look like "F01-1": F/M, a two-digit number 00-49, a dash, then a digit 1-8.
# `fullmatch` rejects a trailing newline that `^...$` with `match` would accept,
# and `na=False` makes a missing ID fail here instead of being skipped by `.all()`.
assert (
    dreams["dream_id"].str.fullmatch(r"[FM][0-4][0-9]-[1-8]", na=False).all()
), "dream_id does not follow the expected pattern"

# The last letter of the dataset name must agree with the first letter of the ID.
assert (
    dreams["dataset"].str[-1].eq(dreams["dream_id"].str[0].str.lower()).all()
), "dataset suffix and dream_id prefix disagree"

In [7]:
# Derive the two descriptive columns from the existing identifiers:
#   sex    - spelled out from the final letter of the dataset name
#   author - the first three characters of the dream ID (e.g. "F01")
dreams = dreams.assign(
    sex=dreams["dataset"].str[-1].map({"f": "female", "m": "male"}),
    author=dreams["dream_id"].str[:3],
)

In [8]:
# Shape the table for export: rename the text column, keep only the three
# output columns in their final order, and group rows by author.
dreams = (
    dreams
    .rename(columns={"dream_text": "report"})
    .reindex(columns=["author", "sex", "report"])
    # Each author has several reports, so the sort key has ties. The default
    # quicksort does not guarantee tie order; a stable sort keeps every
    # author's reports in their original relative order, so the exported
    # file (and its hash) is reproducible.
    .sort_values("author", kind="stable")
)

In [9]:
# Minor fix: See https://github.com/remrama/dreambank/issues/6
dreams["report"] = dreams["report"].str.replace(r"\\$", "", regex=True)

In [10]:
# Preview the first five rows of the processed table.
dreams.head(n=5)

Unnamed: 0,author,sex,report
35127,F01,female,"I dreamed I was a the beach with my sister, su..."
35128,F01,female,I dreamed I was married to a very tall and fat...
35129,F01,female,I dreamed about an unknown person who had a sn...
35130,F01,female,I dreamed about a friend and we were going by ...
35131,F01,female,I dreamed I was at a beach and then a group of...


In [11]:
# Summary statistics for each column of the processed table.
dreams.describe()

Unnamed: 0,author,sex,report
count,766,766,766
unique,96,2,766
top,F01,male,"I dreamed I was a the beach with my sister, su..."
freq,8,384,1


In [12]:
# Column dtypes and non-null counts; "deep" asks pandas to measure the real
# memory held by the object (string) columns rather than estimate it.
dreams.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 766 entries, 35127 to 35892
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   author  766 non-null    object
 1   sex     766 non-null    object
 2   report  766 non-null    object
dtypes: object(3)
memory usage: 603.5 KB


## Export

In [13]:
# Final checks on the table that is about to be written.
# Every cell is populated.
assert dreams.notna().all(axis=None)
# No fully duplicated rows, and no report text appears twice.
assert dreams.duplicated().sum() == 0
assert dreams["report"].is_unique
# Sex only takes the two expected labels.
assert dreams["sex"].isin(["female", "male"]).all()
# Reports carry no leading or trailing whitespace.
assert dreams["report"].eq(dreams["report"].str.strip()).all()

In [14]:
# Write the processed table to disk, creating the output directory if needed.
outpath = "./output/urbina1975.csv"
os.makedirs(os.path.dirname(outpath), exist_ok=True)

TO_CSV_KWARGS = dict(
    index=False,
    na_rep="N/A",
    sep=",",
    mode="x",  # Switch to `w` to overwrite existing file
    encoding="utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    lineterminator="\n",
    quoting=2,  # 2 = csv.QUOTE_NONNUMERIC
    quotechar='"',
    doublequote=True,
)
dreams.to_csv(outpath, **TO_CSV_KWARGS)

# Report the written file's name, size, checksums and modification time (UTC)
# so the export can be identified and verified later.
modified_utc = datetime.fromtimestamp(os.path.getmtime(outpath), tz=timezone.utc)
export_summary = {
    "file": os.path.basename(outpath),
    "size": f"{os.path.getsize(outpath) / 1e6} MB",
    "md5": pooch.file_hash(outpath, alg="md5"),
    "sha256": pooch.file_hash(outpath, alg="sha256"),
    "timestamp": modified_utc.isoformat(timespec="seconds"),
}
for label, value in export_summary.items():
    print(f"{label}: {value}")

file: urbina1975.csv
size: 0.506346 MB
md5: 05197362b787f8bef2e7c11fb9b81bf1
sha256: 4367e9c0b09176b920cb241c1658323eae68a7f51b5df8e4dce2fecffc343b3a
timestamp: 2025-12-30T04:11:42+00:00
