# ucsc1996

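Prepare the `ucsc_women` dataset from DreamBank as a standalone CSV file with `author`, `age`, `sex`, and `report` columns. See [GitHub issue #7](https://github.com/remrama/dreambank/issues/7) for background.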

In [1]:
from datetime import datetime, timezone
import os

import pandas as pd
import pooch

In [2]:
# DreamBank dataset identifiers (values of the `dataset` column) handled by this notebook.
DATASETS = [
    "ucsc_women",
]

## Load

In [3]:
# Fetch the dreams table published with the pinned DreamBank GitHub release.
# When developing a new release, use the local file at the bottom of this cell instead.
DREAMBANK_VERSION = "v1.0.0-alpha5"
dreams_kwargs = dict(
    url=f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/dreams.csv.xz",
    known_hash="md5:2dcab92f9d9515df174388babb5c9e5a",
    progressbar=True,
)

dreams_fname = pooch.retrieve(**dreams_kwargs)
# dreams_fname = "../output/dreams.csv.xz"

In [4]:
# Parse the downloaded (xz-compressed) CSV into a DataFrame.
dreams = pd.read_csv(filepath_or_buffer=dreams_fname)

## Process

In [5]:
# Keep only the rows belonging to the dataset(s) processed in this notebook.
dreams = dreams[dreams["dataset"].isin(DATASETS)].copy()
# Fail fast on an empty selection: `.all()` checks are vacuously true on an
# empty frame, so a mistyped dataset name would otherwise go unnoticed.
assert not dreams.empty, f"No rows found for datasets {DATASETS}"

In [6]:
# Verify assumptions about the data.
# `na=False` makes a missing value count as a failed match; otherwise a NaN
# result can be skipped by `.all()` and slip through unnoticed.
assert dreams["metadata"].str.match(r"^[12][8901234]/female$", na=False).all()
assert dreams["dream_id"].str.match(r"^[1-9][0-9]?[a-b]?$", na=False).all()

# Verify that the age and sex metadata are consistent for the a/b person:
# all dreams sharing the same numeric ID prefix must carry identical metadata.
id_prefix = dreams["dream_id"].str.extract(r"^([1-9][0-9]?)[a-b]?$", expand=False)
assert dreams.groupby(id_prefix)["metadata"].nunique().eq(1).all()

# Show the suffixed entries for visual inspection. Select on the letter suffix
# rather than on string length so single-digit IDs (e.g. "5a") are included too.
dreams[dreams["dream_id"].str.contains(r"[a-b]$", na=False)]

Unnamed: 0,dataset,dream_id,metadata,word_count,dream_text
37182,ucsc_women,16a,20/female,124,In part of the dream I was arguing with my now...
37183,ucsc_women,16b,20/female,60,I dreamt that my ex-boyfriend forgave me for b...


In [7]:
# Derive the demographic columns:
#   author - numeric prefix of dream_id with any a/b suffix dropped (kept as string)
#   age    - two-digit number in front of "/female" in metadata, cast to int
#   sex    - constant "female"
dreams = dreams.assign(
    author=dreams["dream_id"].str.extract(r"^([1-9][0-9]?)[a-b]?$", expand=False).astype(str),
    age=dreams["metadata"].str.extract(r"^([12][8901234])/female$", expand=False).astype(int),
    sex="female",
)

In [8]:
# Reduce to the final schema and order the rows by author.
# `author` holds strings, so the ordering is lexicographic ("1", "10", "12", ...).
# A stable sort preserves the existing relative order of reports that share an
# author; the default quicksort gives no such guarantee, which would leave the
# row order (and therefore the exported file's hash) implementation-dependent.
dreams = (
    dreams
    .rename(columns={"dream_text": "report"})
    .reindex(columns=["author", "age", "sex", "report"])
    .sort_values("author", kind="stable")
)

In [9]:
# Preview the first five processed rows.
dreams.head(n=5)

Unnamed: 0,author,age,sex,report
37168,1,18,female,I had this dream at the beginning of the schoo...
37177,10,19,female,It was my first day of class. It was a small c...
37178,12,19,female,"I am at a place with buildings surroundings, o..."
37179,13,19,female,I was watching TV with my boyfriend. I remembe...
37180,14,19,female,"I was in a concert hall of my favorite singer,..."


In [10]:
# Summary statistics for the numeric columns only.
dreams.describe(include=["number"])

Unnamed: 0,age
count,81.0
mean,20.617284
std,1.280311
min,18.0
25%,20.0
50%,21.0
75%,21.0
max,24.0


In [11]:
# Count / unique / top / freq summary for the non-numeric columns.
dreams.describe(exclude=["number"])

Unnamed: 0,author,sex,report
count,81,81,81
unique,80,1,81
top,16,female,I had this dream at the beginning of the schoo...
freq,2,81,1


In [12]:
# Print column dtypes and non-null counts; "deep" requests a full memory-usage
# calculation rather than an estimate.
dreams.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 37168 to 37176
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   author  81 non-null     object
 1   age     81 non-null     int64 
 2   sex     81 non-null     object
 3   report  81 non-null     object
dtypes: int64(1), object(3)
memory usage: 67.3 KB


## Export

In [13]:
# Final sanity checks before writing the file.
# No missing values anywhere in the frame.
assert dreams.notna().all(axis=None)
# No fully duplicated rows, and no report text appearing more than once.
assert not dreams.duplicated().any()
assert not dreams.duplicated(subset="report").any()
# Demographics fall within the expected values for this dataset.
assert (dreams["sex"] == "female").all()
assert ((dreams["age"] >= 18) & (dreams["age"] <= 24)).all()
# Reports carry no leading or trailing whitespace.
assert dreams["report"].eq(dreams["report"].str.strip()).all()

In [14]:
# Destination of the processed dataset; create its parent folder if needed.
outpath = "./output/ucsc1996.csv"
os.makedirs(os.path.dirname(outpath), exist_ok=True)

# CSV serialization options, all spelled out explicitly.
TO_CSV_KWARGS = dict(
    index=False,
    na_rep="N/A",
    sep=",",
    mode="x",  # Switch to `w` to overwrite existing file
    encoding="utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    lineterminator="\n",
    quoting=2,  # 2 = csv.QUOTE_NONNUMERIC
    quotechar='"',
    doublequote=True,
)
dreams.to_csv(outpath, **TO_CSV_KWARGS)

# Report name, size, checksums, and modification time (UTC) of the written file.
print(f"file: {os.path.basename(outpath)}")
print(f"size: {os.path.getsize(outpath) / 1e6} MB")
print(f"md5: {pooch.file_hash(outpath, alg='md5')}")
print(f"sha256: {pooch.file_hash(outpath, alg='sha256')}")
print(f"timestamp: {datetime.fromtimestamp(os.path.getmtime(outpath), tz=timezone.utc).isoformat(timespec='seconds')}")

file: ucsc1996.csv
size: 0.056766 MB
md5: caf94dc6bac916330caeb200e60c5671
sha256: cc10c7e80e99039928bfe29bbb825dd1b3737cc994dcf9eba4650e1b1daa8c15
timestamp: 2025-12-30T18:35:09+00:00
