# Compare available SDDbs

An informal notebook for quickly comparing the different available copies of SDDb.

See the [GitHub Issue](https://github.com/krank-sources/sddb/issues/1) for a full table of versions and their details.

In [None]:
import itertools

import pandas as pd
import pooch

from IPython.display import display

In [2]:
# Local download cache: an "sddb" subfolder inside pooch's OS cache directory.
CACHE_DIR = pooch.os_cache("pooch").joinpath("sddb")

# Every SDDb copy to compare, keyed by a version label. Each entry holds the
# keyword arguments passed to `pooch.retrieve`: the download URL and the
# checksum the downloaded file is verified against.
REGISTRY = {
    # Raw exports stored in the GitHub repository at each tag
    "github_v1-alpha.1": {
        "url": "https://github.com/krank-sources/sddb/raw/refs/tags/v1-alpha.1/raw/dream-export.csv.xz",
        "known_hash": "md5:7a28c8f29584ab3375db2dc2bbc86d6d",
    },
    "github_v1-alpha.2": {
        # Must point at the v1-alpha.2 tag. Pointing at v1-alpha.1 (as before)
        # downloads the same file twice and makes any comparison between the
        # two raw exports vacuous.
        # NOTE(review): the hash is carried over from v1-alpha.1 on the
        # expectation that the raw export is unchanged between tags. pooch
        # raises on a checksum mismatch, so update it if verification fails.
        "url": "https://github.com/krank-sources/sddb/raw/refs/tags/v1-alpha.2/raw/dream-export.csv.xz",
        "known_hash": "md5:7a28c8f29584ab3375db2dc2bbc86d6d",
    },
    # Processed (tab-separated) files attached to the GitHub releases
    "github_v1-alpha.1_processed": {
        "url": "https://github.com/krank-sources/sddb/releases/download/v1-alpha.1/sddb.tsv.xz",
        "known_hash": "md5:f05c3615c978a16dcf0ddb2440c56914",
    },
    "github_v1-alpha.2_processed": {
        "url": "https://github.com/krank-sources/sddb/releases/download/v1-alpha.2/sddb.tsv.xz",
        "known_hash": "md5:082496b09f8973e511e15190bc344ac1",
    },
    # Exports archived on Zenodo
    "zenodo_v1": {
        "url": "https://zenodo.org/records/11662064/files/dream-export.csv?download=1",
        "known_hash": "md5:2252f45157859f2d598023fc84ad436c",
    },
    "zenodo_v2": {
        "url": "https://zenodo.org/records/18076716/files/dream_search_2025-12-28T15_50_30.592Z.csv?download=1",
        "known_hash": "md5:4ecfb8cd2a83eabe75d2b6537ca846b6",
    },
}

## Load different copies

In [3]:
# Fetch every registered copy into the cache directory (downloading only as
# needed) and record the resulting local path under its version label.
filenames = {}
for version, params in REGISTRY.items():
    filenames[version] = pooch.retrieve(path=CACHE_DIR, **params)

In [4]:
# Parse each cached file into a DataFrame, keyed by version label.
# Versions labelled "processed" are read as tab-separated; all others as
# comma-separated.
dataframes = {}
for version, fname in filenames.items():
    delimiter = "\t" if "processed" in version else ","
    dataframes[version] = pd.read_csv(fname, sep=delimiter, low_memory=False)

In [5]:
# Drop copies that are verified duplicates, then confirm the rest all differ.
# Each duplicate check is guarded by a membership test so this cell can be
# re-run safely: an unguarded `del` raises KeyError on the second execution
# because the duplicate has already been removed from `dataframes`.

# The raw GitHub export is expected to be identical at both tags; keep one.
if "github_v1-alpha.2" in dataframes:
    assert dataframes["github_v1-alpha.1"].equals(dataframes["github_v1-alpha.2"]), (
        "Raw GitHub exports differ between v1-alpha.1 and v1-alpha.2"
    )
    del dataframes["github_v1-alpha.2"]

# The v1-alpha.2 processed file is expected to be the v1-alpha.1 processed
# file with its rows re-sorted; keep only the v1-alpha.2 copy.
if "github_v1-alpha.1_processed" in dataframes:
    resorted_alpha1 = (
        dataframes["github_v1-alpha.1_processed"]
        .sort_values(["dataset", "author", "dream"])
        .reset_index(drop=True)
    )
    assert dataframes["github_v1-alpha.2_processed"].equals(resorted_alpha1), (
        "v1-alpha.2 processed file is not a re-sorted copy of v1-alpha.1 processed"
    )
    del dataframes["github_v1-alpha.1_processed"]

# Make sure none of the remaining copies are identical
for x, y in itertools.combinations(dataframes.values(), 2):
    assert not x.equals(y), "DataFrames should not be identical"

In [6]:
# Peek at the first ten column names of the Zenodo v2 export
zenodo_v2_columns = dataframes["zenodo_v2"].columns
zenodo_v2_columns[:10]

Index(['Dream Report ID', 'Title', 'Dream Text', 'Participant ID', 'Gender',
       'Age', 'Word Count', 'Dream Date', 'Survey Name', 'Survey ID'],
      dtype='object')

In [7]:
# Build a summary table with one row per remaining copy: its size, how much
# data is missing, and how long the reports are.

# Name of the column used to count datasets/surveys in each copy
DATASET_COLS = {
    "github_v1-alpha.1": "survey",
    "github_v1-alpha.2_processed": "dataset",
    "zenodo_v1": "survey",
    "zenodo_v2": "Survey Name",
}
# Name of the column holding the report text in each copy
REPORT_COLS = {
    "github_v1-alpha.1": "answer_text",
    "github_v1-alpha.2_processed": "dream",
    "zenodo_v1": "answer_text",
    "zenodo_v2": "Dream Text",
}


def summarize_copy(df, dataset_col, report_col):
    """Return summary statistics for one SDDb copy.

    Parameters
    ----------
    df : pandas.DataFrame
        The copy to summarize.
    dataset_col : str
        Column of ``df`` identifying the dataset/survey of each row.
    report_col : str
        Column of ``df`` holding the report text of each row.

    Returns
    -------
    dict
        Row/column counts, missing-value counts, and word-count statistics,
        keyed by the column names used in the summary table.
    """
    datasets = df[dataset_col]
    reports = df[report_col]
    # Whitespace-delimited word count per report, computed once and reused for
    # all three statistics; rows with a missing report yield NaN here.
    word_counts = reports.str.split().str.len()
    return {
        "n_rows": len(df),
        "n_columns": len(df.columns),
        "n_empty_cols": df.isna().all().sum(),
        "n_datasets": datasets.nunique(),
        "n_nan_datasets": datasets.isna().sum(),
        "n_nan_reports": reports.isna().sum(),
        # Counts exactly-empty strings only. pd.read_csv parses empty fields
        # as NaN by default, so those show up under n_nan_reports instead.
        "n_blank_reports": reports.eq("").sum(),
        "avg_wc": word_counts.mean().round(3),
        "min_wc": word_counts.min(),
        "max_wc": word_counts.max(),
    }


info = pd.DataFrame.from_dict(
    {
        version: summarize_copy(df, DATASET_COLS[version], REPORT_COLS[version])
        for version, df in dataframes.items()
    },
    orient="index",
)

display(info)

Unnamed: 0,n_rows,n_columns,n_empty_cols,n_datasets,n_nan_datasets,n_nan_reports,n_blank_reports,avg_wc,min_wc,max_wc
github_v1-alpha.1,39089,276,0,83,0,1567,0,89.493,1.0,2681.0
github_v1-alpha.2_processed,28570,3,0,60,0,0,0,104.816,1.0,982.0
zenodo_v1,39089,184,0,83,0,1567,0,89.493,1.0,2681.0
zenodo_v2,44556,12,4,85,0,1567,0,104.363,1.0,2681.0


In [None]:
# Print the column names of the three source exports side by side, in file
# order, to see which columns were removed across source versions.
gh = dataframes["github_v1-alpha.1"]
z1 = dataframes["zenodo_v1"]
z2 = dataframes["zenodo_v2"]

header = f"{'github_v1-alpha.1':>40} {'zenodo_v1':>30} {'zenodo_v2':>20}"
print(header)
# zip_longest pads the shorter column lists with a placeholder so every
# printed row has three entries.
for gh_col, z1_col, z2_col in itertools.zip_longest(
    gh.columns, z1.columns, z2.columns, fillvalue="<no column>"
):
    print(f"{gh_col:>40} {z1_col:>30} {z2_col:>20}")

                       github_v1-alpha.1                      zenodo_v1            zenodo_v2
                             answer_text                    answer_text      Dream Report ID
                              word_count                     word_count                Title
                       dream_entry_title              dream_entry_title           Dream Text
                                    date                           date       Participant ID
                              respondent                     respondent               Gender
                                  survey                         survey                  Age
                 Religious Affiliation B              Familiar Settings           Word Count
                       Familiar Settings      Non-Physical Aggression 3           Dream Date
               Non-Physical Aggression 3      Non-Physical Aggression 4          Survey Name
               Non-Physical Aggression 4      Non-Physical Aggression 

In [9]:
# List every column of each remaining copy, one bulleted line per column,
# with a separator rule before each copy's listing.
for version, df in dataframes.items():
    lines = ["=" * 40, "", f"{version} columns:"]
    lines.extend("\t- " + col for col in df.columns)
    lines.append("")
    print("\n".join(lines))


github_v1-alpha.1 columns:
	- answer_text
	- word_count
	- dream_entry_title
	- date
	- respondent
	- survey
	- Religious Affiliation B
	- Familiar Settings
	- Non-Physical Aggression 3
	- Non-Physical Aggression 4
	- Non-Physical Aggression 5
	- Friendliness 3
	- Friendliness 4
	- Friendliness 5
	- Physical Aggression 2
	- Physical Aggression 4
	- Sexuality 5
	- Outside Settings
	- Non-Physical Aggression 1
	- Inside Settings
	- Non-Physical Aggression 2
	- Friendliness 1
	- Friendliness 2
	- Physical Aggression 1
	- Physical Aggression 3
	- Physical Aggression 5
	- Sexuality 1
	- Sexuality 2
	- Sexuality 3
	- Sexuality 4
	- Animals
	- Children
	- Creatures
	- Dead Characters
	- Family Characters
	- Female Characters
	- Friends
	- Imaginary Characters
	- Male Characters
	- Metamorphosis
	- Occupational Characters
	- Prominent Characters
	- Racial/Ethnic Characters
	- Strangers
	- Total Characters
	- Dream Location
	- Anger
	- Fear
	- Happiness
	- Sadness
	- Wonder
	- Good Fortunes
	-