+{html_content}
+
+"""
+
+ out_path.write_text(styled_html, encoding="utf-8")
+ return out_path
+
+
+def _read_dataset(path: Path) -> pd.DataFrame:
+ return pd.read_csv(path, index_col=False, header=0, skipinitialspace=True)
+
+
+def main() -> None:
+ import argparse
+
+ parser = argparse.ArgumentParser(description="Generate the dataset bubble chart.")
+ parser.add_argument("source", type=Path, help="Path to dataset summary CSV")
+ parser.add_argument(
+ "--output",
+ type=Path,
+ default=Path("dataset_bubble.html"),
+ help="Output HTML file",
+ )
+ parser.add_argument(
+ "--x-axis",
+ choices=["records", "duration_h", "size_gb", "tasks", "subjects"],
+ default="records",
+ help="Field for the bubble chart x-axis",
+ )
+ args = parser.parse_args()
+
+ df = _read_dataset(args.source)
+ output_path = generate_dataset_bubble(df, args.output, x_var=args.x_axis)
+ print(f"Bubble chart saved to {output_path.resolve()}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs/plot_dataset/colours.py b/docs/plot_dataset/colours.py
new file mode 100644
index 00000000..7d2d50ae
--- /dev/null
+++ b/docs/plot_dataset/colours.py
@@ -0,0 +1,98 @@
+"""Shared colour maps and helpers for the dataset summary plots."""
+
+# Color mappings consistent with prepare_summary_tables.py and custom.css
+PATHOLOGY_COLOR_MAP = {
+ "Healthy": "#22c55e", # green
+ "Clinical": "#f87171", # Lighter red to match table
+ "Unknown": "#94a3b8", # grey
+}
+
+MODALITY_COLOR_MAP = {
+ "Visual": "#2563eb",
+ "Auditory": "#0ea5e9",
+ "Tactile": "#10b981",
+ "Somatosensory": "#10b981",
+ "Multisensory": "#ec4899",
+ "Motor": "#f59e0b",
+ "Resting State": "#6366f1",
+ "Rest": "#6366f1",
+ "Sleep": "#7c3aed",
+ "Other": "#14b8a6",
+ "Unknown": "#94a3b8",
+}
+
+TYPE_COLOR_MAP = {
+ "Perception": "#3b82f6",
+ "Decision-making": "#eab308",
+ "Rest": "#16a34a",
+ "Resting-state": "#16a34a",
+ "Sleep": "#8b5cf6",
+ "Cognitive": "#6366f1",
+ "Clinical": "#f87171", # Lighter red to match table
+ "Memory": "#c4b5fd", # Lighter purple to match table
+ "Attention": "#c4b5fd", # Lighter purple to match table
+ "Intervention": "#c4b5fd", # Lighter purple to match table
+ "Learning": "#c4b5fd", # Lighter purple to match table
+ "Other": "#c4b5fd", # Lighter purple to match table
+ "Unknown": "#94a3b8",
+}
+
+# Canonical mappings to normalize values
+CANONICAL_MAP = {
+ "Type Subject": {
+ "healthy controls": "Healthy",
+ "healthy": "Healthy",
+ "control": "Healthy",
+ "clinical": "Clinical",
+ "patient": "Clinical",
+ },
+ "modality of exp": {
+ "visual": "Visual",
+ "auditory": "Auditory",
+ "tactile": "Tactile",
+ "somatosensory": "Tactile",
+ "multisensory": "Multisensory",
+ "motor": "Motor",
+ "rest": "Resting State",
+ "resting state": "Resting State",
+ "resting-state": "Resting State",
+ "sleep": "Sleep",
+ "other": "Other",
+ },
+ "type of exp": {
+ "perception": "Perception",
+ "decision making": "Decision-making",
+ "decision-making": "Decision-making",
+ "rest": "Rest",
+ "resting state": "Resting-state",
+ "resting-state": "Resting-state",
+ "sleep": "Sleep",
+ "cognitive": "Cognitive",
+ "clinical": "Clinical",
+ "other": "Other",
+ },
+}
+
+# Map column names to their color maps
+COLUMN_COLOR_MAPS = {
+ "Type Subject": PATHOLOGY_COLOR_MAP,
+ "modality of exp": MODALITY_COLOR_MAP,
+ "type of exp": TYPE_COLOR_MAP,
+}
+
+
+def hex_to_rgba(hex_color: str, alpha: float = 0.2) -> str:
+ """Convert hex color to rgba with given alpha."""
+ if not isinstance(hex_color, str) or not hex_color.startswith("#"):
+ # This is not a valid hex color, return a default color
+ return "rgba(148, 163, 184, 0.2)" # Default grey
+ hex_color = hex_color.lstrip("#")
+ if len(hex_color) != 6:
+ return "rgba(148, 163, 184, 0.2)" # Default grey for invalid length
+ try:
+ r = int(hex_color[0:2], 16)
+ g = int(hex_color[2:4], 16)
+ b = int(hex_color[4:6], 16)
+ except ValueError:
+ return "rgba(148, 163, 184, 0.2)" # Default grey for conversion error
+ return f"rgba({r}, {g}, {b}, {alpha})"
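+
+
+if __name__ == "__main__":  # pragma: no cover - illustrative self-check
+    # A known colour converts to its rgba form; invalid input falls back to grey.
+    assert hex_to_rgba("#22c55e", 0.5) == "rgba(34, 197, 94, 0.5)"
+    assert hex_to_rgba("not-a-colour") == "rgba(148, 163, 184, 0.2)"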
diff --git a/docs/plot_dataset/plot_sankey.py b/docs/plot_dataset/plot_sankey.py
new file mode 100644
index 00000000..fb41a3a8
--- /dev/null
+++ b/docs/plot_dataset/plot_sankey.py
@@ -0,0 +1,352 @@
+"""Generate a Sankey diagram from the EEG-Dash dataset summary.
+
+The script loads ``eegdash/dataset/dataset_summary.csv`` (by default) and builds
+an interactive Plotly Sankey diagram connecting three categorical columns. This
+mirrors how the documentation summarises datasets across subject type, modality,
+and experiment type, but can be reused with any trio of categorical columns via
+CLI arguments.
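+
+Example::
+
+    python docs/plot_dataset/plot_sankey.py \
+        --source eegdash/dataset/dataset_summary.csv \
+        --output dataset_summary_sankey.html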
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from typing import Sequence
+
+import pandas as pd
+import plotly.graph_objects as go
+
+try: # Support execution as a script or as a package module
+ from .colours import CANONICAL_MAP, COLUMN_COLOR_MAPS, hex_to_rgba
+except ImportError: # pragma: no cover - fallback for direct script execution
+ from colours import CANONICAL_MAP, COLUMN_COLOR_MAPS, hex_to_rgba
+
+DEFAULT_COLUMNS = ["Type Subject", "modality of exp", "type of exp"]
+__all__ = ["generate_dataset_sankey", "build_sankey"]
+
+
+def _prepare_dataframe(df: pd.DataFrame, columns: Sequence[str]) -> pd.DataFrame:
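+    """Clean and normalise the categorical *columns*, exploding multi-valued cells.
+
+    Illustrative example: a row whose ``modality of exp`` is
+    ``"visual/auditory"`` becomes two rows after the explode step, one per
+    modality, each keeping the original ``n_subjects`` count.
+    """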
+ all_columns = list(columns)
+ if "n_subjects" not in all_columns:
+ all_columns.append("n_subjects")
+
+ missing = [col for col in all_columns if col not in df.columns]
+ if missing:
+ msg = f"Columns not found in dataframe: {missing}"
+ raise ValueError(msg)
+
+ cleaned = df.copy()
+
+ # Fill missing n_subjects with 1 (to count as at least one dataset)
+ # and ensure the column is numeric integer type.
+ cleaned["n_subjects"] = (
+ pd.to_numeric(cleaned["n_subjects"], errors="coerce").fillna(1).astype(int)
+ )
+
+ # Process each column for cleaning and normalization
+ for col in columns:
+ # 1. Fill original NaN values with the string 'Unknown'
+ cleaned[col] = cleaned[col].fillna("Unknown")
+
+ # 2. Split multi-valued cells
+ cleaned[col] = cleaned[col].astype(str).str.split(r"/|;|,", regex=True)
+ cleaned = cleaned.explode(col)
+
+ # 3. Clean up whitespace and any empty strings created by splitting
+ cleaned[col] = cleaned[col].str.strip()
+ cleaned[col] = cleaned[col].replace(["", "nan"], "Unknown")
+
+ # 4. Apply canonical mapping to standardize terms
+ if col in CANONICAL_MAP:
+ mapping = CANONICAL_MAP[col]
+ # Use .str.lower() for case-insensitive mapping
+ cleaned[col] = cleaned[col].str.lower().map(mapping).fillna(cleaned[col])
+
+    # Note: 'Type Subject' values other than Healthy/Unknown keep their
+    # original labels; the colouring logic downstream renders them with the
+    # Clinical colour instead of relabelling them here.
+
+ return cleaned[all_columns]
+
+
+def _load_dataframe(path: Path, columns: Sequence[str]) -> pd.DataFrame:
+ df = pd.read_csv(
+ path,
+ index_col=False,
+ header=0,
+ skipinitialspace=True,
+ )
+ return _prepare_dataframe(df, columns)
+
+
+def _build_sankey_data(df: pd.DataFrame, columns: Sequence[str]):
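+    """Assemble the node and link arrays consumed by ``go.Sankey``.
+
+    Returns a tuple ``(node_labels, node_colors, sources, targets, values,
+    link_colors, link_hover_labels)`` where ``values`` weights each link by
+    the summed subject counts of the connected categories.
+    """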
+ node_labels: list[str] = []
+ node_colors: list[str] = []
+ node_index: dict[tuple[str, str], int] = {}
+
+ for col in columns:
+ color_map = COLUMN_COLOR_MAPS.get(col, {})
+
+ # Sort unique values to ensure "Unknown" appears at the bottom
+ all_unique = df[col].unique()
+ # Separate "Unknown" and sort the rest alphabetically
+ known_values = sorted([v for v in all_unique if v != "Unknown"])
+ unique_values = known_values
+ # Add "Unknown" to the end if it exists
+ if "Unknown" in all_unique:
+ unique_values.append("Unknown")
+
+ for val in unique_values:
+ if (col, val) not in node_index:
+ node_index[(col, val)] = len(node_labels)
+ node_labels.append(val)
+
+ # Use "Clinical" color for specific pathologies
+ node_color = color_map.get(val, "#94a3b8")
+ if col == "Type Subject" and val not in ["Healthy", "Unknown"]:
+ node_color = color_map.get("Clinical", "#94a3b8")
+ node_colors.append(node_color)
+
+ sources: list[int] = []
+ targets: list[int] = []
+ values: list[int] = []
+ link_colors: list[str] = []
+ link_hover_labels: list[str] = []
+
+ for idx in range(len(columns) - 1):
+ col_from, col_to = columns[idx], columns[idx + 1]
+
+ # Use the color from the source node for the link
+ source_color_map = COLUMN_COLOR_MAPS.get(col_from, {})
+
+ # Group by source and target, getting both sum of subjects and count of datasets
+ grouped = (
+ df.groupby([col_from, col_to])
+ .agg(
+ subject_sum=("n_subjects", "sum"),
+ dataset_count=("n_subjects", "size"),
+ )
+ .reset_index()
+ )
+
+ for _, row in grouped.iterrows():
+ source_val, target_val, subject_sum, dataset_count = (
+ row[col_from],
+ row[col_to],
+ row["subject_sum"],
+ row["dataset_count"],
+ )
+
+ source_node_idx = node_index.get((col_from, source_val))
+ target_node_idx = node_index.get((col_to, target_val))
+
+ if source_node_idx is not None and target_node_idx is not None:
+ sources.append(source_node_idx)
+ targets.append(target_node_idx)
+ values.append(subject_sum) # Weight links by sum of subjects
+ link_hover_labels.append(
+ f"{source_val} → {target_val}: "
+ f"{subject_sum} subjects in {dataset_count} datasets"
+ )
+
+ # Assign color to the link based on the source node
+ source_color = source_color_map.get(source_val, "#94a3b8")
+ if col_from == "Type Subject" and source_val not in [
+ "Healthy",
+ "Unknown",
+ ]:
+ source_color = source_color_map.get("Clinical", "#94a3b8")
+ link_colors.append(hex_to_rgba(source_color))
+
+ # Add counts (subjects and datasets) and percentages to the first column labels
+ first_col_name = columns[0]
+ first_col_stats = df.groupby(first_col_name).agg(
+ subject_sum=("n_subjects", "sum"),
+ dataset_count=("n_subjects", "size"),
+ )
+ total_subjects = first_col_stats["subject_sum"].sum()
+
+ for i, label in enumerate(node_labels):
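+        # Reverse-lookup the (column, value) pair that owns node index ``i``.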
+ col, val = next((k for k, v in node_index.items() if v == i), (None, None))
+ if col == first_col_name and val in first_col_stats.index:
+ stats = first_col_stats.loc[val]
+ subject_sum = stats["subject_sum"]
+ dataset_count = stats["dataset_count"]
+ percentage = (
+ (subject_sum / total_subjects) * 100 if total_subjects > 0 else 0
+ )
+ node_labels[i] = (
+ f"{label} ({subject_sum} subjects, {dataset_count} datasets, {percentage:.1f}%)"
+ )
+
+ return (
+ node_labels,
+ node_colors,
+ sources,
+ targets,
+ values,
+ link_colors,
+ link_hover_labels,
+ )
+
+
+def build_sankey(df: pd.DataFrame, columns: Sequence[str]) -> go.Figure:
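+    """Build the three-stage Sankey figure from an already prepared dataframe."""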
+ (
+ labels,
+ colors,
+ sources,
+ targets,
+ values,
+ link_colors,
+ link_hover_labels,
+ ) = _build_sankey_data(df, columns)
+
+ sankey = go.Sankey(
+ arrangement="snap",
+ node=dict(
+ pad=30,
+ thickness=18,
+ label=labels,
+ color=colors,
+ align="left", # Align all labels to the left of the node bars
+ ),
+ link=dict(
+ source=sources,
+ target=targets,
+ value=values,
+ color=link_colors,
+ hovertemplate="%{customdata}",
+ customdata=link_hover_labels,
+ ),
+ )
+
+ fig = go.Figure(sankey)
+
+ fig.update_layout(
+ font=dict(size=14),
+ height=900,
+ width=None,
+ autosize=True,
+ margin=dict(t=40, b=40, l=40, r=40),
+ annotations=[
+ dict(
+ x=0,
+ y=1.05,
+ xref="paper",
+ yref="paper",
+ text="Population Type",
+ showarrow=False,
+ font=dict(size=16, color="black"),
+ ),
+ dict(
+ x=0.5,
+ y=1.05,
+ xref="paper",
+ yref="paper",
+ text="Experimental Modality",
+ showarrow=False,
+ font=dict(size=16, color="black"),
+ ),
+ dict(
+ x=1,
+ y=1.05,
+ xref="paper",
+ yref="paper",
+ text="Cognitive Domain",
+ showarrow=False,
+ font=dict(size=16, color="black"),
+ ),
+ dict(
+ x=0,
+ y=-0.15, # Position the note below the chart
+ xref="paper",
+ yref="paper",
+                text='Note on the "Unknown" category: this large segment represents datasets still pending categorization.',
+ showarrow=False,
+ align="left",
+ xanchor="left",
+ font=dict(size=12, color="dimgray"),
+ ),
+ ],
+ )
+ return fig
+
+
+def generate_dataset_sankey(
+ df: pd.DataFrame,
+ out_html: str | Path,
+ *,
+ columns: Sequence[str] | None = None,
+) -> Path:
+ """Generate the dataset Sankey diagram and write it to *out_html*."""
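+    # Example (illustrative paths):
+    #   generate_dataset_sankey(pd.read_csv("dataset_summary.csv"), "sankey.html")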
+ selected_columns = list(columns) if columns is not None else list(DEFAULT_COLUMNS)
+ prepared = _prepare_dataframe(df, selected_columns)
+ fig = build_sankey(prepared, selected_columns)
+
+ out_path = Path(out_html)
+ out_path.parent.mkdir(parents=True, exist_ok=True)
+
+ html_content = fig.to_html(
+ full_html=False,
+ include_plotlyjs=False,
+ div_id="dataset-sankey",
+ config={
+ "responsive": True,
+ "displaylogo": False,
+ "modeBarButtonsToRemove": ["lasso2d", "select2d"],
+ },
+ )
+
+ out_path.write_text(html_content, encoding="utf-8")
+ return out_path
+
+
+def parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ description="Generate a Sankey diagram from the dataset summary CSV."
+ )
+ parser.add_argument(
+ "--source",
+ type=Path,
+ default=Path("eegdash/dataset/dataset_summary.csv"),
+ help="Path to the dataset summary CSV file.",
+ )
+ parser.add_argument(
+ "--columns",
+ nargs=3,
+ metavar=("FIRST", "SECOND", "THIRD"),
+ default=DEFAULT_COLUMNS,
+ help="Three categorical columns to connect in the Sankey plot.",
+ )
+ parser.add_argument(
+ "--output",
+ type=Path,
+ default=Path("dataset_summary_sankey.html"),
+ help="Output HTML file for the interactive Sankey diagram.",
+ )
+ return parser.parse_args()
+
+
+def main() -> None:
+ args = parse_args()
+ if not args.source.exists():
+ raise FileNotFoundError(f"Dataset summary CSV not found at {args.source}")
+
+ columns = list(args.columns)
+ df = _load_dataframe(args.source, columns)
+ fig = build_sankey(df, columns)
+
+ args.output.parent.mkdir(parents=True, exist_ok=True)
+ fig.write_html(
+ str(args.output),
+ include_plotlyjs="cdn",
+ full_html=True,
+ auto_open=False,
+ )
+ print(f"Sankey diagram saved to {args.output.resolve()}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs/plot_dataset/ridgeline.py b/docs/plot_dataset/ridgeline.py
new file mode 100644
index 00000000..34d5a83f
--- /dev/null
+++ b/docs/plot_dataset/ridgeline.py
@@ -0,0 +1,331 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from plotly.utils import PlotlyJSONEncoder
+from scipy.stats import gaussian_kde
+
+try: # Allow execution as a script or module
+ from .colours import MODALITY_COLOR_MAP, hex_to_rgba
+ from .utils import get_dataset_url, primary_modality
+except ImportError: # pragma: no cover - fallback for direct script execution
+ from colours import MODALITY_COLOR_MAP, hex_to_rgba # type: ignore
+ from utils import get_dataset_url, primary_modality # type: ignore
+
+__all__ = ["generate_modality_ridgeline"]
+
+
+def generate_modality_ridgeline(
+ df: pd.DataFrame,
+ out_html: str | Path,
+ *,
+ rng_seed: int = 42,
+) -> Path | None:
+ """Generate a ridgeline (KDE) plot showing participants per modality."""
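+    # The KDE is computed on log10(n_subjects) so that long-tailed participant
+    # counts spread evenly along the log-scaled x-axis; each density is then
+    # normalised to a common peak height before stacking.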
+ data = df[df["dataset"].str.lower() != "test"].copy()
+ data["modality_label"] = data["modality of exp"].apply(primary_modality)
+ data["n_subjects"] = pd.to_numeric(data["n_subjects"], errors="coerce")
+ data = data.dropna(subset=["n_subjects"])
+ data = data[data["modality_label"] != "Other"]
+
+ if data.empty:
+ return None
+
+ median_participants = (
+ data.groupby("modality_label")["n_subjects"].median().sort_values()
+ )
+ order = [
+ label
+ for label in median_participants.index
+ if label in data["modality_label"].unique()
+ ]
+ if not order:
+ return None
+
+ fig = go.Figure()
+ rng = np.random.default_rng(rng_seed)
+ amplitude = 0.6
+ row_spacing = 0.95
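+    # Each ridge peaks ``amplitude`` above its baseline; baselines sit
+    # ``row_spacing`` apart, so neighbouring curves never overlap (0.6 < 0.95).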
+
+ for idx, label in enumerate(order):
+ subset = data[data["modality_label"] == label].copy()
+ values = subset["n_subjects"].astype(float).dropna()
+ if len(values) < 3:
+ continue
+
+ subset["dataset_url"] = subset["dataset"].apply(get_dataset_url)
+ log_vals = np.log10(values)
+ grid = np.linspace(log_vals.min() - 0.25, log_vals.max() + 0.25, 240)
+ kde = gaussian_kde(log_vals)
+ density = kde(grid)
+ if density.max() <= 0:
+ continue
+
+ density_norm = density / density.max()
+ baseline = idx * row_spacing
+ y_curve = baseline + density_norm * amplitude
+ x_curve = 10**grid
+
+ color = MODALITY_COLOR_MAP.get(label, "#6b7280")
+ fill = hex_to_rgba(color, 0.28)
+
+ fig.add_trace(
+ go.Scatter(
+ x=np.concatenate([x_curve, x_curve[::-1]]),
+ y=np.concatenate([y_curve, np.full_like(y_curve, baseline)]),
+ name=label,
+ fill="toself",
+ fillcolor=fill,
+ line=dict(color="rgba(0,0,0,0)"),
+ hoverinfo="skip",
+ showlegend=False,
+ )
+ )
+
+ fig.add_trace(
+ go.Scatter(
+ x=x_curve,
+ y=y_curve,
+ mode="lines",
+ name=label,
+ line=dict(color=color, width=2),
+                hovertemplate=f"{label}<br>#Participants: %{{x:.0f}}",
+ showlegend=False,
+ )
+ )
+
+ jitter = rng.uniform(0.02, amplitude * 0.5, size=len(values))
+ median_val = float(median_participants.get(label, np.nan))
+ custom_data = np.column_stack(
+ [subset["dataset"].to_numpy(), subset["dataset_url"].to_numpy()]
+ )
+ fig.add_trace(
+ go.Scatter(
+ x=values,
+ y=np.full_like(values, baseline) + jitter,
+ mode="markers",
+ name=label,
+ marker=dict(color=color, size=8, opacity=0.6),
+ customdata=custom_data,
+                hovertemplate="%{customdata[0]}<br>#Participants: %{x}<br>Click to view dataset details",
+ showlegend=False,
+ )
+ )
+
+ if np.isfinite(median_val) and median_val > 0:
+ fig.add_trace(
+ go.Scatter(
+ x=[median_val, median_val],
+ y=[baseline, baseline + amplitude],
+ mode="lines",
+ line=dict(color=color, width=2, dash="dash"),
+ hovertemplate=(
+                        f"{label}<br>Median participants: {median_val:.0f}"
+ ),
+ showlegend=False,
+ )
+ )
+
+ if not fig.data:
+ return None
+
+ kde_height = max(650, 150 * len(order))
+ date_stamp = datetime.now().strftime("%d/%m/%Y")
+ fig.update_layout(
+ height=kde_height,
+ width=1200,
+ template="plotly_white",
+ xaxis=dict(
+ type="log",
+ title=dict(text="Number of Participants (Log Scale)", font=dict(size=18)),
+ showgrid=True,
+ gridcolor="rgba(0,0,0,0.08)",
+ zeroline=False,
+ dtick=1,
+ minor=dict(showgrid=True, gridcolor="rgba(0,0,0,0.04)"),
+ tickfont=dict(size=14),
+ ),
+ yaxis=dict(
+ title=dict(text="Modality", font=dict(size=18)),
+ tickmode="array",
+ tickvals=[idx * row_spacing for idx in range(len(order))],
+ ticktext=order,
+ showgrid=False,
+ range=[-0.25, max(0.35, (len(order) - 1) * row_spacing + amplitude + 0.25)],
+ tickfont=dict(size=14),
+ ),
+ showlegend=False,
+ margin=dict(l=120, r=40, t=108, b=80),
+ title=dict(
+        text=f"Based on EEG-DaSh datasets available as of {date_stamp}.",
+ x=0.5,
+ xanchor="center",
+ y=0.98,
+ yanchor="top",
+ font=dict(size=20),
+ ),
+ autosize=True,
+ font=dict(size=16),
+ )
+
+ fig.add_annotation(
+ xref="paper",
+ yref="paper",
+ x=0.98,
+ y=0.02,
+ text="Visual studies consistently use the largest sample sizes, typically 20-30 participants",
+ showarrow=False,
+ font=dict(size=14, color="#111827"),
+ bgcolor="rgba(255,255,255,0.9)",
+ bordercolor="rgba(17,24,39,0.3)",
+ borderwidth=1,
+ borderpad=8,
+ xanchor="right",
+ yanchor="bottom",
+ )
+
+ plot_config = {
+ "responsive": True,
+ "displaylogo": False,
+ "modeBarButtonsToRemove": ["lasso2d", "select2d"],
+ "toImageButtonOptions": {
+ "format": "png",
+ "filename": "participant_kde",
+ "height": kde_height,
+ "width": 1200,
+ "scale": 2,
+ },
+ }
+
+ fig_spec = fig.to_plotly_json()
+ data_json = json.dumps(fig_spec.get("data", []), cls=PlotlyJSONEncoder)
+ layout_json = json.dumps(fig_spec.get("layout", {}), cls=PlotlyJSONEncoder)
+ config_json = json.dumps(plot_config, cls=PlotlyJSONEncoder)
+
+    # Minimal HTML shell (assumed wrapper and element id): the placeholder
+    # text is replaced once Plotly renders the figure from the JSON payloads.
+    styled_html = f"""
+<div id="participant-kde">Loading participant distribution...</div>
+<script>
+  Plotly.newPlot("participant-kde", {data_json}, {layout_json}, {config_json});
+</script>
+"""
+
+ out_path = Path(out_html)
+ out_path.parent.mkdir(parents=True, exist_ok=True)
+ out_path.write_text(styled_html, encoding="utf-8")
+ return out_path
+
+
+def _read_dataset(path: Path) -> pd.DataFrame:
+ return pd.read_csv(path, index_col=False, header=0, skipinitialspace=True)
+
+
+def main() -> None:
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description="Generate the modality ridgeline plot from a dataset summary CSV."
+ )
+ parser.add_argument("source", type=Path, help="Path to dataset summary CSV")
+ parser.add_argument(
+ "--output",
+ type=Path,
+ default=Path("dataset_kde_modalities.html"),
+ help="Output HTML file",
+ )
+ parser.add_argument(
+ "--seed",
+ type=int,
+ default=42,
+ help="Random seed controlling jitter placement",
+ )
+ args = parser.parse_args()
+
+ df = _read_dataset(args.source)
+ output_path = generate_modality_ridgeline(df, args.output, rng_seed=args.seed)
+ if output_path is None:
+ print("Ridgeline plot could not be generated (insufficient data).")
+ else:
+ print(f"Ridgeline plot saved to {output_path.resolve()}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs/plot_dataset/utils.py b/docs/plot_dataset/utils.py
new file mode 100644
index 00000000..2a518d69
--- /dev/null
+++ b/docs/plot_dataset/utils.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+
+try: # Allow import both as package and script
+ from .colours import CANONICAL_MAP, MODALITY_COLOR_MAP
+except ImportError: # pragma: no cover - fallback for direct script execution
+ from colours import CANONICAL_MAP, MODALITY_COLOR_MAP # type: ignore
+
+__all__ = [
+    "ensure_directory",
+    "get_dataset_url",
+    "human_readable_size",
+    "primary_modality",
+    "safe_int",
+]
+
+_SEPARATORS = ("/", "|", ";")
+
+
+def primary_modality(value: Any) -> str:
+ """Return the canonical modality label for a record."""
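+    # e.g. "visual/auditory" -> "Visual" (the first token wins); values not in
+    # the canonical map or colour map fall back to "Other".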
+ if value is None:
+ return "Unknown"
+ if isinstance(value, float) and pd.isna(value):
+ return "Unknown"
+
+ text = str(value).strip()
+ if not text:
+ return "Unknown"
+
+ # normalise separators, keep order of appearance
+ for sep in _SEPARATORS:
+ text = text.replace(sep, ",")
+ tokens = [tok.strip() for tok in text.split(",") if tok.strip()]
+ if not tokens:
+ return "Unknown"
+
+ first = tokens[0]
+ canonical_map = CANONICAL_MAP.get("modality of exp", {})
+ lowered = first.lower()
+ canonical = canonical_map.get(lowered)
+ if canonical:
+ return canonical
+
+ if first in MODALITY_COLOR_MAP:
+ return first
+
+ title_variant = first.title()
+ if title_variant in MODALITY_COLOR_MAP:
+ return title_variant
+
+ return "Other"
+
+
+def safe_int(value: Any, default: int | None = None) -> int | None:
+ """Convert *value* to ``int`` when possible; otherwise return *default*."""
+ try:
+ if value is None or (isinstance(value, float) and pd.isna(value)):
+ return default
+ return int(round(float(value)))
+ except Exception:
+ return default
+
+
+def human_readable_size(num_bytes: int | float | None) -> str:
+ """Format bytes using the closest unit among MB, GB, TB (fallback to KB/B)."""
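+    # e.g. human_readable_size(1536) -> "2 KB";
+    #      human_readable_size(5 * 1024**3) -> "5.00 GB"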
+ if num_bytes is None:
+ return "0 B"
+
+ try:
+ size = float(num_bytes)
+ except Exception:
+ return "0 B"
+
+ units = [
+ (1024**4, "TB"),
+ (1024**3, "GB"),
+ (1024**2, "MB"),
+ (1024**1, "KB"),
+ (1, "B"),
+ ]
+
+ for factor, unit in units:
+ if size >= factor:
+ value = size / factor
+ if unit in {"B", "KB"}:
+ return f"{int(round(value))} {unit}"
+ return f"{value:.2f} {unit}"
+ return "0 B"
+
+
+def get_dataset_url(name: str) -> str:
+ """Generate dataset URL for plots (relative to dataset summary page)."""
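+    # e.g. get_dataset_url("ds002718") -> "api/dataset/eegdash.dataset.DS002718.html"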
+ if name is None or (isinstance(name, float) and pd.isna(name)):
+ return ""
+ text = str(name).strip()
+ if not text:
+ return ""
+ return f"api/dataset/eegdash.dataset.{text.upper()}.html"
+
+
+def ensure_directory(path: str | Path) -> Path:
+ """Create *path* directory if required and return ``Path`` instance."""
+ dest = Path(path)
+ dest.mkdir(parents=True, exist_ok=True)
+ return dest
diff --git a/docs/prepare_summary_tables.py b/docs/prepare_summary_tables.py
index fd904296..033824f3 100644
--- a/docs/prepare_summary_tables.py
+++ b/docs/prepare_summary_tables.py
@@ -1,384 +1,32 @@
import glob
-import json
+import textwrap
from argparse import ArgumentParser
+from datetime import datetime
from pathlib import Path
from shutil import copyfile
import numpy as np
import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
-from plotly.utils import PlotlyJSONEncoder
-from scipy.stats import gaussian_kde
+from plot_dataset import (
+ generate_dataset_bubble,
+ generate_dataset_sankey,
+ generate_modality_ridgeline,
+)
+from plot_dataset.utils import get_dataset_url, human_readable_size
from table_tag_utils import wrap_tags
DOCS_DIR = Path(__file__).resolve().parent
STATIC_DATASET_DIR = DOCS_DIR / "source" / "_static" / "dataset_generated"
-MODALITY_CANONICAL = {
- "visual": "Visual",
- "auditory": "Auditory",
- "tactile": "Tactile",
- "somatosensory": "Tactile",
- "multisensory": "Multisensory",
- "motor": "Motor",
- "rest": "Resting State",
- "resting state": "Resting State",
- "resting-state": "Resting State",
- "sleep": "Sleep",
- "other": "Other",
-}
-
-MODALITY_COLOR_MAP = {
- "Visual": "#2563eb",
- "Auditory": "#0ea5e9",
- "Tactile": "#10b981",
- "Multisensory": "#ec4899",
- "Motor": "#f59e0b",
- "Resting State": "#6366f1",
- "Sleep": "#7c3aed",
- "Other": "#14b8a6",
- "Unknown": "#94a3b8",
-}
-
-
-def _hex_to_rgba(hex_color: str, alpha: float = 0.4) -> str:
- hex_color = hex_color.lstrip("#")
- if len(hex_color) != 6:
- return f"rgba(99, 102, 241, {alpha})"
- r = int(hex_color[0:2], 16)
- g = int(hex_color[2:4], 16)
- b = int(hex_color[4:6], 16)
- return f"rgba({r}, {g}, {b}, {alpha})"
-
-
-def _primary_modality(value: object) -> str:
- if value is None:
- return "Unknown"
- if isinstance(value, float) and pd.isna(value):
- return "Unknown"
- text = str(value).strip()
- if not text:
- return "Unknown"
- for sep in ("/", "|", ";"):
- text = text.replace(sep, ",")
- tokens = [tok.strip() for tok in text.split(",") if tok.strip()]
- if not tokens:
- return "Unknown"
- raw = tokens[0].lower()
- canonical = MODALITY_CANONICAL.get(raw)
- if canonical:
- return canonical
- candidate = tokens[0].strip()
- title_candidate = candidate.title()
- if title_candidate in MODALITY_COLOR_MAP:
- return title_candidate
- return "Other"
-
-
-def _to_numeric_median_list(val) -> float | None:
- """Return a numeric value from possible list-like strings.
-
- Examples
- --------
- - "64" -> 64
- - "6,129" -> median -> 67.5 -> 68
- - "128, 512" -> 320
- - 500.0 -> 500
-
- """
- if pd.isna(val):
- return None
- try:
- # already numeric
- return float(val)
- except Exception:
- pass
- s = str(val).strip().strip("[]")
- if not s:
- return None
- try:
- nums = [float(x) for x in s.split(",") if str(x).strip()]
- if not nums:
- return None
- return float(np.median(nums))
- except Exception:
- return None
-
-
-def _safe_int(x, default=None):
- try:
- if x is None or pd.isna(x):
- return default
- return int(round(float(x)))
- except Exception:
- return default
-
-
-def gen_datasets_bubble(
- df: pd.DataFrame,
- out_html: str = "_static/dataset/dataset_bubble.html",
- x_var: str = "records", # one of: 'records', 'duration_h', 'size_gb', 'tasks'
-):
- """Generate an interactive bubble chart for datasets.
-
- - x: total duration (hours)
- - y: number of subjects
- - size: on-disk size (GB)
- - color: dataset modality
- """
- d = df.copy()
- d = d[d["dataset"].str.lower() != "test"]
-
- # numeric columns
- d["duration_h"] = pd.to_numeric(d.get("duration_hours_total"), errors="coerce")
- d["subjects"] = pd.to_numeric(d.get("n_subjects"), errors="coerce")
- d["records"] = pd.to_numeric(d.get("n_records"), errors="coerce")
- d["tasks"] = pd.to_numeric(d.get("n_tasks"), errors="coerce")
- d["size_bytes"] = pd.to_numeric(d.get("size_bytes"), errors="coerce")
-
- # parse sampling and channels into representative numeric values
- d["sfreq"] = d["sampling_freqs"].map(_to_numeric_median_list)
- d["nchans"] = d["nchans_set"].map(_to_numeric_median_list)
-
- d["modality_label"] = d.get("modality of exp").apply(_primary_modality)
-
- # disk size in GB for sizing
- GB = 1024**3
- d["size_gb"] = d["size_bytes"] / GB
-
- # hover content
- def _fmt_size(bytes_):
- return human_readable_size(_safe_int(bytes_, 0))
-
- # choose x axis field and labels
- x_field = (
- x_var if x_var in {"records", "duration_h", "size_gb", "tasks"} else "records"
- )
- x_label = {
- "records": "#Records",
- "duration_h": "Duration (hours)",
- "size_gb": "Size (GB)",
- "tasks": "#Tasks",
- }[x_field]
-
- # hover text adapts to x
- if x_field == "duration_h":
- x_hover = "Duration: %{x:.2f} h"
- elif x_field == "size_gb":
- x_hover = "Size: %{x:.2f} GB"
- elif x_field == "tasks":
- x_hover = "Tasks: %{x:,}"
- else:
- x_hover = "Records (x): %{x:,}"
-
- hover = (
- "%{customdata[0]}" # dataset id
- " Subjects: %{y:,}"
- f" {x_hover}"
- " Records: %{customdata[1]:,}"
- " Tasks: %{customdata[2]:,}"
- " Channels: %{customdata[3]}"
- " Sampling: %{customdata[4]} Hz"
- " Size: %{customdata[5]}"
- " Modality: %{customdata[6]}"
- ""
- )
-
- d = d.dropna(subset=["duration_h", "subjects", "size_gb"]) # need these
-
- # Marker sizing: scale into a good visual range
- max_size = max(d["size_gb"].max(), 1)
- sizeref = (2.0 * max_size) / (40.0**2) # target ~40px max marker
-
- # Prepare prettified strings for hover
- def _fmt_int(v):
- if v is None or pd.isna(v):
- return ""
- try:
- return str(int(round(float(v))))
- except Exception:
- return str(v)
-
- sfreq_str = d["sfreq"].map(_fmt_int)
- nchans_str = d["nchans"].map(_fmt_int)
-
- fig = px.scatter(
- d,
- x=x_field,
- y="subjects",
- size="size_gb",
- color="modality_label",
- hover_name="dataset",
- custom_data=[
- d["dataset"],
- d["records"],
- d["tasks"],
- nchans_str,
- sfreq_str,
- d["size_bytes"].map(_fmt_size),
- d["modality_label"],
- ],
- size_max=40,
- labels={
- "subjects": "#Subjects",
- "modality_label": "Modality",
- x_field: x_label,
- },
- color_discrete_map=MODALITY_COLOR_MAP,
- title="",
- category_orders={
- "modality_label": [
- label
- for label in MODALITY_COLOR_MAP.keys()
- if label in d["modality_label"].unique()
- ]
- },
- )
-
- # tune marker sizing explicitly for better control
- for tr in fig.data:
- tr.marker.update(
- sizemin=6,
- sizemode="area",
- sizeref=sizeref,
- line=dict(width=0.6, color="rgba(0,0,0,0.3)"),
- )
- tr.hovertemplate = hover
-
- fig.update_layout(
- height=750,
- width=1200, # Set explicit width for consistent sizing
- margin=dict(l=60, r=40, t=80, b=60),
- template="plotly_white",
- legend=dict(
- title="Modality",
- orientation="h",
- yanchor="bottom",
- y=1.02,
- xanchor="right",
- x=0.99,
- ),
- font=dict(
- family="Inter, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif",
- size=14,
- ),
- title=dict(
- text="",
- x=0.01,
- xanchor="left",
- y=0.98,
- yanchor="top",
- pad=dict(t=10, b=8),
- ),
- autosize=True, # Enable auto-sizing to fill container
- )
-
- fig.update_xaxes(showgrid=True, gridcolor="rgba(0,0,0,0.12)", zeroline=False)
- fig.update_yaxes(showgrid=True, gridcolor="rgba(0,0,0,0.12)", zeroline=False)
-
- out_path = Path(out_html)
- out_path.parent.mkdir(parents=True, exist_ok=True)
- # Add CSS and loading indicator for immediate proper sizing
- html_content = fig.to_html(
- full_html=False,
- include_plotlyjs=False,
- div_id="dataset-bubble",
- config={
- "responsive": True,
- "displaylogo": False,
- "modeBarButtonsToRemove": ["lasso2d", "select2d"],
- "toImageButtonOptions": {
- "format": "png",
- "filename": "dataset_landscape",
- "height": 750,
- "width": 1200,
- "scale": 2,
- },
- },
- )
-
- # Wrap with styling to ensure proper initial sizing
- styled_html = f"""
-
-
Loading dataset landscape...
-{html_content}
-
-"""
-
- with open(str(out_path), "w", encoding="utf-8") as f:
- f.write(styled_html)
- return str(out_path)
-
-
-def human_readable_size(num_bytes: int) -> str:
- """Format bytes using the closest unit among MB, GB, TB (fallback to KB/B).
-
- Chooses the largest unit such that the value is >= 1. Uses base 1024.
- """
- if num_bytes is None:
- return "0 B"
- size = float(num_bytes)
- units = [
- (1024**4, "TB"),
- (1024**3, "GB"),
- (1024**2, "MB"),
- (1024**1, "KB"),
- (1, "B"),
- ]
- for factor, unit in units:
- if size >= factor:
- value = size / factor
- # Use no decimals for B/KB; two decimals otherwise
- if unit in ("B", "KB"):
- return f"{int(round(value))} {unit}"
- return f"{value:.2f} {unit}"
- return "0 B"
-
-
-def get_dataset_url(name: str) -> str:
- """Generate dataset URL for plots (relative to dataset summary page)."""
- name = name.strip()
- return f"api/dataset/eegdash.dataset.{name.upper()}.html"
-
-
def wrap_dataset_name(name: str):
# Remove any surrounding whitespace
name = name.strip()
# Link to the individual dataset API page
# Updated structure: api/dataset/eegdash.dataset..html
- url = f"api/dataset/eegdash.dataset.{name.upper()}.html"
+ url = get_dataset_url(name)
+ if not url:
+ return name.upper()
return f'{name.upper()}'
@@ -406,6 +54,163 @@ def wrap_dataset_name(name: str):
},
}
+# Minimal HTML shell for the rendered summary table; the ``__TABLE__`` token
+# (an assumed placeholder) is substituted with the pandas-generated markup below.
+DATA_TABLE_TEMPLATE = textwrap.dedent(
+    r"""
+    <div class="dataset-table-wrapper">
+    __TABLE__
+    </div>
+    """
+)
+
def _tag_normalizer(kind: str):
canonical = {k.lower(): v for k, v in DATASET_CANONICAL_MAP.get(kind, {}).items()}
@@ -517,10 +322,21 @@ def main(source_dir: str, target_dir: str):
f, index_col=False, header=0, skipinitialspace=True
) # , sep=";")
# Generate bubble chart from the raw data to have access to size_bytes
- # Use x-axis as number of records for better spread
bubble_path = target_dir / "dataset_bubble.html"
- gen_datasets_bubble(df_raw, str(bubble_path), x_var="records")
- copyfile(bubble_path, STATIC_DATASET_DIR / bubble_path.name)
+ bubble_output = generate_dataset_bubble(
+ df_raw,
+ bubble_path,
+ x_var="subjects",
+ )
+ copyfile(bubble_output, STATIC_DATASET_DIR / bubble_output.name)
+
+ # Generate Sankey diagram showing dataset flow across categories
+ try:
+ sankey_path = target_dir / "dataset_sankey.html"
+ sankey_output = generate_dataset_sankey(df_raw, sankey_path)
+ copyfile(sankey_output, STATIC_DATASET_DIR / sankey_output.name)
+ except Exception as exc:
+ print(f"[dataset Sankey] Skipped due to error: {exc}")
df = prepare_table(df_raw)
# preserve int values
@@ -568,239 +384,18 @@ def main(source_dir: str, target_dir: str):
escape=False,
table_id="datasets-table",
)
+    html_table = DATA_TABLE_TEMPLATE.replace("__TABLE__", html_table)
table_path = target_dir / "dataset_summary_table.html"
- with open(table_path, "+w", encoding="utf-8") as f:
+ with open(table_path, "w", encoding="utf-8") as f:
f.write(html_table)
copyfile(table_path, STATIC_DATASET_DIR / table_path.name)
# Generate KDE ridgeline plot for modality participant distributions
try:
- d_modal = df_raw[df_raw["dataset"].str.lower() != "test"].copy()
- d_modal["modality_label"] = d_modal["modality of exp"].apply(
- _primary_modality
- )
- d_modal["n_subjects"] = pd.to_numeric(
- d_modal["n_subjects"], errors="coerce"
- )
- d_modal = d_modal.dropna(subset=["n_subjects"])
-
- fig_kde = go.Figure()
- order = [
- label
- for label in MODALITY_COLOR_MAP
- if label in d_modal["modality_label"].unique()
- ]
- rng = np.random.default_rng(42)
-
- for idx, label in enumerate(order):
- subset = d_modal[d_modal["modality_label"] == label].copy()
- vals = subset["n_subjects"].astype(float).dropna()
- if len(vals) < 3:
- continue
- # Generate URLs for datasets in this modality
- subset["dataset_url"] = subset["dataset"].apply(get_dataset_url)
- log_vals = np.log10(vals)
- grid = np.linspace(log_vals.min() - 0.25, log_vals.max() + 0.25, 240)
- kde = gaussian_kde(log_vals)
- density = kde(grid)
- if density.max() <= 0:
- continue
- density_norm = density / density.max()
- amplitude = 0.6
- baseline = idx * 1.1
- y_curve = baseline + density_norm * amplitude
- x_curve = 10**grid
-
- color = MODALITY_COLOR_MAP.get(label, "#6b7280")
- fill = _hex_to_rgba(color, 0.28)
-
- fig_kde.add_trace(
- go.Scatter(
- x=np.concatenate([x_curve, x_curve[::-1]]),
- y=np.concatenate([y_curve, np.full_like(y_curve, baseline)]),
- name=label,
- fill="toself",
- fillcolor=fill,
- line=dict(color="rgba(0,0,0,0)"),
- hoverinfo="skip",
- showlegend=False,
- )
- )
-
- fig_kde.add_trace(
- go.Scatter(
- x=x_curve,
- y=y_curve,
- mode="lines",
- name=label,
- line=dict(color=color, width=2),
- hovertemplate=f"{label} #Participants: %{{x:.0f}}",
- )
- )
-
- jitter = rng.uniform(0.02, amplitude * 0.5, size=len(vals))
- # Prepare custom data with dataset names and URLs
- custom_data = np.column_stack(
- [subset["dataset"].to_numpy(), subset["dataset_url"].to_numpy()]
- )
- fig_kde.add_trace(
- go.Scatter(
- x=vals,
- y=np.full_like(vals, baseline) + jitter,
- mode="markers",
- name=label,
- marker=dict(color=color, size=5, opacity=0.6),
- customdata=custom_data,
- hovertemplate="%{customdata[0]} #Participants: %{x} Click to view dataset details",
- showlegend=False,
- )
- )
-
- if fig_kde.data:
- fig_kde.update_layout(
- height=max(650, 150 * len(order)),
- width=1200, # Set explicit width for consistent sizing
- template="plotly_white",
- xaxis=dict(
- type="log",
- title="#Participants",
- showgrid=True,
- gridcolor="rgba(0,0,0,0.12)",
- zeroline=False,
- ),
- yaxis=dict(
- title="Modality",
- tickmode="array",
- tickvals=[idx * 1.1 for idx in range(len(order))],
- ticktext=order,
- showgrid=False,
- range=[-0.3, max(0.3, (len(order) - 1) * 1.1 + 0.9)],
- ),
- legend=dict(
- title="Modality",
- orientation="h",
- yanchor="bottom",
- y=1.02,
- xanchor="right",
- x=0.99,
- ),
- margin=dict(l=120, r=40, t=80, b=80),
- title=dict(
- text="",
- x=0.01,
- xanchor="left",
- y=0.98,
- yanchor="top",
- ),
- autosize=True, # Enable auto-sizing to fill container
- )
- # Add CSS and loading indicator for immediate proper sizing
- kde_height = max(650, 150 * len(order))
- plot_config = {
- "responsive": True,
- "displaylogo": False,
- "modeBarButtonsToRemove": ["lasso2d", "select2d"],
- "toImageButtonOptions": {
- "format": "png",
- "filename": "participant_kde",
- "height": kde_height,
- "width": 1200,
- "scale": 2,
- },
- }
- fig_spec = fig_kde.to_plotly_json()
- data_json = json.dumps(fig_spec.get("data", []), cls=PlotlyJSONEncoder)
- layout_json = json.dumps(
- fig_spec.get("layout", {}), cls=PlotlyJSONEncoder
- )
- config_json = json.dumps(plot_config, cls=PlotlyJSONEncoder)
-
- # Wrap with styling to ensure proper initial sizing and defer Plotly rendering
- styled_html = f"""
-
-
Loading participant distribution...
-
-
-"""
-
- kde_path = Path(target_dir) / "dataset_kde_modalities.html"
- with open(kde_path, "w", encoding="utf-8") as f:
- f.write(styled_html)
- copyfile(kde_path, STATIC_DATASET_DIR / kde_path.name)
+ kde_path = target_dir / "dataset_kde_modalities.html"
+ kde_output = generate_modality_ridgeline(df_raw, kde_path)
+ if kde_output:
+ copyfile(kde_output, STATIC_DATASET_DIR / kde_output.name)
except Exception as exc:
print(f"[dataset KDE] Skipped due to error: {exc}")
diff --git a/docs/source/dataset_summary.rst b/docs/source/dataset_summary.rst
index b4b607ff..c3ef7a33 100644
--- a/docs/source/dataset_summary.rst
+++ b/docs/source/dataset_summary.rst
@@ -10,13 +10,11 @@
.. rst-class:: dataset-summary-article
-Datasets
-=========
+Dataset Catalog
+===============
To leverage recent and ongoing advancements in large-scale computational methods and to ensure the preservation of scientific data generated from publicly funded research, the EEG-DaSh data archive will create a data-sharing resource for MEEG (EEG, MEG) data contributed by collaborators for machine learning (ML) and deep learning (DL) applications.
-The archive is currently still in :bdg-danger:`beta testing` mode, so be kind.
-
.. raw:: html
@@ -27,10 +25,16 @@ The archive is currently still in :bdg-danger:`beta testing` mode, so be kind.
.. include:: dataset_summary/table.rst
- .. tab-item:: Participant KDE
+ .. tab-item:: Participant Distribution
.. include:: dataset_summary/kde.rst
- .. tab-item:: Landscape
+ .. tab-item:: Dataset Flow
+
+ .. include:: dataset_summary/sankey.rst
+
+ .. tab-item:: Scatter of Sample Size vs. Recording Duration
.. include:: dataset_summary/bubble.rst
+
+The archive is currently still in :bdg-danger:`beta testing` mode, so be kind.
diff --git a/docs/source/dataset_summary/bubble.rst b/docs/source/dataset_summary/bubble.rst
index 83e83179..6de57ebf 100644
--- a/docs/source/dataset_summary/bubble.rst
+++ b/docs/source/dataset_summary/bubble.rst
@@ -1,3 +1,5 @@
+.. title:: Dataset landscape
+
.. rubric:: Dataset landscape
.. raw:: html
diff --git a/docs/source/dataset_summary/kde.rst b/docs/source/dataset_summary/kde.rst
index 906a4c6f..e2f66ddf 100644
--- a/docs/source/dataset_summary/kde.rst
+++ b/docs/source/dataset_summary/kde.rst
@@ -1,4 +1,6 @@
-.. rubric:: Participant Distribution by Modality
+.. title:: Participant Distribution by Modality
+
+.. rubric:: Distribution of Sample Sizes Varies by Experimental Modality
.. raw:: html
diff --git a/docs/source/dataset_summary/sankey.rst b/docs/source/dataset_summary/sankey.rst
new file mode 100644
index 00000000..66304778
--- /dev/null
+++ b/docs/source/dataset_summary/sankey.rst
@@ -0,0 +1,20 @@
+.. title:: Dataset flow
+
+.. rubric:: Sankey Diagram of EEG-DaSh Datasets by Population, Modality, and Cognitive Domain
+
+.. raw:: html
+
+
+
+.. raw:: html
+ :file: ../_static/dataset_generated/dataset_sankey.html
+
+.. raw:: html
+
+
+ Figure: Dataset flow across population, modality, and cognitive domain.
+ Link thickness is proportional to the total number of subjects, and the tooltip
+    reports both subject and dataset counts. Hover over nodes and links to
+    explore specific segments.
+
+
diff --git a/docs/source/dataset_summary/table.rst b/docs/source/dataset_summary/table.rst
index 3891d906..b409b575 100644
--- a/docs/source/dataset_summary/table.rst
+++ b/docs/source/dataset_summary/table.rst
@@ -1,3 +1,5 @@
+.. title:: EEG Datasets Table
+
.. rubric:: EEG Datasets Table
The data in EEG-DaSh originates from a collaboration involving 25 laboratories, encompassing 27,053 participants. This extensive collection includes M-EEG data, which is a combination of EEG and MEG signals. The data is sourced from various studies conducted by these labs,
@@ -22,153 +24,4 @@ In addition, EEG-DaSh will incorporate a subset of the data converted from `NEMA
-Pathology, modality, and dataset type now surface as consistent color-coded tags so you can scan the table at a glance and reuse the same visual language as the model catalog.
-
-.. raw:: html
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+Pathology, modality, and dataset type now surface as consistent color-coded tags so you can scan the table at a glance.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 01b8e41d..632007c6 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -20,11 +20,14 @@ EEG Dash Homepage
.. rst-class:: h4 text-center font-weight-light my-4
-
The EEG-DaSh data archive will establish a data-sharing resource for MEEG (EEG, MEG) data, enabling
large-scale computational advancements to preserve and share scientific data from publicly funded
research for machine learning and deep learning applications.
+.. rst-class:: text-center
+
+**Note:** The "DaSh" in EEG-DaSh stands for **Data Share**.
+
The EEG-DaSh data archive is a collaborative effort led by the University of California, San Diego (UCSD) and Ben-Gurion University of the Negev (BGU) and partially funded by the National Science Foundation (NSF). All are welcome to contribute to the https://github.com/sccn/EEGDash project.
The archive is currently still in :bdg-danger:`beta testing` mode, so be kind.
diff --git a/eegdash/dataset/dataset_summary.csv b/eegdash/dataset/dataset_summary.csv
index 609fa2d3..74781587 100644
--- a/eegdash/dataset/dataset_summary.csv
+++ b/eegdash/dataset/dataset_summary.csv
@@ -198,7 +198,7 @@
197,ds003751,38,38,1,128,250,19.95,4.71 GB,5057922307,0,ds003751,Healthy,other,Multisensory,Affect
198,ds003421,80,20,1,257,1000,11.604,76.77 GB,82433418198,0,ds003421,Healthy,10-20,Multisensory,Decision-making
199,ds002158,117,20,1,,,0.0,428.59 GB,460190030981,0,ds002158,Healthy,10-20,Visual,Affect
-200,ds004951,23,11,1,63,1000,29.563,22.00 GB,23627352274,0,ds004951,?,,Tactile,Learning
+200,ds004951,23,11,1,63,1000,29.563,22.00 GB,23627352274,0,ds004951,,,Tactile,Learning
201,ds004802,38,38,1,65,"2048,512",0.0,29.34 GB,31504070800,0,ds004802,Other,,Visual,Affect
202,ds004816,20,20,1,63,1000,0.0,23.31 GB,25028989553,0,ds004816,Healthy,,Visual,Attention
203,ds005873,2850,125,1,2,256,11935.09,117.21 GB,125851664268,0,,,,,