diff --git a/docs/plot_dataset/__init__.py b/docs/plot_dataset/__init__.py new file mode 100644 index 00000000..85942823 --- /dev/null +++ b/docs/plot_dataset/__init__.py @@ -0,0 +1,13 @@ +"""Plot generation utilities for EEGDash documentation.""" + +from .bubble import generate_dataset_bubble # noqa: F401 +from .colours import ( # noqa: F401 + CANONICAL_MAP, + COLUMN_COLOR_MAPS, + MODALITY_COLOR_MAP, + PATHOLOGY_COLOR_MAP, + TYPE_COLOR_MAP, + hex_to_rgba, +) +from .plot_sankey import generate_dataset_sankey # noqa: F401 +from .ridgeline import generate_modality_ridgeline # noqa: F401 diff --git a/docs/plot_dataset/bubble.py b/docs/plot_dataset/bubble.py new file mode 100644 index 00000000..3e5c3ae5 --- /dev/null +++ b/docs/plot_dataset/bubble.py @@ -0,0 +1,404 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go + +try: # Allow execution as a script or module + from .colours import MODALITY_COLOR_MAP + from .utils import get_dataset_url, human_readable_size, primary_modality, safe_int +except ImportError: # pragma: no cover - fallback for direct script execution + from colours import MODALITY_COLOR_MAP # type: ignore + from utils import ( # type: ignore + get_dataset_url, + human_readable_size, + primary_modality, + safe_int, + ) + +__all__ = ["generate_dataset_bubble"] + + +def _to_numeric_median_list(val) -> float | None: + if pd.isna(val): + return None + try: + return float(val) + except Exception: + pass + + s = str(val).strip().strip("[]") + if not s: + return None + + try: + nums = [float(x) for x in s.split(",") if str(x).strip()] + if not nums: + return None + return float(np.median(nums)) + except Exception: + return None + + +def _format_int(value) -> str: + if value is None or pd.isna(value): + return "" + try: + return str(int(round(float(value)))) + except Exception: + return str(value) + + +def _build_hover_template(x_field: str, y_field: str) -> tuple[str, str]: + x_map = { + "duration_h": "Duration (x): %{x:.2f} h", + "size_gb": "Size (x): %{x:.2f} GB", + "tasks": "Tasks (x): %{x:,}", + "subjects": "Subjects (x): %{x:,}", + } + y_map = { + "subjects": "Subjects (y): %{y:,}", + } + x_hover = x_map.get(x_field, "Records (x): %{x:,}") + y_hover = y_map.get(y_field, "Records (y): %{y:,}") + return x_hover, y_hover + + +def generate_dataset_bubble( + df: pd.DataFrame, + out_html: str | Path, + *, + x_var: str = "records", + max_width: int = 1280, + height: int = 720, +) -> Path: + """Generate the dataset landscape bubble chart.""" + data = df.copy() + data = data[data["dataset"].str.lower() != "test"] + + data["duration_h"] = pd.to_numeric( + data.get("duration_hours_total"), errors="coerce" + ) + data["subjects"] = pd.to_numeric(data.get("n_subjects"), errors="coerce") + data["records"] = pd.to_numeric(data.get("n_records"), errors="coerce") + data["tasks"] = pd.to_numeric(data.get("n_tasks"), errors="coerce") + data["size_bytes"] = pd.to_numeric(data.get("size_bytes"), errors="coerce") + + data["sfreq"] = data["sampling_freqs"].map(_to_numeric_median_list) + data["nchans"] = data["nchans_set"].map(_to_numeric_median_list) + + data["modality_label"] = data.get("modality of exp").apply(primary_modality) + + GB = 1024**3 + data["size_gb"] = data["size_bytes"] / GB + + x_field = ( + x_var + if x_var in {"records", "duration_h", "size_gb", "tasks", "subjects"} + else "records" + ) + axis_labels = { + "records": "#Records", + "duration_h": "Duration (hours)", + 
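+        # Keys mirror the --x-axis choices exposed by main() below.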
"size_gb": "Size (GB)", + "tasks": "#Tasks", + "subjects": "#Subjects", + } + x_label = f"{axis_labels[x_field]} (log scale)" + y_field = "subjects" if x_field != "subjects" else "records" + y_label = f"{axis_labels[y_field]} (log scale)" + x_hover, y_hover = _build_hover_template(x_field, y_field) + + required_columns = {x_field, y_field, "size_gb"} + data = data.replace([np.inf, -np.inf], np.nan) + data = data.dropna(subset=list(required_columns)) + data = data[(data[x_field] > 0) & (data[y_field] > 0)] + + data["dataset_url"] = data["dataset"].apply(get_dataset_url) + + out_path = Path(out_html) + out_path.parent.mkdir(parents=True, exist_ok=True) + + if data.empty: + empty_html = """ +
+<div>No dataset records available for plotting.</div>
+""" + out_path.write_text(empty_html, encoding="utf-8") + return out_path + + size_max = data["size_gb"].max() + if not np.isfinite(size_max) or size_max <= 0: + size_max = 1.0 + sizeref = (2.0 * size_max) / (40.0**2) + + sfreq_str = data["sfreq"].map(_format_int) + nchans_str = data["nchans"].map(_format_int) + + fig = px.scatter( + data, + x=x_field, + y=y_field, + size="size_gb", + color="modality_label", + hover_name="dataset", + custom_data=[ + data["dataset"], + data["subjects"], + data["records"], + data["tasks"], + nchans_str, + sfreq_str, + data["size_bytes"].map( + lambda bytes_: human_readable_size(safe_int(bytes_, 0)) + ), + data["modality_label"], + data["dataset_url"], + ], + size_max=40, + labels={ + y_field: y_label, + "modality_label": "Modality", + x_field: x_label, + }, + color_discrete_map=MODALITY_COLOR_MAP, + title="", + category_orders={ + "modality_label": [ + label + for label in MODALITY_COLOR_MAP.keys() + if label in data["modality_label"].unique() + ] + }, + log_x=True, + log_y=True, + ) + + numeric_x = pd.to_numeric(data[x_field], errors="coerce") + numeric_y = pd.to_numeric(data[y_field], errors="coerce") + mask = ( + np.isfinite(numeric_x) + & np.isfinite(numeric_y) + & (numeric_x > 0) + & (numeric_y > 0) + ) + + fit_annotation_text = None + if mask.sum() >= 2: + log_x = np.log10(numeric_x[mask]) + log_y = np.log10(numeric_y[mask]) + ss_tot = np.sum((log_y - log_y.mean()) ** 2) + if np.ptp(log_x) > 0 and np.ptp(log_y) > 0 and ss_tot > 0: + slope, intercept = np.polyfit(log_x, log_y, 1) + line_log_x = np.linspace(log_x.min(), log_x.max(), 200) + line_x = 10**line_log_x + line_y = 10 ** (slope * line_log_x + intercept) + fig.add_trace( + go.Scatter( + x=line_x, + y=line_y, + mode="lines", + name="log-log fit", + line=dict(color="#111827", width=2, dash="dot"), + hoverinfo="skip", + showlegend=False, + ) + ) + residuals = log_y - (slope * log_x + intercept) + r_squared = 1 - np.sum(residuals**2) / ss_tot + fit_annotation_text = f"log-log OLS fit R² = {r_squared:.3f}" + + hover_template = ( + "%{customdata[0]}" + f"
{x_hover}" + f"
{y_hover}" + "
Subjects (total): %{customdata[1]:,}" + "
Records (total): %{customdata[2]:,}" + "
Tasks: %{customdata[3]:,}" + "
Channels: %{customdata[4]}" + "
Sampling: %{customdata[5]} Hz" + "
Size: %{customdata[6]}" + "
Modality: %{customdata[7]}" + "
Click bubble to open dataset page" + "" + ) + + for trace in fig.data: + mode = getattr(trace, "mode", "") or "" + if "markers" not in mode: + continue + trace.marker.update( + sizemin=6, + sizemode="area", + sizeref=sizeref, + line=dict(width=0.6, color="rgba(0,0,0,0.3)"), + opacity=0.75, + ) + trace.hovertemplate = hover_template + + fig.update_layout( + height=height, + width=max_width, + margin=dict(l=60, r=40, t=80, b=60), + template="plotly_white", + legend=dict( + title="Modality", + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=0.99, + ), + font=dict( + family="Inter, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif", + size=14, + ), + title=dict(text="", x=0.01, xanchor="left", y=0.98, yanchor="top"), + autosize=True, + ) + + if fit_annotation_text: + fig.add_annotation( + xref="paper", + yref="paper", + x=0.02, + y=0.98, + text=fit_annotation_text, + showarrow=False, + font=dict(size=15, color="#111827"), + bgcolor="rgba(255,255,255,0.75)", + bordercolor="rgba(17,24,39,0.25)", + borderwidth=1, + borderpad=6, + ) + + fig.update_xaxes( + showgrid=True, + gridcolor="rgba(0,0,0,0.12)", + zeroline=False, + type="log", + dtick=1, + ) + fig.update_yaxes( + showgrid=True, + gridcolor="rgba(0,0,0,0.12)", + zeroline=False, + type="log", + dtick=1, + ) + + html_content = fig.to_html( + full_html=False, + include_plotlyjs=False, + div_id="dataset-bubble", + config={ + "responsive": True, + "displaylogo": False, + "modeBarButtonsToRemove": ["lasso2d", "select2d"], + "toImageButtonOptions": { + "format": "png", + "filename": "dataset_landscape", + "height": height, + "width": max_width, + "scale": 2, + }, + }, + ) + + styled_html = f""" + +
+<div>Loading dataset landscape...</div>
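+<!-- Figure markup produced by fig.to_html(include_plotlyjs=False); the docs page is expected to load plotly.js itself -->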
+{html_content} + +""" + + out_path.write_text(styled_html, encoding="utf-8") + return out_path + + +def _read_dataset(path: Path) -> pd.DataFrame: + return pd.read_csv(path, index_col=False, header=0, skipinitialspace=True) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser(description="Generate the dataset bubble chart.") + parser.add_argument("source", type=Path, help="Path to dataset summary CSV") + parser.add_argument( + "--output", + type=Path, + default=Path("dataset_bubble.html"), + help="Output HTML file", + ) + parser.add_argument( + "--x-axis", + choices=["records", "duration_h", "size_gb", "tasks", "subjects"], + default="records", + help="Field for the bubble chart x-axis", + ) + args = parser.parse_args() + + df = _read_dataset(args.source) + output_path = generate_dataset_bubble(df, args.output, x_var=args.x_axis) + print(f"Bubble chart saved to {output_path.resolve()}") + + +if __name__ == "__main__": + main() diff --git a/docs/plot_dataset/colours.py b/docs/plot_dataset/colours.py new file mode 100644 index 00000000..7d2d50ae --- /dev/null +++ b/docs/plot_dataset/colours.py @@ -0,0 +1,98 @@ +"""Helpers for Sankey diagram generation.""" + +# Color mappings consistent with prepare_summary_tables.py and custom.css +PATHOLOGY_COLOR_MAP = { + "Healthy": "#22c55e", # green + "Clinical": "#f87171", # Lighter red to match table + "Unknown": "#94a3b8", # grey +} + +MODALITY_COLOR_MAP = { + "Visual": "#2563eb", + "Auditory": "#0ea5e9", + "Tactile": "#10b981", + "Somatosensory": "#10b981", + "Multisensory": "#ec4899", + "Motor": "#f59e0b", + "Resting State": "#6366f1", + "Rest": "#6366f1", + "Sleep": "#7c3aed", + "Other": "#14b8a6", + "Unknown": "#94a3b8", +} + +TYPE_COLOR_MAP = { + "Perception": "#3b82f6", + "Decision-making": "#eab308", + "Rest": "#16a34a", + "Resting-state": "#16a34a", + "Sleep": "#8b5cf6", + "Cognitive": "#6366f1", + "Clinical": "#f87171", # Lighter red to match table + "Memory": "#c4b5fd", # Lighter purple to match table + "Attention": "#c4b5fd", # Lighter purple to match table + "Intervention": "#c4b5fd", # Lighter purple to match table + "Learning": "#c4b5fd", # Lighter purple to match table + "Other": "#c4b5fd", # Lighter purple to match table + "Unknown": "#94a3b8", +} + +# Canonical mappings to normalize values +CANONICAL_MAP = { + "Type Subject": { + "healthy controls": "Healthy", + "healthy": "Healthy", + "control": "Healthy", + "clinical": "Clinical", + "patient": "Clinical", + }, + "modality of exp": { + "visual": "Visual", + "auditory": "Auditory", + "tactile": "Tactile", + "somatosensory": "Tactile", + "multisensory": "Multisensory", + "motor": "Motor", + "rest": "Resting State", + "resting state": "Resting State", + "resting-state": "Resting State", + "sleep": "Sleep", + "other": "Other", + }, + "type of exp": { + "perception": "Perception", + "decision making": "Decision-making", + "decision-making": "Decision-making", + "rest": "Rest", + "resting state": "Resting-state", + "resting-state": "Resting-state", + "sleep": "Sleep", + "cognitive": "Cognitive", + "clinical": "Clinical", + "other": "Other", + }, +} + +# Map column names to their color maps +COLUMN_COLOR_MAPS = { + "Type Subject": PATHOLOGY_COLOR_MAP, + "modality of exp": MODALITY_COLOR_MAP, + "type of exp": TYPE_COLOR_MAP, +} + + +def hex_to_rgba(hex_color: str, alpha: float = 0.2) -> str: + """Convert hex color to rgba with given alpha.""" + if not isinstance(hex_color, str) or not hex_color.startswith("#"): + # This is not a valid hex color, return a default 
color + return "rgba(148, 163, 184, 0.2)" # Default grey + hex_color = hex_color.lstrip("#") + if len(hex_color) != 6: + return "rgba(148, 163, 184, 0.2)" # Default grey for invalid length + try: + r = int(hex_color[0:2], 16) + g = int(hex_color[2:4], 16) + b = int(hex_color[4:6], 16) + except ValueError: + return "rgba(148, 163, 184, 0.2)" # Default grey for conversion error + return f"rgba({r}, {g}, {b}, {alpha})" diff --git a/docs/plot_dataset/plot_sankey.py b/docs/plot_dataset/plot_sankey.py new file mode 100644 index 00000000..fb41a3a8 --- /dev/null +++ b/docs/plot_dataset/plot_sankey.py @@ -0,0 +1,352 @@ +from __future__ import annotations + +"""Generate a Sankey diagram from the EEG-Dash dataset summary. + +The script loads ``eegdash/dataset/dataset_summary.csv`` (by default) and builds +an interactive Plotly Sankey diagram connecting three categorical columns. This +mirrors how the documentation summarises datasets across subject type, modality, +and experiment type, but can be reused with any trio of categorical columns via +CLI arguments. +""" + +import argparse +from pathlib import Path +from typing import Sequence + +import pandas as pd +import plotly.graph_objects as go + +try: # Support execution as a script or as a package module + from .colours import CANONICAL_MAP, COLUMN_COLOR_MAPS, hex_to_rgba +except ImportError: # pragma: no cover - fallback for direct script execution + from colours import CANONICAL_MAP, COLUMN_COLOR_MAPS, hex_to_rgba + +DEFAULT_COLUMNS = ["Type Subject", "modality of exp", "type of exp"] +__all__ = ["generate_dataset_sankey", "build_sankey"] + + +def _prepare_dataframe(df: pd.DataFrame, columns: Sequence[str]) -> pd.DataFrame: + all_columns = list(columns) + if "n_subjects" not in all_columns: + all_columns.append("n_subjects") + + missing = [col for col in all_columns if col not in df.columns] + if missing: + msg = f"Columns not found in dataframe: {missing}" + raise ValueError(msg) + + cleaned = df.copy() + + # Fill missing n_subjects with 1 (to count as at least one dataset) + # and ensure the column is numeric integer type. + cleaned["n_subjects"] = ( + pd.to_numeric(cleaned["n_subjects"], errors="coerce").fillna(1).astype(int) + ) + + # Process each column for cleaning and normalization + for col in columns: + # 1. Fill original NaN values with the string 'Unknown' + cleaned[col] = cleaned[col].fillna("Unknown") + + # 2. Split multi-valued cells + cleaned[col] = cleaned[col].astype(str).str.split(r"/|;|,", regex=True) + cleaned = cleaned.explode(col) + + # 3. Clean up whitespace and any empty strings created by splitting + cleaned[col] = cleaned[col].str.strip() + cleaned[col] = cleaned[col].replace(["", "nan"], "Unknown") + + # 4. Apply canonical mapping to standardize terms + if col in CANONICAL_MAP: + mapping = CANONICAL_MAP[col] + # Use .str.lower() for case-insensitive mapping + cleaned[col] = cleaned[col].str.lower().map(mapping).fillna(cleaned[col]) + + # 5. Apply special rule for 'Type Subject' after all other processing + if "Type Subject" in columns: + # The user wants to preserve original labels but color them as 'Clinical'. + # The relabeling to 'Clinical' is now removed. The coloring logic will handle this. 
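+        # Intentional no-op: labels keep their original names, and
+        # _build_sankey_data assigns the "Clinical" color to any value
+        # other than "Healthy" or "Unknown".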
+ pass + + return cleaned[all_columns] + + +def _load_dataframe(path: Path, columns: Sequence[str]) -> pd.DataFrame: + df = pd.read_csv( + path, + index_col=False, + header=0, + skipinitialspace=True, + ) + return _prepare_dataframe(df, columns) + + +def _build_sankey_data(df: pd.DataFrame, columns: Sequence[str]): + node_labels: list[str] = [] + node_colors: list[str] = [] + node_index: dict[tuple[str, str], int] = {} + + for col in columns: + color_map = COLUMN_COLOR_MAPS.get(col, {}) + + # Sort unique values to ensure "Unknown" appears at the bottom + all_unique = df[col].unique() + # Separate "Unknown" and sort the rest alphabetically + known_values = sorted([v for v in all_unique if v != "Unknown"]) + unique_values = known_values + # Add "Unknown" to the end if it exists + if "Unknown" in all_unique: + unique_values.append("Unknown") + + for val in unique_values: + if (col, val) not in node_index: + node_index[(col, val)] = len(node_labels) + node_labels.append(val) + + # Use "Clinical" color for specific pathologies + node_color = color_map.get(val, "#94a3b8") + if col == "Type Subject" and val not in ["Healthy", "Unknown"]: + node_color = color_map.get("Clinical", "#94a3b8") + node_colors.append(node_color) + + sources: list[int] = [] + targets: list[int] = [] + values: list[int] = [] + link_colors: list[str] = [] + link_hover_labels: list[str] = [] + + for idx in range(len(columns) - 1): + col_from, col_to = columns[idx], columns[idx + 1] + + # Use the color from the source node for the link + source_color_map = COLUMN_COLOR_MAPS.get(col_from, {}) + + # Group by source and target, getting both sum of subjects and count of datasets + grouped = ( + df.groupby([col_from, col_to]) + .agg( + subject_sum=("n_subjects", "sum"), + dataset_count=("n_subjects", "size"), + ) + .reset_index() + ) + + for _, row in grouped.iterrows(): + source_val, target_val, subject_sum, dataset_count = ( + row[col_from], + row[col_to], + row["subject_sum"], + row["dataset_count"], + ) + + source_node_idx = node_index.get((col_from, source_val)) + target_node_idx = node_index.get((col_to, target_val)) + + if source_node_idx is not None and target_node_idx is not None: + sources.append(source_node_idx) + targets.append(target_node_idx) + values.append(subject_sum) # Weight links by sum of subjects + link_hover_labels.append( + f"{source_val} → {target_val}:
" + f"{subject_sum} subjects in {dataset_count} datasets" + ) + + # Assign color to the link based on the source node + source_color = source_color_map.get(source_val, "#94a3b8") + if col_from == "Type Subject" and source_val not in [ + "Healthy", + "Unknown", + ]: + source_color = source_color_map.get("Clinical", "#94a3b8") + link_colors.append(hex_to_rgba(source_color)) + + # Add counts (subjects and datasets) and percentages to the first column labels + first_col_name = columns[0] + first_col_stats = df.groupby(first_col_name).agg( + subject_sum=("n_subjects", "sum"), + dataset_count=("n_subjects", "size"), + ) + total_subjects = first_col_stats["subject_sum"].sum() + + for i, label in enumerate(node_labels): + col, val = next((k for k, v in node_index.items() if v == i), (None, None)) + if col == first_col_name and val in first_col_stats.index: + stats = first_col_stats.loc[val] + subject_sum = stats["subject_sum"] + dataset_count = stats["dataset_count"] + percentage = ( + (subject_sum / total_subjects) * 100 if total_subjects > 0 else 0 + ) + node_labels[i] = ( + f"{label}
({subject_sum} subjects, {dataset_count} datasets, {percentage:.1f}%)" + ) + + return ( + node_labels, + node_colors, + sources, + targets, + values, + link_colors, + link_hover_labels, + ) + + +def build_sankey(df: pd.DataFrame, columns: Sequence[str]) -> go.Figure: + ( + labels, + colors, + sources, + targets, + values, + link_colors, + link_hover_labels, + ) = _build_sankey_data(df, columns) + + sankey = go.Sankey( + arrangement="snap", + node=dict( + pad=30, + thickness=18, + label=labels, + color=colors, + align="left", # Align all labels to the left of the node bars + ), + link=dict( + source=sources, + target=targets, + value=values, + color=link_colors, + hovertemplate="%{customdata}", + customdata=link_hover_labels, + ), + ) + + fig = go.Figure(sankey) + + fig.update_layout( + font=dict(size=14), + height=900, + width=None, + autosize=True, + margin=dict(t=40, b=40, l=40, r=40), + annotations=[ + dict( + x=0, + y=1.05, + xref="paper", + yref="paper", + text="Population Type", + showarrow=False, + font=dict(size=16, color="black"), + ), + dict( + x=0.5, + y=1.05, + xref="paper", + yref="paper", + text="Experimental Modality", + showarrow=False, + font=dict(size=16, color="black"), + ), + dict( + x=1, + y=1.05, + xref="paper", + yref="paper", + text="Cognitive Domain", + showarrow=False, + font=dict(size=16, color="black"), + ), + dict( + x=0, + y=-0.15, # Position the note below the chart + xref="paper", + yref="paper", + text='Note on "Unknown" category: This large portion represents datasets that are still pending categorization.', + showarrow=False, + align="left", + xanchor="left", + font=dict(size=12, color="dimgray"), + ), + ], + ) + return fig + + +def generate_dataset_sankey( + df: pd.DataFrame, + out_html: str | Path, + *, + columns: Sequence[str] | None = None, +) -> Path: + """Generate the dataset Sankey diagram and write it to *out_html*.""" + selected_columns = list(columns) if columns is not None else list(DEFAULT_COLUMNS) + prepared = _prepare_dataframe(df, selected_columns) + fig = build_sankey(prepared, selected_columns) + + out_path = Path(out_html) + out_path.parent.mkdir(parents=True, exist_ok=True) + + html_content = fig.to_html( + full_html=False, + include_plotlyjs=False, + div_id="dataset-sankey", + config={ + "responsive": True, + "displaylogo": False, + "modeBarButtonsToRemove": ["lasso2d", "select2d"], + }, + ) + + out_path.write_text(html_content, encoding="utf-8") + return out_path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate a Sankey diagram from the dataset summary CSV." 
+ ) + parser.add_argument( + "--source", + type=Path, + default=Path("eegdash/dataset/dataset_summary.csv"), + help="Path to the dataset summary CSV file.", + ) + parser.add_argument( + "--columns", + nargs=3, + metavar=("FIRST", "SECOND", "THIRD"), + default=DEFAULT_COLUMNS, + help="Three categorical columns to connect in the Sankey plot.", + ) + parser.add_argument( + "--output", + type=Path, + default=Path("dataset_summary_sankey.html"), + help="Output HTML file for the interactive Sankey diagram.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + if not args.source.exists(): + raise FileNotFoundError(f"Dataset summary CSV not found at {args.source}") + + columns = list(args.columns) + df = _load_dataframe(args.source, columns) + fig = build_sankey(df, columns) + + args.output.parent.mkdir(parents=True, exist_ok=True) + fig.write_html( + str(args.output), + include_plotlyjs="cdn", + full_html=True, + auto_open=False, + ) + print(f"Sankey diagram saved to {args.output.resolve()}") + + +if __name__ == "__main__": + main() diff --git a/docs/plot_dataset/ridgeline.py b/docs/plot_dataset/ridgeline.py new file mode 100644 index 00000000..34d5a83f --- /dev/null +++ b/docs/plot_dataset/ridgeline.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from plotly.utils import PlotlyJSONEncoder +from scipy.stats import gaussian_kde + +try: # Allow execution as a script or module + from .colours import MODALITY_COLOR_MAP, hex_to_rgba + from .utils import get_dataset_url, primary_modality +except ImportError: # pragma: no cover - fallback for direct script execution + from colours import MODALITY_COLOR_MAP, hex_to_rgba # type: ignore + from utils import get_dataset_url, primary_modality # type: ignore + +__all__ = ["generate_modality_ridgeline"] + + +def generate_modality_ridgeline( + df: pd.DataFrame, + out_html: str | Path, + *, + rng_seed: int = 42, +) -> Path | None: + """Generate a ridgeline (KDE) plot showing participants per modality.""" + data = df[df["dataset"].str.lower() != "test"].copy() + data["modality_label"] = data["modality of exp"].apply(primary_modality) + data["n_subjects"] = pd.to_numeric(data["n_subjects"], errors="coerce") + data = data.dropna(subset=["n_subjects"]) + data = data[data["modality_label"] != "Other"] + + if data.empty: + return None + + median_participants = ( + data.groupby("modality_label")["n_subjects"].median().sort_values() + ) + order = [ + label + for label in median_participants.index + if label in data["modality_label"].unique() + ] + if not order: + return None + + fig = go.Figure() + rng = np.random.default_rng(rng_seed) + amplitude = 0.6 + row_spacing = 0.95 + + for idx, label in enumerate(order): + subset = data[data["modality_label"] == label].copy() + values = subset["n_subjects"].astype(float).dropna() + if len(values) < 3: + continue + + subset["dataset_url"] = subset["dataset"].apply(get_dataset_url) + log_vals = np.log10(values) + grid = np.linspace(log_vals.min() - 0.25, log_vals.max() + 0.25, 240) + kde = gaussian_kde(log_vals) + density = kde(grid) + if density.max() <= 0: + continue + + density_norm = density / density.max() + baseline = idx * row_spacing + y_curve = baseline + density_norm * amplitude + x_curve = 10**grid + + color = MODALITY_COLOR_MAP.get(label, "#6b7280") + fill = hex_to_rgba(color, 0.28) + + fig.add_trace( + go.Scatter( + 
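+                # Upper KDE outline followed by the reversed flat baseline,
+                # forming the closed polygon that fill="toself" shades below.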
x=np.concatenate([x_curve, x_curve[::-1]]), + y=np.concatenate([y_curve, np.full_like(y_curve, baseline)]), + name=label, + fill="toself", + fillcolor=fill, + line=dict(color="rgba(0,0,0,0)"), + hoverinfo="skip", + showlegend=False, + ) + ) + + fig.add_trace( + go.Scatter( + x=x_curve, + y=y_curve, + mode="lines", + name=label, + line=dict(color=color, width=2), + hovertemplate=f"{label}
#Participants: %{{x:.0f}}", + showlegend=False, + ) + ) + + jitter = rng.uniform(0.02, amplitude * 0.5, size=len(values)) + median_val = float(median_participants.get(label, np.nan)) + custom_data = np.column_stack( + [subset["dataset"].to_numpy(), subset["dataset_url"].to_numpy()] + ) + fig.add_trace( + go.Scatter( + x=values, + y=np.full_like(values, baseline) + jitter, + mode="markers", + name=label, + marker=dict(color=color, size=8, opacity=0.6), + customdata=custom_data, + hovertemplate="%{customdata[0]}
#Participants: %{x}
Click to view dataset details", + showlegend=False, + ) + ) + + if np.isfinite(median_val) and median_val > 0: + fig.add_trace( + go.Scatter( + x=[median_val, median_val], + y=[baseline, baseline + amplitude], + mode="lines", + line=dict(color=color, width=2, dash="dash"), + hovertemplate=( + f"{label}
Median participants: {median_val:.0f}" + ), + showlegend=False, + ) + ) + + if not fig.data: + return None + + kde_height = max(650, 150 * len(order)) + date_stamp = datetime.now().strftime("%d/%m/%Y") + fig.update_layout( + height=kde_height, + width=1200, + template="plotly_white", + xaxis=dict( + type="log", + title=dict(text="Number of Participants (Log Scale)", font=dict(size=18)), + showgrid=True, + gridcolor="rgba(0,0,0,0.08)", + zeroline=False, + dtick=1, + minor=dict(showgrid=True, gridcolor="rgba(0,0,0,0.04)"), + tickfont=dict(size=14), + ), + yaxis=dict( + title=dict(text="Modality", font=dict(size=18)), + tickmode="array", + tickvals=[idx * row_spacing for idx in range(len(order))], + ticktext=order, + showgrid=False, + range=[-0.25, max(0.35, (len(order) - 1) * row_spacing + amplitude + 0.25)], + tickfont=dict(size=14), + ), + showlegend=False, + margin=dict(l=120, r=40, t=108, b=80), + title=dict( + text=f"
Based on EEG-Dash datasets available at {date_stamp}.", + x=0.5, + xanchor="center", + y=0.98, + yanchor="top", + font=dict(size=20), + ), + autosize=True, + font=dict(size=16), + ) + + fig.add_annotation( + xref="paper", + yref="paper", + x=0.98, + y=0.02, + text="Visual studies consistently use the
largest sample sizes, typically 20-30 participants", + showarrow=False, + font=dict(size=14, color="#111827"), + bgcolor="rgba(255,255,255,0.9)", + bordercolor="rgba(17,24,39,0.3)", + borderwidth=1, + borderpad=8, + xanchor="right", + yanchor="bottom", + ) + + plot_config = { + "responsive": True, + "displaylogo": False, + "modeBarButtonsToRemove": ["lasso2d", "select2d"], + "toImageButtonOptions": { + "format": "png", + "filename": "participant_kde", + "height": kde_height, + "width": 1200, + "scale": 2, + }, + } + + fig_spec = fig.to_plotly_json() + data_json = json.dumps(fig_spec.get("data", []), cls=PlotlyJSONEncoder) + layout_json = json.dumps(fig_spec.get("layout", {}), cls=PlotlyJSONEncoder) + config_json = json.dumps(plot_config, cls=PlotlyJSONEncoder) + + styled_html = f""" + +
+<div>Loading participant distribution...</div>
+<div id="modality-ridgeline"></div>
+<script>Plotly.newPlot("modality-ridgeline", {data_json}, {layout_json}, {config_json});</script>
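+<!-- data/layout/config are serialized with PlotlyJSONEncoder so numpy values survive the round trip to JSON -->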
+ +""" + + out_path = Path(out_html) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(styled_html, encoding="utf-8") + return out_path + + +def _read_dataset(path: Path) -> pd.DataFrame: + return pd.read_csv(path, index_col=False, header=0, skipinitialspace=True) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + description="Generate the modality ridgeline plot from a dataset summary CSV." + ) + parser.add_argument("source", type=Path, help="Path to dataset summary CSV") + parser.add_argument( + "--output", + type=Path, + default=Path("dataset_kde_modalities.html"), + help="Output HTML file", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed controlling jitter placement", + ) + args = parser.parse_args() + + df = _read_dataset(args.source) + output_path = generate_modality_ridgeline(df, args.output, rng_seed=args.seed) + if output_path is None: + print("Ridgeline plot could not be generated (insufficient data).") + else: + print(f"Ridgeline plot saved to {output_path.resolve()}") + + +if __name__ == "__main__": + main() diff --git a/docs/plot_dataset/utils.py b/docs/plot_dataset/utils.py new file mode 100644 index 00000000..2a518d69 --- /dev/null +++ b/docs/plot_dataset/utils.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pandas as pd + +try: # Allow import both as package and script + from .colours import CANONICAL_MAP, MODALITY_COLOR_MAP +except ImportError: # pragma: no cover - fallback for direct script execution + from colours import CANONICAL_MAP, MODALITY_COLOR_MAP # type: ignore + +__all__ = [ + "get_dataset_url", + "human_readable_size", + "primary_modality", + "safe_int", +] + +_SEPARATORS = ("/", "|", ";") + + +def primary_modality(value: Any) -> str: + """Return the canonical modality label for a record.""" + if value is None: + return "Unknown" + if isinstance(value, float) and pd.isna(value): + return "Unknown" + + text = str(value).strip() + if not text: + return "Unknown" + + # normalise separators, keep order of appearance + for sep in _SEPARATORS: + text = text.replace(sep, ",") + tokens = [tok.strip() for tok in text.split(",") if tok.strip()] + if not tokens: + return "Unknown" + + first = tokens[0] + canonical_map = CANONICAL_MAP.get("modality of exp", {}) + lowered = first.lower() + canonical = canonical_map.get(lowered) + if canonical: + return canonical + + if first in MODALITY_COLOR_MAP: + return first + + title_variant = first.title() + if title_variant in MODALITY_COLOR_MAP: + return title_variant + + return "Other" + + +def safe_int(value: Any, default: int | None = None) -> int | None: + """Convert *value* to ``int`` when possible; otherwise return *default*.""" + try: + if value is None or (isinstance(value, float) and pd.isna(value)): + return default + return int(round(float(value))) + except Exception: + return default + + +def human_readable_size(num_bytes: int | float | None) -> str: + """Format bytes using the closest unit among MB, GB, TB (fallback to KB/B).""" + if num_bytes is None: + return "0 B" + + try: + size = float(num_bytes) + except Exception: + return "0 B" + + units = [ + (1024**4, "TB"), + (1024**3, "GB"), + (1024**2, "MB"), + (1024**1, "KB"), + (1, "B"), + ] + + for factor, unit in units: + if size >= factor: + value = size / factor + if unit in {"B", "KB"}: + return f"{int(round(value))} {unit}" + return f"{value:.2f} {unit}" + return "0 B" + + +def get_dataset_url(name: str) 
-> str: + """Generate dataset URL for plots (relative to dataset summary page).""" + if name is None or (isinstance(name, float) and pd.isna(name)): + return "" + text = str(name).strip() + if not text: + return "" + return f"api/dataset/eegdash.dataset.{text.upper()}.html" + + +def ensure_directory(path: str | Path) -> Path: + """Create *path* directory if required and return ``Path`` instance.""" + dest = Path(path) + dest.mkdir(parents=True, exist_ok=True) + return dest diff --git a/docs/prepare_summary_tables.py b/docs/prepare_summary_tables.py index fd904296..033824f3 100644 --- a/docs/prepare_summary_tables.py +++ b/docs/prepare_summary_tables.py @@ -1,384 +1,32 @@ import glob -import json +import textwrap from argparse import ArgumentParser +from datetime import datetime from pathlib import Path from shutil import copyfile import numpy as np import pandas as pd -import plotly.express as px -import plotly.graph_objects as go -from plotly.utils import PlotlyJSONEncoder -from scipy.stats import gaussian_kde +from plot_dataset import ( + generate_dataset_bubble, + generate_dataset_sankey, + generate_modality_ridgeline, +) +from plot_dataset.utils import get_dataset_url, human_readable_size from table_tag_utils import wrap_tags DOCS_DIR = Path(__file__).resolve().parent STATIC_DATASET_DIR = DOCS_DIR / "source" / "_static" / "dataset_generated" -MODALITY_CANONICAL = { - "visual": "Visual", - "auditory": "Auditory", - "tactile": "Tactile", - "somatosensory": "Tactile", - "multisensory": "Multisensory", - "motor": "Motor", - "rest": "Resting State", - "resting state": "Resting State", - "resting-state": "Resting State", - "sleep": "Sleep", - "other": "Other", -} - -MODALITY_COLOR_MAP = { - "Visual": "#2563eb", - "Auditory": "#0ea5e9", - "Tactile": "#10b981", - "Multisensory": "#ec4899", - "Motor": "#f59e0b", - "Resting State": "#6366f1", - "Sleep": "#7c3aed", - "Other": "#14b8a6", - "Unknown": "#94a3b8", -} - - -def _hex_to_rgba(hex_color: str, alpha: float = 0.4) -> str: - hex_color = hex_color.lstrip("#") - if len(hex_color) != 6: - return f"rgba(99, 102, 241, {alpha})" - r = int(hex_color[0:2], 16) - g = int(hex_color[2:4], 16) - b = int(hex_color[4:6], 16) - return f"rgba({r}, {g}, {b}, {alpha})" - - -def _primary_modality(value: object) -> str: - if value is None: - return "Unknown" - if isinstance(value, float) and pd.isna(value): - return "Unknown" - text = str(value).strip() - if not text: - return "Unknown" - for sep in ("/", "|", ";"): - text = text.replace(sep, ",") - tokens = [tok.strip() for tok in text.split(",") if tok.strip()] - if not tokens: - return "Unknown" - raw = tokens[0].lower() - canonical = MODALITY_CANONICAL.get(raw) - if canonical: - return canonical - candidate = tokens[0].strip() - title_candidate = candidate.title() - if title_candidate in MODALITY_COLOR_MAP: - return title_candidate - return "Other" - - -def _to_numeric_median_list(val) -> float | None: - """Return a numeric value from possible list-like strings. 
- - Examples - -------- - - "64" -> 64 - - "6,129" -> median -> 67.5 -> 68 - - "128, 512" -> 320 - - 500.0 -> 500 - - """ - if pd.isna(val): - return None - try: - # already numeric - return float(val) - except Exception: - pass - s = str(val).strip().strip("[]") - if not s: - return None - try: - nums = [float(x) for x in s.split(",") if str(x).strip()] - if not nums: - return None - return float(np.median(nums)) - except Exception: - return None - - -def _safe_int(x, default=None): - try: - if x is None or pd.isna(x): - return default - return int(round(float(x))) - except Exception: - return default - - -def gen_datasets_bubble( - df: pd.DataFrame, - out_html: str = "_static/dataset/dataset_bubble.html", - x_var: str = "records", # one of: 'records', 'duration_h', 'size_gb', 'tasks' -): - """Generate an interactive bubble chart for datasets. - - - x: total duration (hours) - - y: number of subjects - - size: on-disk size (GB) - - color: dataset modality - """ - d = df.copy() - d = d[d["dataset"].str.lower() != "test"] - - # numeric columns - d["duration_h"] = pd.to_numeric(d.get("duration_hours_total"), errors="coerce") - d["subjects"] = pd.to_numeric(d.get("n_subjects"), errors="coerce") - d["records"] = pd.to_numeric(d.get("n_records"), errors="coerce") - d["tasks"] = pd.to_numeric(d.get("n_tasks"), errors="coerce") - d["size_bytes"] = pd.to_numeric(d.get("size_bytes"), errors="coerce") - - # parse sampling and channels into representative numeric values - d["sfreq"] = d["sampling_freqs"].map(_to_numeric_median_list) - d["nchans"] = d["nchans_set"].map(_to_numeric_median_list) - - d["modality_label"] = d.get("modality of exp").apply(_primary_modality) - - # disk size in GB for sizing - GB = 1024**3 - d["size_gb"] = d["size_bytes"] / GB - - # hover content - def _fmt_size(bytes_): - return human_readable_size(_safe_int(bytes_, 0)) - - # choose x axis field and labels - x_field = ( - x_var if x_var in {"records", "duration_h", "size_gb", "tasks"} else "records" - ) - x_label = { - "records": "#Records", - "duration_h": "Duration (hours)", - "size_gb": "Size (GB)", - "tasks": "#Tasks", - }[x_field] - - # hover text adapts to x - if x_field == "duration_h": - x_hover = "Duration: %{x:.2f} h" - elif x_field == "size_gb": - x_hover = "Size: %{x:.2f} GB" - elif x_field == "tasks": - x_hover = "Tasks: %{x:,}" - else: - x_hover = "Records (x): %{x:,}" - - hover = ( - "%{customdata[0]}" # dataset id - "
Subjects: %{y:,}" - f"
{x_hover}" - "
Records: %{customdata[1]:,}" - "
Tasks: %{customdata[2]:,}" - "
Channels: %{customdata[3]}" - "
Sampling: %{customdata[4]} Hz" - "
Size: %{customdata[5]}" - "
Modality: %{customdata[6]}" - "" - ) - - d = d.dropna(subset=["duration_h", "subjects", "size_gb"]) # need these - - # Marker sizing: scale into a good visual range - max_size = max(d["size_gb"].max(), 1) - sizeref = (2.0 * max_size) / (40.0**2) # target ~40px max marker - - # Prepare prettified strings for hover - def _fmt_int(v): - if v is None or pd.isna(v): - return "" - try: - return str(int(round(float(v)))) - except Exception: - return str(v) - - sfreq_str = d["sfreq"].map(_fmt_int) - nchans_str = d["nchans"].map(_fmt_int) - - fig = px.scatter( - d, - x=x_field, - y="subjects", - size="size_gb", - color="modality_label", - hover_name="dataset", - custom_data=[ - d["dataset"], - d["records"], - d["tasks"], - nchans_str, - sfreq_str, - d["size_bytes"].map(_fmt_size), - d["modality_label"], - ], - size_max=40, - labels={ - "subjects": "#Subjects", - "modality_label": "Modality", - x_field: x_label, - }, - color_discrete_map=MODALITY_COLOR_MAP, - title="", - category_orders={ - "modality_label": [ - label - for label in MODALITY_COLOR_MAP.keys() - if label in d["modality_label"].unique() - ] - }, - ) - - # tune marker sizing explicitly for better control - for tr in fig.data: - tr.marker.update( - sizemin=6, - sizemode="area", - sizeref=sizeref, - line=dict(width=0.6, color="rgba(0,0,0,0.3)"), - ) - tr.hovertemplate = hover - - fig.update_layout( - height=750, - width=1200, # Set explicit width for consistent sizing - margin=dict(l=60, r=40, t=80, b=60), - template="plotly_white", - legend=dict( - title="Modality", - orientation="h", - yanchor="bottom", - y=1.02, - xanchor="right", - x=0.99, - ), - font=dict( - family="Inter, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif", - size=14, - ), - title=dict( - text="", - x=0.01, - xanchor="left", - y=0.98, - yanchor="top", - pad=dict(t=10, b=8), - ), - autosize=True, # Enable auto-sizing to fill container - ) - - fig.update_xaxes(showgrid=True, gridcolor="rgba(0,0,0,0.12)", zeroline=False) - fig.update_yaxes(showgrid=True, gridcolor="rgba(0,0,0,0.12)", zeroline=False) - - out_path = Path(out_html) - out_path.parent.mkdir(parents=True, exist_ok=True) - # Add CSS and loading indicator for immediate proper sizing - html_content = fig.to_html( - full_html=False, - include_plotlyjs=False, - div_id="dataset-bubble", - config={ - "responsive": True, - "displaylogo": False, - "modeBarButtonsToRemove": ["lasso2d", "select2d"], - "toImageButtonOptions": { - "format": "png", - "filename": "dataset_landscape", - "height": 750, - "width": 1200, - "scale": 2, - }, - }, - ) - - # Wrap with styling to ensure proper initial sizing - styled_html = f""" - -
-<div>Loading dataset landscape...</div>
-{html_content} - -""" - - with open(str(out_path), "w", encoding="utf-8") as f: - f.write(styled_html) - return str(out_path) - - -def human_readable_size(num_bytes: int) -> str: - """Format bytes using the closest unit among MB, GB, TB (fallback to KB/B). - - Chooses the largest unit such that the value is >= 1. Uses base 1024. - """ - if num_bytes is None: - return "0 B" - size = float(num_bytes) - units = [ - (1024**4, "TB"), - (1024**3, "GB"), - (1024**2, "MB"), - (1024**1, "KB"), - (1, "B"), - ] - for factor, unit in units: - if size >= factor: - value = size / factor - # Use no decimals for B/KB; two decimals otherwise - if unit in ("B", "KB"): - return f"{int(round(value))} {unit}" - return f"{value:.2f} {unit}" - return "0 B" - - -def get_dataset_url(name: str) -> str: - """Generate dataset URL for plots (relative to dataset summary page).""" - name = name.strip() - return f"api/dataset/eegdash.dataset.{name.upper()}.html" - - def wrap_dataset_name(name: str): # Remove any surrounding whitespace name = name.strip() # Link to the individual dataset API page # Updated structure: api/dataset/eegdash.dataset..html - url = f"api/dataset/eegdash.dataset.{name.upper()}.html" + url = get_dataset_url(name) + if not url: + return name.upper() return f'{name.upper()}' @@ -406,6 +54,163 @@ def wrap_dataset_name(name: str): }, } +DATA_TABLE_TEMPLATE = textwrap.dedent( + r""" + + + + + + + + + + + + + + + + + + +""" +) + def _tag_normalizer(kind: str): canonical = {k.lower(): v for k, v in DATASET_CANONICAL_MAP.get(kind, {}).items()} @@ -517,10 +322,21 @@ def main(source_dir: str, target_dir: str): f, index_col=False, header=0, skipinitialspace=True ) # , sep=";") # Generate bubble chart from the raw data to have access to size_bytes - # Use x-axis as number of records for better spread bubble_path = target_dir / "dataset_bubble.html" - gen_datasets_bubble(df_raw, str(bubble_path), x_var="records") - copyfile(bubble_path, STATIC_DATASET_DIR / bubble_path.name) + bubble_output = generate_dataset_bubble( + df_raw, + bubble_path, + x_var="subjects", + ) + copyfile(bubble_output, STATIC_DATASET_DIR / bubble_output.name) + + # Generate Sankey diagram showing dataset flow across categories + try: + sankey_path = target_dir / "dataset_sankey.html" + sankey_output = generate_dataset_sankey(df_raw, sankey_path) + copyfile(sankey_output, STATIC_DATASET_DIR / sankey_output.name) + except Exception as exc: + print(f"[dataset Sankey] Skipped due to error: {exc}") df = prepare_table(df_raw) # preserve int values @@ -568,239 +384,18 @@ def main(source_dir: str, target_dir: str): escape=False, table_id="datasets-table", ) + html_table = DATA_TABLE_TEMPLATE.replace("", html_table) table_path = target_dir / "dataset_summary_table.html" - with open(table_path, "+w", encoding="utf-8") as f: + with open(table_path, "w", encoding="utf-8") as f: f.write(html_table) copyfile(table_path, STATIC_DATASET_DIR / table_path.name) # Generate KDE ridgeline plot for modality participant distributions try: - d_modal = df_raw[df_raw["dataset"].str.lower() != "test"].copy() - d_modal["modality_label"] = d_modal["modality of exp"].apply( - _primary_modality - ) - d_modal["n_subjects"] = pd.to_numeric( - d_modal["n_subjects"], errors="coerce" - ) - d_modal = d_modal.dropna(subset=["n_subjects"]) - - fig_kde = go.Figure() - order = [ - label - for label in MODALITY_COLOR_MAP - if label in d_modal["modality_label"].unique() - ] - rng = np.random.default_rng(42) - - for idx, label in enumerate(order): - subset = 
d_modal[d_modal["modality_label"] == label].copy() - vals = subset["n_subjects"].astype(float).dropna() - if len(vals) < 3: - continue - # Generate URLs for datasets in this modality - subset["dataset_url"] = subset["dataset"].apply(get_dataset_url) - log_vals = np.log10(vals) - grid = np.linspace(log_vals.min() - 0.25, log_vals.max() + 0.25, 240) - kde = gaussian_kde(log_vals) - density = kde(grid) - if density.max() <= 0: - continue - density_norm = density / density.max() - amplitude = 0.6 - baseline = idx * 1.1 - y_curve = baseline + density_norm * amplitude - x_curve = 10**grid - - color = MODALITY_COLOR_MAP.get(label, "#6b7280") - fill = _hex_to_rgba(color, 0.28) - - fig_kde.add_trace( - go.Scatter( - x=np.concatenate([x_curve, x_curve[::-1]]), - y=np.concatenate([y_curve, np.full_like(y_curve, baseline)]), - name=label, - fill="toself", - fillcolor=fill, - line=dict(color="rgba(0,0,0,0)"), - hoverinfo="skip", - showlegend=False, - ) - ) - - fig_kde.add_trace( - go.Scatter( - x=x_curve, - y=y_curve, - mode="lines", - name=label, - line=dict(color=color, width=2), - hovertemplate=f"{label}
#Participants: %{{x:.0f}}", - ) - ) - - jitter = rng.uniform(0.02, amplitude * 0.5, size=len(vals)) - # Prepare custom data with dataset names and URLs - custom_data = np.column_stack( - [subset["dataset"].to_numpy(), subset["dataset_url"].to_numpy()] - ) - fig_kde.add_trace( - go.Scatter( - x=vals, - y=np.full_like(vals, baseline) + jitter, - mode="markers", - name=label, - marker=dict(color=color, size=5, opacity=0.6), - customdata=custom_data, - hovertemplate="%{customdata[0]}
#Participants: %{x}
Click to view dataset details", - showlegend=False, - ) - ) - - if fig_kde.data: - fig_kde.update_layout( - height=max(650, 150 * len(order)), - width=1200, # Set explicit width for consistent sizing - template="plotly_white", - xaxis=dict( - type="log", - title="#Participants", - showgrid=True, - gridcolor="rgba(0,0,0,0.12)", - zeroline=False, - ), - yaxis=dict( - title="Modality", - tickmode="array", - tickvals=[idx * 1.1 for idx in range(len(order))], - ticktext=order, - showgrid=False, - range=[-0.3, max(0.3, (len(order) - 1) * 1.1 + 0.9)], - ), - legend=dict( - title="Modality", - orientation="h", - yanchor="bottom", - y=1.02, - xanchor="right", - x=0.99, - ), - margin=dict(l=120, r=40, t=80, b=80), - title=dict( - text="", - x=0.01, - xanchor="left", - y=0.98, - yanchor="top", - ), - autosize=True, # Enable auto-sizing to fill container - ) - # Add CSS and loading indicator for immediate proper sizing - kde_height = max(650, 150 * len(order)) - plot_config = { - "responsive": True, - "displaylogo": False, - "modeBarButtonsToRemove": ["lasso2d", "select2d"], - "toImageButtonOptions": { - "format": "png", - "filename": "participant_kde", - "height": kde_height, - "width": 1200, - "scale": 2, - }, - } - fig_spec = fig_kde.to_plotly_json() - data_json = json.dumps(fig_spec.get("data", []), cls=PlotlyJSONEncoder) - layout_json = json.dumps( - fig_spec.get("layout", {}), cls=PlotlyJSONEncoder - ) - config_json = json.dumps(plot_config, cls=PlotlyJSONEncoder) - - # Wrap with styling to ensure proper initial sizing and defer Plotly rendering - styled_html = f""" - -
-<div>Loading participant distribution...</div>
-<div id="modality-ridgeline"></div>
-<script>Plotly.newPlot("modality-ridgeline", {data_json}, {layout_json}, {config_json});</script>
- -""" - - kde_path = Path(target_dir) / "dataset_kde_modalities.html" - with open(kde_path, "w", encoding="utf-8") as f: - f.write(styled_html) - copyfile(kde_path, STATIC_DATASET_DIR / kde_path.name) + kde_path = target_dir / "dataset_kde_modalities.html" + kde_output = generate_modality_ridgeline(df_raw, kde_path) + if kde_output: + copyfile(kde_output, STATIC_DATASET_DIR / kde_output.name) except Exception as exc: print(f"[dataset KDE] Skipped due to error: {exc}") diff --git a/docs/source/dataset_summary.rst b/docs/source/dataset_summary.rst index b4b607ff..c3ef7a33 100644 --- a/docs/source/dataset_summary.rst +++ b/docs/source/dataset_summary.rst @@ -10,13 +10,11 @@ .. rst-class:: dataset-summary-article -Datasets -========= +Datasets Catalog +================ To leverage recent and ongoing advancements in large-scale computational methods and to ensure the preservation of scientific data generated from publicly funded research, the EEG-DaSh data archive will create a data-sharing resource for MEEG (EEG, MEG) data contributed by collaborators for machine learning (ML) and deep learning (DL) applications. -The archive is currently still in :bdg-danger:`beta testing` mode, so be kind. - .. raw:: html @@ -27,10 +25,16 @@ The archive is currently still in :bdg-danger:`beta testing` mode, so be kind. .. include:: dataset_summary/table.rst - .. tab-item:: Participant KDE + .. tab-item:: Participant Distribution .. include:: dataset_summary/kde.rst - .. tab-item:: Landscape + .. tab-item:: Dataset Flow + + .. include:: dataset_summary/sankey.rst + + .. tab-item:: Scatter of Sample Size vs. Recording Duration .. include:: dataset_summary/bubble.rst + +The archive is currently still in :bdg-danger:`beta testing` mode, so be kind. diff --git a/docs/source/dataset_summary/bubble.rst b/docs/source/dataset_summary/bubble.rst index 83e83179..6de57ebf 100644 --- a/docs/source/dataset_summary/bubble.rst +++ b/docs/source/dataset_summary/bubble.rst @@ -1,3 +1,5 @@ +.. title:: Dataset landscape + .. rubric:: Dataset landscape .. raw:: html diff --git a/docs/source/dataset_summary/kde.rst b/docs/source/dataset_summary/kde.rst index 906a4c6f..e2f66ddf 100644 --- a/docs/source/dataset_summary/kde.rst +++ b/docs/source/dataset_summary/kde.rst @@ -1,4 +1,6 @@ -.. rubric:: Participant Distribution by Modality +.. title:: Participant Distribution by Modality + +.. rubric:: Distribution of Sample Sizes Varies by Experimental Modality .. raw:: html diff --git a/docs/source/dataset_summary/sankey.rst b/docs/source/dataset_summary/sankey.rst new file mode 100644 index 00000000..66304778 --- /dev/null +++ b/docs/source/dataset_summary/sankey.rst @@ -0,0 +1,20 @@ +.. title:: Dataset flow + +.. rubric:: Sankey diagrams of EEGDash Datasets by Population, Modality, and Cognitive Domain + +.. raw:: html + +
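+   <!-- Wrapper for the Sankey markup included from _static/dataset_generated/dataset_sankey.html -->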
+   <div>
+
+.. raw:: html
+   :file: ../_static/dataset_generated/dataset_sankey.html
+
+.. raw:: html
+
+   <div>
+   Figure: Dataset flow across population, modality, and cognitive domain.
+   Link thickness is proportional to the total number of subjects, and the
+   tooltip reports both subject and dataset counts. Hover over nodes and
+   links to explore specific segments.
+   </div>
+   </div>
diff --git a/docs/source/dataset_summary/table.rst b/docs/source/dataset_summary/table.rst index 3891d906..b409b575 100644 --- a/docs/source/dataset_summary/table.rst +++ b/docs/source/dataset_summary/table.rst @@ -1,3 +1,5 @@ +.. title:: EEG Datasets Table + .. rubric:: EEG Datasets Table The data in EEG-DaSh originates from a collaboration involving 25 laboratories, encompassing 27,053 participants. This extensive collection includes M-EEG data, which is a combination of EEG and MEG signals. The data is sourced from various studies conducted by these labs, @@ -22,153 +24,4 @@ In addition, EEG-DaSh will incorporate a subset of the data converted from `NEMA -Pathology, modality, and dataset type now surface as consistent color-coded tags so you can scan the table at a glance and reuse the same visual language as the model catalog. - -.. raw:: html - - - - - - - - - - - - - - - - - +Pathology, modality, and dataset type now surface as consistent color-coded tags so you can scan the table at a glance. diff --git a/docs/source/index.rst b/docs/source/index.rst index 01b8e41d..632007c6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,11 +20,14 @@ EEG Dash Homepage .. rst-class:: h4 text-center font-weight-light my-4 - The EEG-DaSh data archive will establish a data-sharing resource for MEEG (EEG, MEG) data, enabling large-scale computational advancements to preserve and share scientific data from publicly funded research for machine learning and deep learning applications. +.. rst-class:: text-center + +**Note:** The "DaSh" in EEG-DaSh stands for **Data Share**. + The EEG-DaSh data archive is a collaborative effort led by the University of California, San Diego (UCSD) and Ben-Gurion University of the Negev (BGU) and partially funded by the National Science Foundation (NSF). All are welcome to contribute to the https://github.com/sccn/EEGDash project. The archive is currently still in :bdg-danger:`beta testing` mode, so be kind. diff --git a/eegdash/dataset/dataset_summary.csv b/eegdash/dataset/dataset_summary.csv index 609fa2d3..74781587 100644 --- a/eegdash/dataset/dataset_summary.csv +++ b/eegdash/dataset/dataset_summary.csv @@ -198,7 +198,7 @@ 197,ds003751,38,38,1,128,250,19.95,4.71 GB,5057922307,0,ds003751,Healthy,other,Multisensory,Affect 198,ds003421,80,20,1,257,1000,11.604,76.77 GB,82433418198,0,ds003421,Healthy,10-20,Multisensory,Decision-making 199,ds002158,117,20,1,,,0.0,428.59 GB,460190030981,0,ds002158,Healthy,10-20,Visual,Affect -200,ds004951,23,11,1,63,1000,29.563,22.00 GB,23627352274,0,ds004951,?,,Tactile,Learning +200,ds004951,23,11,1,63,1000,29.563,22.00 GB,23627352274,0,ds004951,,,Tactile,Learning 201,ds004802,38,38,1,65,"2048,512",0.0,29.34 GB,31504070800,0,ds004802,Other,,Visual,Affect 202,ds004816,20,20,1,63,1000,0.0,23.31 GB,25028989553,0,ds004816,Healthy,,Visual,Attention 203,ds005873,2850,125,1,2,256,11935.09,117.21 GB,125851664268,0,,,,,