From cfdeb138972fa5fed125c2f4d8e3a3d665ff7687 Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 17:17:40 +0200 Subject: [PATCH 01/30] puting the log scale --- docs/prepare_summary_tables.py | 110 +++++++++++++++++++++----- docs/source/dataset_summary/table.rst | 2 + 2 files changed, 91 insertions(+), 21 deletions(-) diff --git a/docs/prepare_summary_tables.py b/docs/prepare_summary_tables.py index fd904296..c4dda9cd 100644 --- a/docs/prepare_summary_tables.py +++ b/docs/prepare_summary_tables.py @@ -162,6 +162,7 @@ def _fmt_size(bytes_): "size_gb": "Size (GB)", "tasks": "#Tasks", }[x_field] + x_label = f"{x_label} (log scale)" # hover text adapts to x if x_field == "duration_h": @@ -183,14 +184,32 @@ def _fmt_size(bytes_): "
Sampling: %{customdata[4]} Hz" "
Size: %{customdata[5]}" "
Modality: %{customdata[6]}" + "
Click bubble to open dataset page" "" ) - d = d.dropna(subset=["duration_h", "subjects", "size_gb"]) # need these + required_columns = {"subjects", "size_gb", x_field} + d = d.replace([np.inf, -np.inf], np.nan) + d = d.dropna(subset=list(required_columns)) + d = d[(d["subjects"] > 0) & (d[x_field] > 0)] + + d["dataset_url"] = d["dataset"].apply(get_dataset_url) + + if d.empty: + out_path = Path(out_html) + out_path.parent.mkdir(parents=True, exist_ok=True) + no_data_html = """ +
No dataset records available for plotting.
+""" + with open(str(out_path), "w", encoding="utf-8") as f: + f.write(no_data_html) + return str(out_path) # Marker sizing: scale into a good visual range - max_size = max(d["size_gb"].max(), 1) - sizeref = (2.0 * max_size) / (40.0**2) # target ~40px max marker + size_max = d["size_gb"].max() + if not np.isfinite(size_max) or size_max <= 0: + size_max = 1.0 + sizeref = (2.0 * size_max) / (40.0**2) # target ~40px max marker # Prepare prettified strings for hover def _fmt_int(v): @@ -219,10 +238,11 @@ def _fmt_int(v): sfreq_str, d["size_bytes"].map(_fmt_size), d["modality_label"], + d["dataset_url"], ], size_max=40, labels={ - "subjects": "#Subjects", + "subjects": "#Subjects (log scale)", "modality_label": "Modality", x_field: x_label, }, @@ -235,6 +255,8 @@ def _fmt_int(v): if label in d["modality_label"].unique() ] }, + log_x=True, + log_y=True, ) # tune marker sizing explicitly for better control @@ -247,9 +269,12 @@ def _fmt_int(v): ) tr.hovertemplate = hover + plot_width = 1280 + plot_height = 720 + fig.update_layout( - height=750, - width=1200, # Set explicit width for consistent sizing + height=plot_height, + width=plot_width, # Landscape orientation margin=dict(l=60, r=40, t=80, b=60), template="plotly_white", legend=dict( @@ -275,8 +300,20 @@ def _fmt_int(v): autosize=True, # Enable auto-sizing to fill container ) - fig.update_xaxes(showgrid=True, gridcolor="rgba(0,0,0,0.12)", zeroline=False) - fig.update_yaxes(showgrid=True, gridcolor="rgba(0,0,0,0.12)", zeroline=False) + fig.update_xaxes( + showgrid=True, + gridcolor="rgba(0,0,0,0.12)", + zeroline=False, + type="log", + dtick=1, + ) + fig.update_yaxes( + showgrid=True, + gridcolor="rgba(0,0,0,0.12)", + zeroline=False, + type="log", + dtick=1, + ) out_path = Path(out_html) out_path.parent.mkdir(parents=True, exist_ok=True) @@ -292,8 +329,8 @@ def _fmt_int(v): "toImageButtonOptions": { "format": "png", "filename": "dataset_landscape", - "height": 750, - "width": 1200, + "height": plot_height, + "width": plot_width, "scale": 2, }, }, @@ -304,9 +341,9 @@ def _fmt_int(v): +
Loading dataset landscape...
+{html_content} + +""" + + out_path.write_text(styled_html, encoding="utf-8") + return out_path + + +def _read_dataset(path: Path) -> pd.DataFrame: + return pd.read_csv(path, index_col=False, header=0, skipinitialspace=True) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser(description="Generate the dataset bubble chart.") + parser.add_argument("source", type=Path, help="Path to dataset summary CSV") + parser.add_argument( + "--output", + type=Path, + default=Path("dataset_bubble.html"), + help="Output HTML file", + ) + parser.add_argument( + "--x-axis", + choices=["records", "duration_h", "size_gb", "tasks", "subjects"], + default="records", + help="Field for the bubble chart x-axis", + ) + args = parser.parse_args() + + df = _read_dataset(args.source) + output_path = generate_dataset_bubble(df, args.output, x_var=args.x_axis) + print(f"Bubble chart saved to {output_path.resolve()}") + + +if __name__ == "__main__": + main() diff --git a/docs/plot_dataset/plot_sankey.py b/docs/plot_dataset/plot_sankey.py index e201e4ee..f33b6534 100644 --- a/docs/plot_dataset/plot_sankey.py +++ b/docs/plot_dataset/plot_sankey.py @@ -15,7 +15,11 @@ import pandas as pd import plotly.graph_objects as go -from colours import CANONICAL_MAP, COLUMN_COLOR_MAPS, hex_to_rgba + +try: # Support execution as a script or as a package module + from .colours import CANONICAL_MAP, COLUMN_COLOR_MAPS, hex_to_rgba +except ImportError: # pragma: no cover - fallback for direct script execution + from colours import CANONICAL_MAP, COLUMN_COLOR_MAPS, hex_to_rgba DEFAULT_COLUMNS = ["Type Subject", "modality of exp", "type of exp"] diff --git a/docs/plot_dataset/ridgeline.py b/docs/plot_dataset/ridgeline.py new file mode 100644 index 00000000..34d5a83f --- /dev/null +++ b/docs/plot_dataset/ridgeline.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from plotly.utils import PlotlyJSONEncoder +from scipy.stats import gaussian_kde + +try: # Allow execution as a script or module + from .colours import MODALITY_COLOR_MAP, hex_to_rgba + from .utils import get_dataset_url, primary_modality +except ImportError: # pragma: no cover - fallback for direct script execution + from colours import MODALITY_COLOR_MAP, hex_to_rgba # type: ignore + from utils import get_dataset_url, primary_modality # type: ignore + +__all__ = ["generate_modality_ridgeline"] + + +def generate_modality_ridgeline( + df: pd.DataFrame, + out_html: str | Path, + *, + rng_seed: int = 42, +) -> Path | None: + """Generate a ridgeline (KDE) plot showing participants per modality.""" + data = df[df["dataset"].str.lower() != "test"].copy() + data["modality_label"] = data["modality of exp"].apply(primary_modality) + data["n_subjects"] = pd.to_numeric(data["n_subjects"], errors="coerce") + data = data.dropna(subset=["n_subjects"]) + data = data[data["modality_label"] != "Other"] + + if data.empty: + return None + + median_participants = ( + data.groupby("modality_label")["n_subjects"].median().sort_values() + ) + order = [ + label + for label in median_participants.index + if label in data["modality_label"].unique() + ] + if not order: + return None + + fig = go.Figure() + rng = np.random.default_rng(rng_seed) + amplitude = 0.6 + row_spacing = 0.95 + + for idx, label in enumerate(order): + subset = data[data["modality_label"] == label].copy() + values = subset["n_subjects"].astype(float).dropna() + if len(values) < 3: + continue + + subset["dataset_url"] = subset["dataset"].apply(get_dataset_url) + log_vals = np.log10(values) + grid = np.linspace(log_vals.min() - 0.25, log_vals.max() + 0.25, 240) + kde = gaussian_kde(log_vals) + density = kde(grid) + if density.max() <= 0: + continue + + density_norm = density / density.max() + baseline = idx * row_spacing + y_curve = baseline + density_norm * amplitude + x_curve = 10**grid + + color = MODALITY_COLOR_MAP.get(label, "#6b7280") + fill = hex_to_rgba(color, 0.28) + + fig.add_trace( + go.Scatter( + x=np.concatenate([x_curve, x_curve[::-1]]), + y=np.concatenate([y_curve, np.full_like(y_curve, baseline)]), + name=label, + fill="toself", + fillcolor=fill, + line=dict(color="rgba(0,0,0,0)"), + hoverinfo="skip", + showlegend=False, + ) + ) + + fig.add_trace( + go.Scatter( + x=x_curve, + y=y_curve, + mode="lines", + name=label, + line=dict(color=color, width=2), + hovertemplate=f"{label}
#Participants: %{{x:.0f}}", + showlegend=False, + ) + ) + + jitter = rng.uniform(0.02, amplitude * 0.5, size=len(values)) + median_val = float(median_participants.get(label, np.nan)) + custom_data = np.column_stack( + [subset["dataset"].to_numpy(), subset["dataset_url"].to_numpy()] + ) + fig.add_trace( + go.Scatter( + x=values, + y=np.full_like(values, baseline) + jitter, + mode="markers", + name=label, + marker=dict(color=color, size=8, opacity=0.6), + customdata=custom_data, + hovertemplate="%{customdata[0]}
#Participants: %{x}
Click to view dataset details", + showlegend=False, + ) + ) + + if np.isfinite(median_val) and median_val > 0: + fig.add_trace( + go.Scatter( + x=[median_val, median_val], + y=[baseline, baseline + amplitude], + mode="lines", + line=dict(color=color, width=2, dash="dash"), + hovertemplate=( + f"{label}
Median participants: {median_val:.0f}" + ), + showlegend=False, + ) + ) + + if not fig.data: + return None + + kde_height = max(650, 150 * len(order)) + date_stamp = datetime.now().strftime("%d/%m/%Y") + fig.update_layout( + height=kde_height, + width=1200, + template="plotly_white", + xaxis=dict( + type="log", + title=dict(text="Number of Participants (Log Scale)", font=dict(size=18)), + showgrid=True, + gridcolor="rgba(0,0,0,0.08)", + zeroline=False, + dtick=1, + minor=dict(showgrid=True, gridcolor="rgba(0,0,0,0.04)"), + tickfont=dict(size=14), + ), + yaxis=dict( + title=dict(text="Modality", font=dict(size=18)), + tickmode="array", + tickvals=[idx * row_spacing for idx in range(len(order))], + ticktext=order, + showgrid=False, + range=[-0.25, max(0.35, (len(order) - 1) * row_spacing + amplitude + 0.25)], + tickfont=dict(size=14), + ), + showlegend=False, + margin=dict(l=120, r=40, t=108, b=80), + title=dict( + text=f"
Based on EEG-Dash datasets available at {date_stamp}.", + x=0.5, + xanchor="center", + y=0.98, + yanchor="top", + font=dict(size=20), + ), + autosize=True, + font=dict(size=16), + ) + + fig.add_annotation( + xref="paper", + yref="paper", + x=0.98, + y=0.02, + text="Visual studies consistently use the
largest sample sizes, typically 20-30 participants", + showarrow=False, + font=dict(size=14, color="#111827"), + bgcolor="rgba(255,255,255,0.9)", + bordercolor="rgba(17,24,39,0.3)", + borderwidth=1, + borderpad=8, + xanchor="right", + yanchor="bottom", + ) + + plot_config = { + "responsive": True, + "displaylogo": False, + "modeBarButtonsToRemove": ["lasso2d", "select2d"], + "toImageButtonOptions": { + "format": "png", + "filename": "participant_kde", + "height": kde_height, + "width": 1200, + "scale": 2, + }, + } + + fig_spec = fig.to_plotly_json() + data_json = json.dumps(fig_spec.get("data", []), cls=PlotlyJSONEncoder) + layout_json = json.dumps(fig_spec.get("layout", {}), cls=PlotlyJSONEncoder) + config_json = json.dumps(plot_config, cls=PlotlyJSONEncoder) + + styled_html = f""" + +
Loading participant distribution...
+
+ +""" + + out_path = Path(out_html) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(styled_html, encoding="utf-8") + return out_path + + +def _read_dataset(path: Path) -> pd.DataFrame: + return pd.read_csv(path, index_col=False, header=0, skipinitialspace=True) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + description="Generate the modality ridgeline plot from a dataset summary CSV." + ) + parser.add_argument("source", type=Path, help="Path to dataset summary CSV") + parser.add_argument( + "--output", + type=Path, + default=Path("dataset_kde_modalities.html"), + help="Output HTML file", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed controlling jitter placement", + ) + args = parser.parse_args() + + df = _read_dataset(args.source) + output_path = generate_modality_ridgeline(df, args.output, rng_seed=args.seed) + if output_path is None: + print("Ridgeline plot could not be generated (insufficient data).") + else: + print(f"Ridgeline plot saved to {output_path.resolve()}") + + +if __name__ == "__main__": + main() diff --git a/docs/plot_dataset/utils.py b/docs/plot_dataset/utils.py new file mode 100644 index 00000000..fbce6502 --- /dev/null +++ b/docs/plot_dataset/utils.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pandas as pd + +try: # Allow import both as package and script + from .colours import CANONICAL_MAP, MODALITY_COLOR_MAP +except ImportError: # pragma: no cover - fallback for direct script execution + from colours import CANONICAL_MAP, MODALITY_COLOR_MAP # type: ignore + +__all__ = [ + "get_dataset_url", + "human_readable_size", + "primary_modality", + "safe_int", +] + +_SEPARATORS = ("/", "|", ";") + + +def primary_modality(value: Any) -> str: + """Return the canonical modality label for a record.""" + if value is None: + return "Unknown" + if isinstance(value, float) and pd.isna(value): + return "Unknown" + + text = str(value).strip() + if not text: + return "Unknown" + + # normalise separators, keep order of appearance + for sep in _SEPARATORS: + text = text.replace(sep, ",") + tokens = [tok.strip() for tok in text.split(",") if tok.strip()] + if not tokens: + return "Unknown" + + first = tokens[0] + canonical_map = CANONICAL_MAP.get("modality of exp", {}) + lowered = first.lower() + canonical = canonical_map.get(lowered) + if canonical: + return canonical + + if first in MODALITY_COLOR_MAP: + return first + + title_variant = first.title() + if title_variant in MODALITY_COLOR_MAP: + return title_variant + + return "Other" + + +def safe_int(value: Any, default: int | None = None) -> int | None: + """Convert *value* to ``int`` when possible; otherwise return *default*.""" + try: + if value is None or (isinstance(value, float) and pd.isna(value)): + return default + return int(round(float(value))) + except Exception: + return default + + +def human_readable_size(num_bytes: int | float | None) -> str: + """Format bytes using the closest unit among MB, GB, TB (fallback to KB/B).""" + if num_bytes is None: + return "0 B" + + try: + size = float(num_bytes) + except Exception: + return "0 B" + + units = [ + (1024**4, "TB"), + (1024**3, "GB"), + (1024**2, "MB"), + (1024**1, "KB"), + (1, "B"), + ] + + for factor, unit in units: + if size >= factor: + value = size / factor + if unit in {"B", "KB"}: + return f"{int(round(value))} {unit}" + return f"{value:.2f} {unit}" + return "0 B" + + +def get_dataset_url(name: str) -> str: + """Generate dataset URL for plots (relative to dataset summary page).""" + if name is None or (isinstance(name, float) and pd.isna(name)): + return "" + text = str(name).strip() + if not text: + return "" + return f"../../api/dataset/eegdash.dataset.{text.upper()}.html" + + +def ensure_directory(path: str | Path) -> Path: + """Create *path* directory if required and return ``Path`` instance.""" + dest = Path(path) + dest.mkdir(parents=True, exist_ok=True) + return dest diff --git a/docs/prepare_summary_tables.py b/docs/prepare_summary_tables.py index f0151bcd..51085274 100644 --- a/docs/prepare_summary_tables.py +++ b/docs/prepare_summary_tables.py @@ -1,5 +1,4 @@ import glob -import json from argparse import ArgumentParser from datetime import datetime from pathlib import Path @@ -7,511 +6,14 @@ import numpy as np import pandas as pd -import plotly.express as px -import plotly.graph_objects as go -from plotly.utils import PlotlyJSONEncoder -from scipy.stats import gaussian_kde +from plot_dataset import generate_dataset_bubble, generate_modality_ridgeline +from plot_dataset.utils import get_dataset_url, human_readable_size from table_tag_utils import wrap_tags DOCS_DIR = Path(__file__).resolve().parent STATIC_DATASET_DIR = DOCS_DIR / "source" / "_static" / "dataset_generated" -MODALITY_CANONICAL = { - "visual": "Visual", - "auditory": "Auditory", - "tactile": "Tactile", - "somatosensory": "Tactile", - "multisensory": "Multisensory", - "motor": "Motor", - "rest": "Resting State", - "resting state": "Resting State", - "resting-state": "Resting State", - "sleep": "Sleep", - "other": "Other", -} - -MODALITY_COLOR_MAP = { - "Visual": "#2563eb", - "Auditory": "#0ea5e9", - "Tactile": "#10b981", - "Multisensory": "#ec4899", - "Motor": "#f59e0b", - "Resting State": "#6366f1", - "Sleep": "#7c3aed", - "Other": "#14b8a6", - "Unknown": "#94a3b8", -} - - -def _hex_to_rgba(hex_color: str, alpha: float = 0.4) -> str: - hex_color = hex_color.lstrip("#") - if len(hex_color) != 6: - return f"rgba(99, 102, 241, {alpha})" - r = int(hex_color[0:2], 16) - g = int(hex_color[2:4], 16) - b = int(hex_color[4:6], 16) - return f"rgba({r}, {g}, {b}, {alpha})" - - -def _primary_modality(value: object) -> str: - if value is None: - return "Unknown" - if isinstance(value, float) and pd.isna(value): - return "Unknown" - text = str(value).strip() - if not text: - return "Unknown" - for sep in ("/", "|", ";"): - text = text.replace(sep, ",") - tokens = [tok.strip() for tok in text.split(",") if tok.strip()] - if not tokens: - return "Unknown" - raw = tokens[0].lower() - canonical = MODALITY_CANONICAL.get(raw) - if canonical: - return canonical - candidate = tokens[0].strip() - title_candidate = candidate.title() - if title_candidate in MODALITY_COLOR_MAP: - return title_candidate - return "Other" - - -def _to_numeric_median_list(val) -> float | None: - """Return a numeric value from possible list-like strings. - - Examples - -------- - - "64" -> 64 - - "6,129" -> median -> 67.5 -> 68 - - "128, 512" -> 320 - - 500.0 -> 500 - - """ - if pd.isna(val): - return None - try: - # already numeric - return float(val) - except Exception: - pass - s = str(val).strip().strip("[]") - if not s: - return None - try: - nums = [float(x) for x in s.split(",") if str(x).strip()] - if not nums: - return None - return float(np.median(nums)) - except Exception: - return None - - -def _safe_int(x, default=None): - try: - if x is None or pd.isna(x): - return default - return int(round(float(x))) - except Exception: - return default - - -def gen_datasets_bubble( - df: pd.DataFrame, - out_html: str = "_static/dataset/dataset_bubble.html", - x_var: str = "records", # one of: 'records', 'duration_h', 'size_gb', 'tasks' -): - """Generate an interactive bubble chart for datasets. - - - x: total duration (hours) - - y: number of subjects - - size: on-disk size (GB) - - color: dataset modality - """ - d = df.copy() - d = d[d["dataset"].str.lower() != "test"] - - # numeric columns - d["duration_h"] = pd.to_numeric(d.get("duration_hours_total"), errors="coerce") - d["subjects"] = pd.to_numeric(d.get("n_subjects"), errors="coerce") - d["records"] = pd.to_numeric(d.get("n_records"), errors="coerce") - d["tasks"] = pd.to_numeric(d.get("n_tasks"), errors="coerce") - d["size_bytes"] = pd.to_numeric(d.get("size_bytes"), errors="coerce") - - # parse sampling and channels into representative numeric values - d["sfreq"] = d["sampling_freqs"].map(_to_numeric_median_list) - d["nchans"] = d["nchans_set"].map(_to_numeric_median_list) - - d["modality_label"] = d.get("modality of exp").apply(_primary_modality) - - # disk size in GB for sizing - GB = 1024**3 - d["size_gb"] = d["size_bytes"] / GB - - # hover content - def _fmt_size(bytes_): - return human_readable_size(_safe_int(bytes_, 0)) - - # choose x axis field and labels - x_field = ( - x_var - if x_var in {"records", "duration_h", "size_gb", "tasks", "subjects"} - else "records" - ) - - axis_base_labels = { - "records": "#Records", - "duration_h": "Duration (hours)", - "size_gb": "Size (GB)", - "tasks": "#Tasks", - "subjects": "#Subjects", - } - - x_label = f"{axis_base_labels[x_field]} (log scale)" - y_field = "subjects" - if x_field == "subjects": - y_field = "records" - y_label = f"{axis_base_labels[y_field]} (log scale)" - - # hover text adapts to axis choices - if x_field == "duration_h": - x_hover = "Duration (x): %{x:.2f} h" - elif x_field == "size_gb": - x_hover = "Size (x): %{x:.2f} GB" - elif x_field == "tasks": - x_hover = "Tasks (x): %{x:,}" - elif x_field == "subjects": - x_hover = "Subjects (x): %{x:,}" - else: - x_hover = "Records (x): %{x:,}" - - if y_field == "subjects": - y_hover = "Subjects (y): %{y:,}" - else: - y_hover = "Records (y): %{y:,}" - - hover = ( - "%{customdata[0]}" # dataset id - f"
{x_hover}" - f"
{y_hover}" - "
Subjects (total): %{customdata[1]:,}" - "
Records (total): %{customdata[2]:,}" - "
Tasks: %{customdata[3]:,}" - "
Channels: %{customdata[4]}" - "
Sampling: %{customdata[5]} Hz" - "
Size: %{customdata[6]}" - "
Modality: %{customdata[7]}" - "
Click bubble to open dataset page" - "" - ) - - required_columns = {x_field, y_field, "size_gb"} - d = d.replace([np.inf, -np.inf], np.nan) - d = d.dropna(subset=list(required_columns)) - d = d[(d[x_field] > 0) & (d[y_field] > 0)] - - d["dataset_url"] = d["dataset"].apply(get_dataset_url) - - if d.empty: - out_path = Path(out_html) - out_path.parent.mkdir(parents=True, exist_ok=True) - no_data_html = """ -
No dataset records available for plotting.
-""" - with open(str(out_path), "w", encoding="utf-8") as f: - f.write(no_data_html) - return str(out_path) - - # Marker sizing: scale into a good visual range - size_max = d["size_gb"].max() - if not np.isfinite(size_max) or size_max <= 0: - size_max = 1.0 - sizeref = (2.0 * size_max) / (40.0**2) # target ~40px max marker - - # Prepare prettified strings for hover - def _fmt_int(v): - if v is None or pd.isna(v): - return "" - try: - return str(int(round(float(v)))) - except Exception: - return str(v) - - sfreq_str = d["sfreq"].map(_fmt_int) - nchans_str = d["nchans"].map(_fmt_int) - - fig = px.scatter( - d, - x=x_field, - y=y_field, - size="size_gb", - color="modality_label", - hover_name="dataset", - custom_data=[ - d["dataset"], - d["subjects"], - d["records"], - d["tasks"], - nchans_str, - sfreq_str, - d["size_bytes"].map(_fmt_size), - d["modality_label"], - d["dataset_url"], - ], - size_max=40, - labels={ - y_field: y_label, - "modality_label": "Modality", - x_field: x_label, - }, - color_discrete_map=MODALITY_COLOR_MAP, - title="", - category_orders={ - "modality_label": [ - label - for label in MODALITY_COLOR_MAP.keys() - if label in d["modality_label"].unique() - ] - }, - log_x=True, - log_y=True, - ) - - # Add a log-log regression fit line and R² annotation when data permits - fit_annotation_text = None - numeric_x = pd.to_numeric(d[x_field], errors="coerce") - numeric_y = pd.to_numeric(d[y_field], errors="coerce") - mask = ( - np.isfinite(numeric_x) - & np.isfinite(numeric_y) - & (numeric_x > 0) - & (numeric_y > 0) - ) - - if mask.sum() >= 2: - log_x = np.log10(numeric_x[mask]) - log_y = np.log10(numeric_y[mask]) - ss_tot = np.sum((log_y - log_y.mean()) ** 2) - if np.ptp(log_x) > 0 and np.ptp(log_y) > 0 and ss_tot > 0: - slope, intercept = np.polyfit(log_x, log_y, 1) - line_log_x = np.linspace(log_x.min(), log_x.max(), 200) - line_x = 10**line_log_x - line_y = 10 ** (slope * line_log_x + intercept) - fig.add_trace( - go.Scatter( - x=line_x, - y=line_y, - mode="lines", - name="log-log fit", - line=dict(color="#111827", width=2, dash="dot"), - hoverinfo="skip", - showlegend=False, - ) - ) - residuals = log_y - (slope * log_x + intercept) - r_squared = 1 - np.sum(residuals**2) / ss_tot - fit_annotation_text = f"log-log OLS fit R² = {r_squared:.3f}" - - # tune marker sizing explicitly for better control - for tr in fig.data: - mode = getattr(tr, "mode", "") or "" - if "markers" not in mode: - continue - tr.marker.update( - sizemin=6, - sizemode="area", - sizeref=sizeref, - line=dict(width=0.6, color="rgba(0,0,0,0.3)"), - opacity=0.75, - ) - tr.hovertemplate = hover - - plot_width = 1280 - plot_height = 720 - - fig.update_layout( - height=plot_height, - width=plot_width, # Landscape orientation - margin=dict(l=60, r=40, t=80, b=60), - template="plotly_white", - legend=dict( - title="Modality", - orientation="h", - yanchor="bottom", - y=1.02, - xanchor="right", - x=0.99, - ), - font=dict( - family="Inter, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif", - size=14, - ), - title=dict( - text="", - x=0.01, - xanchor="left", - y=0.98, - yanchor="top", - pad=dict(t=10, b=8), - ), - autosize=True, # Enable auto-sizing to fill container - ) - - if fit_annotation_text: - fig.add_annotation( - xref="paper", - yref="paper", - x=0.02, - y=0.98, - text=fit_annotation_text, - showarrow=False, - font=dict(size=15, color="#111827"), - bgcolor="rgba(255,255,255,0.75)", - bordercolor="rgba(17,24,39,0.25)", - borderwidth=1, - borderpad=6, - ) - - fig.update_xaxes( - showgrid=True, - gridcolor="rgba(0,0,0,0.12)", - zeroline=False, - type="log", - dtick=1, - ) - fig.update_yaxes( - showgrid=True, - gridcolor="rgba(0,0,0,0.12)", - zeroline=False, - type="log", - dtick=1, - ) - - out_path = Path(out_html) - out_path.parent.mkdir(parents=True, exist_ok=True) - # Add CSS and loading indicator for immediate proper sizing - html_content = fig.to_html( - full_html=False, - include_plotlyjs=False, - div_id="dataset-bubble", - config={ - "responsive": True, - "displaylogo": False, - "modeBarButtonsToRemove": ["lasso2d", "select2d"], - "toImageButtonOptions": { - "format": "png", - "filename": "dataset_landscape", - "height": plot_height, - "width": plot_width, - "scale": 2, - }, - }, - ) - - # Wrap with styling to ensure proper initial sizing - styled_html = f""" - -
Loading dataset landscape...
-{html_content} - -""" - - with open(str(out_path), "w", encoding="utf-8") as f: - f.write(styled_html) - return str(out_path) - - -def human_readable_size(num_bytes: int) -> str: - """Format bytes using the closest unit among MB, GB, TB (fallback to KB/B). - - Chooses the largest unit such that the value is >= 1. Uses base 1024. - """ - if num_bytes is None: - return "0 B" - size = float(num_bytes) - units = [ - (1024**4, "TB"), - (1024**3, "GB"), - (1024**2, "MB"), - (1024**1, "KB"), - (1, "B"), - ] - for factor, unit in units: - if size >= factor: - value = size / factor - # Use no decimals for B/KB; two decimals otherwise - if unit in ("B", "KB"): - return f"{int(round(value))} {unit}" - return f"{value:.2f} {unit}" - return "0 B" - - -def get_dataset_url(name: str) -> str: - """Generate dataset URL for plots (relative to dataset summary page).""" - if name is None or (isinstance(name, float) and pd.isna(name)): - return "" - text = str(name).strip() - if not text: - return "" - return f"../../api/dataset/eegdash.dataset.{text.upper()}.html" - - def wrap_dataset_name(name: str): # Remove any surrounding whitespace name = name.strip() @@ -658,10 +160,13 @@ def main(source_dir: str, target_dir: str): f, index_col=False, header=0, skipinitialspace=True ) # , sep=";") # Generate bubble chart from the raw data to have access to size_bytes - # Use x-axis as number of subjects so participant counts lead the story bubble_path = target_dir / "dataset_bubble.html" - gen_datasets_bubble(df_raw, str(bubble_path), x_var="subjects") - copyfile(bubble_path, STATIC_DATASET_DIR / bubble_path.name) + bubble_output = generate_dataset_bubble( + df_raw, + bubble_path, + x_var="subjects", + ) + copyfile(bubble_output, STATIC_DATASET_DIR / bubble_output.name) df = prepare_table(df_raw) # preserve int values @@ -716,282 +221,10 @@ def main(source_dir: str, target_dir: str): # Generate KDE ridgeline plot for modality participant distributions try: - d_modal = df_raw[df_raw["dataset"].str.lower() != "test"].copy() - d_modal["modality_label"] = d_modal["modality of exp"].apply( - _primary_modality - ) - d_modal["n_subjects"] = pd.to_numeric( - d_modal["n_subjects"], errors="coerce" - ) - d_modal = d_modal.dropna(subset=["n_subjects"]) - - # Filter out "Other" modality - d_modal = d_modal[d_modal["modality_label"] != "Other"] - - # Calculate median participants per modality and reorder ascending - median_participants = ( - d_modal.groupby("modality_label")["n_subjects"].median().sort_values() - ) - order = [ - label - for label in median_participants.index - if label in d_modal["modality_label"].unique() - ] - - fig_kde = go.Figure() - rng = np.random.default_rng(42) - amplitude = 0.6 - row_spacing = 0.95 - - for idx, label in enumerate(order): - subset = d_modal[d_modal["modality_label"] == label].copy() - vals = subset["n_subjects"].astype(float).dropna() - if len(vals) < 3: - continue - # Generate URLs for datasets in this modality - subset["dataset_url"] = subset["dataset"].apply(get_dataset_url) - log_vals = np.log10(vals) - grid = np.linspace(log_vals.min() - 0.25, log_vals.max() + 0.25, 240) - kde = gaussian_kde(log_vals) - density = kde(grid) - if density.max() <= 0: - continue - density_norm = density / density.max() - baseline = idx * row_spacing - y_curve = baseline + density_norm * amplitude - x_curve = 10**grid - - color = MODALITY_COLOR_MAP.get(label, "#6b7280") - fill = _hex_to_rgba(color, 0.28) - - fig_kde.add_trace( - go.Scatter( - x=np.concatenate([x_curve, x_curve[::-1]]), - y=np.concatenate([y_curve, np.full_like(y_curve, baseline)]), - name=label, - fill="toself", - fillcolor=fill, - line=dict(color="rgba(0,0,0,0)"), - hoverinfo="skip", - showlegend=False, - ) - ) - - fig_kde.add_trace( - go.Scatter( - x=x_curve, - y=y_curve, - mode="lines", - name=label, - line=dict(color=color, width=2), - hovertemplate=f"{label}
#Participants: %{{x:.0f}}", - showlegend=False, - ) - ) - - jitter = rng.uniform(0.02, amplitude * 0.5, size=len(vals)) - median_val = float(median_participants.get(label, np.nan)) - - # Prepare custom data with dataset names and URLs - custom_data = np.column_stack( - [subset["dataset"].to_numpy(), subset["dataset_url"].to_numpy()] - ) - fig_kde.add_trace( - go.Scatter( - x=vals, - y=np.full_like(vals, baseline) + jitter, - mode="markers", - name=label, - marker=dict(color=color, size=8, opacity=0.6), - customdata=custom_data, - hovertemplate="%{customdata[0]}
#Participants: %{x}
Click to view dataset details", - showlegend=False, - ) - ) - - if np.isfinite(median_val) and median_val > 0: - fig_kde.add_trace( - go.Scatter( - x=[median_val, median_val], - y=[baseline, baseline + amplitude], - mode="lines", - line=dict(color=color, width=2, dash="dash"), - hovertemplate=( - f"{label}
Median participants: {median_val:.0f}" - ), - showlegend=False, - ) - ) - - if fig_kde.data: - fig_kde.update_layout( - height=max(650, 140 * len(order)), - width=1200, # Set explicit width for consistent sizing - template="plotly_white", - xaxis=dict( - type="log", - title=dict( - text="Number of Participants (Log Scale)", - font=dict(size=18), - ), - showgrid=True, - gridcolor="rgba(0,0,0,0.08)", - zeroline=False, - dtick=1, - minor=dict(showgrid=True, gridcolor="rgba(0,0,0,0.04)"), - tickfont=dict(size=14), - ), - yaxis=dict( - title=dict(text="Modality", font=dict(size=18)), - tickmode="array", - tickvals=[idx * row_spacing for idx in range(len(order))], - ticktext=order, - showgrid=False, - range=[ - -0.25, - max( - 0.35, (len(order) - 1) * row_spacing + amplitude + 0.25 - ), - ], - tickfont=dict(size=14), - ), - showlegend=False, - margin=dict(l=120, r=40, t=108, b=80), - title=dict( - text=f"
Based on a EEG-Dash Datasets avaliables at {datetime.now().strftime('%d/%m/%Y')}.", - x=0.5, - xanchor="center", - y=0.98, - yanchor="top", - font=dict(size=20), - ), - autosize=True, # Enable auto-sizing to fill container - font=dict(size=16), - ) - - # Add annotation highlighting Visual distribution - fig_kde.add_annotation( - xref="paper", - yref="paper", - x=0.98, - y=0.02, - text="Visual studies consistently use the
largest sample sizes, typically 20-30 participants", - showarrow=False, - font=dict(size=14, color="#111827"), - bgcolor="rgba(255,255,255,0.9)", - bordercolor="rgba(17,24,39,0.3)", - borderwidth=1, - borderpad=8, - xanchor="right", - yanchor="bottom", - ) - # Add CSS and loading indicator for immediate proper sizing - kde_height = max(650, 150 * len(order)) - plot_config = { - "responsive": True, - "displaylogo": False, - "modeBarButtonsToRemove": ["lasso2d", "select2d"], - "toImageButtonOptions": { - "format": "png", - "filename": "participant_kde", - "height": kde_height, - "width": 1200, - "scale": 2, - }, - } - fig_spec = fig_kde.to_plotly_json() - data_json = json.dumps(fig_spec.get("data", []), cls=PlotlyJSONEncoder) - layout_json = json.dumps( - fig_spec.get("layout", {}), cls=PlotlyJSONEncoder - ) - config_json = json.dumps(plot_config, cls=PlotlyJSONEncoder) - - # Wrap with styling to ensure proper initial sizing and defer Plotly rendering - styled_html = f""" - -
Loading participant distribution...
-
- -""" - - kde_path = Path(target_dir) / "dataset_kde_modalities.html" - with open(kde_path, "w", encoding="utf-8") as f: - f.write(styled_html) - copyfile(kde_path, STATIC_DATASET_DIR / kde_path.name) + kde_path = target_dir / "dataset_kde_modalities.html" + kde_output = generate_modality_ridgeline(df_raw, kde_path) + if kde_output: + copyfile(kde_output, STATIC_DATASET_DIR / kde_output.name) except Exception as exc: print(f"[dataset KDE] Skipped due to error: {exc}") From 73a54b5e3b1faff72f8d4813e9bd0a2b0a820f3f Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 22:34:27 +0200 Subject: [PATCH 22/30] fixing small details --- docs/plot_dataset/__init__.py | 1 + docs/plot_dataset/plot_sankey.py | 111 +++++++++- docs/prepare_summary_tables.py | 14 +- docs/source/dataset_summary.rst | 4 + docs/source/dataset_summary/sankey.rst | 20 ++ docs/source/dataset_summary/table.rst | 293 +++++++++++++------------ 6 files changed, 287 insertions(+), 156 deletions(-) create mode 100644 docs/source/dataset_summary/sankey.rst diff --git a/docs/plot_dataset/__init__.py b/docs/plot_dataset/__init__.py index a258d27f..85942823 100644 --- a/docs/plot_dataset/__init__.py +++ b/docs/plot_dataset/__init__.py @@ -9,4 +9,5 @@ TYPE_COLOR_MAP, hex_to_rgba, ) +from .plot_sankey import generate_dataset_sankey # noqa: F401 from .ridgeline import generate_modality_ridgeline # noqa: F401 diff --git a/docs/plot_dataset/plot_sankey.py b/docs/plot_dataset/plot_sankey.py index f33b6534..a6c163eb 100644 --- a/docs/plot_dataset/plot_sankey.py +++ b/docs/plot_dataset/plot_sankey.py @@ -22,16 +22,10 @@ from colours import CANONICAL_MAP, COLUMN_COLOR_MAPS, hex_to_rgba DEFAULT_COLUMNS = ["Type Subject", "modality of exp", "type of exp"] +__all__ = ["generate_dataset_sankey", "build_sankey"] -def _load_dataframe(path: Path, columns: Sequence[str]) -> pd.DataFrame: - df = pd.read_csv( - path, - index_col=False, - header=0, - skipinitialspace=True, - ) - # Ensure n_subjects is read, as it's needed for weighting +def _prepare_dataframe(df: pd.DataFrame, columns: Sequence[str]) -> pd.DataFrame: all_columns = list(columns) if "n_subjects" not in all_columns: all_columns.append("n_subjects") @@ -55,7 +49,7 @@ def _load_dataframe(path: Path, columns: Sequence[str]) -> pd.DataFrame: cleaned[col] = cleaned[col].fillna("Unknown") # 2. Split multi-valued cells - cleaned[col] = cleaned[col].astype(str).str.split("/|;|,") + cleaned[col] = cleaned[col].astype(str).str.split(r"/|;|,", regex=True) cleaned = cleaned.explode(col) # 3. Clean up whitespace and any empty strings created by splitting @@ -77,6 +71,16 @@ def _load_dataframe(path: Path, columns: Sequence[str]) -> pd.DataFrame: return cleaned[all_columns] +def _load_dataframe(path: Path, columns: Sequence[str]) -> pd.DataFrame: + df = pd.read_csv( + path, + index_col=False, + header=0, + skipinitialspace=True, + ) + return _prepare_dataframe(df, columns) + + def _build_sankey_data(df: pd.DataFrame, columns: Sequence[str]): node_labels: list[str] = [] node_colors: list[str] = [] @@ -268,6 +272,95 @@ def build_sankey(df: pd.DataFrame, columns: Sequence[str]) -> go.Figure: return fig +def generate_dataset_sankey( + df: pd.DataFrame, + out_html: str | Path, + *, + columns: Sequence[str] | None = None, +) -> Path: + """Generate the dataset Sankey diagram and write it to *out_html*.""" + selected_columns = list(columns) if columns is not None else list(DEFAULT_COLUMNS) + prepared = _prepare_dataframe(df, selected_columns) + fig = build_sankey(prepared, selected_columns) + + out_path = Path(out_html) + out_path.parent.mkdir(parents=True, exist_ok=True) + + html_content = fig.to_html( + full_html=False, + include_plotlyjs=False, + div_id="dataset-sankey", + config={ + "responsive": True, + "displaylogo": False, + "modeBarButtonsToRemove": ["lasso2d", "select2d"], + }, + ) + + styled_html = f""" + +
Loading dataset flow...
+{html_content} + +""" + + out_path.write_text(styled_html, encoding="utf-8") + return out_path + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Generate a Sankey diagram from the dataset summary CSV." diff --git a/docs/prepare_summary_tables.py b/docs/prepare_summary_tables.py index 51085274..e1980c73 100644 --- a/docs/prepare_summary_tables.py +++ b/docs/prepare_summary_tables.py @@ -6,7 +6,11 @@ import numpy as np import pandas as pd -from plot_dataset import generate_dataset_bubble, generate_modality_ridgeline +from plot_dataset import ( + generate_dataset_bubble, + generate_dataset_sankey, + generate_modality_ridgeline, +) from plot_dataset.utils import get_dataset_url, human_readable_size from table_tag_utils import wrap_tags @@ -168,6 +172,14 @@ def main(source_dir: str, target_dir: str): ) copyfile(bubble_output, STATIC_DATASET_DIR / bubble_output.name) + # Generate Sankey diagram showing dataset flow across categories + try: + sankey_path = target_dir / "dataset_sankey.html" + sankey_output = generate_dataset_sankey(df_raw, sankey_path) + copyfile(sankey_output, STATIC_DATASET_DIR / sankey_output.name) + except Exception as exc: + print(f"[dataset Sankey] Skipped due to error: {exc}") + df = prepare_table(df_raw) # preserve int values df["n_subjects"] = df["n_subjects"].astype(int) diff --git a/docs/source/dataset_summary.rst b/docs/source/dataset_summary.rst index 642fbb2e..63abefeb 100644 --- a/docs/source/dataset_summary.rst +++ b/docs/source/dataset_summary.rst @@ -29,6 +29,10 @@ To leverage recent and ongoing advancements in large-scale computational methods .. include:: dataset_summary/kde.rst + .. tab-item:: Dataset Flow + + .. include:: dataset_summary/sankey.rst + .. tab-item:: Landscape .. include:: dataset_summary/bubble.rst diff --git a/docs/source/dataset_summary/sankey.rst b/docs/source/dataset_summary/sankey.rst new file mode 100644 index 00000000..3403ab62 --- /dev/null +++ b/docs/source/dataset_summary/sankey.rst @@ -0,0 +1,20 @@ +.. title:: Dataset flow + +.. rubric:: Dataset flow + +.. raw:: html + +
+ +.. raw:: html + :file: ../_static/dataset_generated/dataset_sankey.html + +.. raw:: html + +
+ Figure: Dataset flow across population, modality, and cognitive domain. + Link thickness is proportional to the total number of subjects, and the tooltip + reports both subject and dataset counts. Hover and click legend entries to + explore specific segments. +
+
diff --git a/docs/source/dataset_summary/table.rst b/docs/source/dataset_summary/table.rst index 3975b97f..542a87b5 100644 --- a/docs/source/dataset_summary/table.rst +++ b/docs/source/dataset_summary/table.rst @@ -25,151 +25,152 @@ In addition, EEG-DaSh will incorporate a subset of the data converted from `NEMA Pathology, modality, and dataset type now surface as consistent color-coded tags so you can scan the table at a glance. + .. raw:: html - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + From c7ea66924d3eb063e5060d92bd0bfd2c335b93f5 Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 22:40:53 +0200 Subject: [PATCH 23/30] improve this --- docs/plot_dataset/plot_sankey.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/docs/plot_dataset/plot_sankey.py b/docs/plot_dataset/plot_sankey.py index a6c163eb..f2506ec0 100644 --- a/docs/plot_dataset/plot_sankey.py +++ b/docs/plot_dataset/plot_sankey.py @@ -300,25 +300,37 @@ def generate_dataset_sankey( styled_html = f"""
Loading dataset flow...
{html_content} From 25d36fa98729cee31045d2ecf0134bb610769c9d Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 22:47:40 +0200 Subject: [PATCH 24/30] updating the title --- docs/source/dataset_summary/sankey.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/dataset_summary/sankey.rst b/docs/source/dataset_summary/sankey.rst index 3403ab62..66304778 100644 --- a/docs/source/dataset_summary/sankey.rst +++ b/docs/source/dataset_summary/sankey.rst @@ -1,6 +1,6 @@ .. title:: Dataset flow -.. rubric:: Dataset flow +.. rubric:: Sankey diagrams of EEGDash Datasets by Population, Modality, and Cognitive Domain .. raw:: html From 4c4fdcb12e2ae978201e637a53bd361d955965e2 Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 22:47:50 +0200 Subject: [PATCH 25/30] removing the css --- docs/plot_dataset/plot_sankey.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/plot_dataset/plot_sankey.py b/docs/plot_dataset/plot_sankey.py index f2506ec0..511b920b 100644 --- a/docs/plot_dataset/plot_sankey.py +++ b/docs/plot_dataset/plot_sankey.py @@ -225,7 +225,6 @@ def build_sankey(df: pd.DataFrame, columns: Sequence[str]) -> go.Figure: fig = go.Figure(sankey) fig.update_layout( - title_text="Sankey diagrams of EEGDash Datasets by Population, Modality, and Cognitive Domain", font=dict(size=14), margin=dict(b=100), # Add bottom margin to make space for the note annotations=[ From 7d6a6a88e1510dde8f047b8477954a92b19f7ed3 Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 23:05:28 +0200 Subject: [PATCH 26/30] fixing the plot sankey --- docs/plot_dataset/plot_sankey.py | 79 ++------------------------------ 1 file changed, 5 insertions(+), 74 deletions(-) diff --git a/docs/plot_dataset/plot_sankey.py b/docs/plot_dataset/plot_sankey.py index 511b920b..fb41a3a8 100644 --- a/docs/plot_dataset/plot_sankey.py +++ b/docs/plot_dataset/plot_sankey.py @@ -226,7 +226,10 @@ def build_sankey(df: pd.DataFrame, columns: Sequence[str]) -> go.Figure: fig.update_layout( font=dict(size=14), - margin=dict(b=100), # Add bottom margin to make space for the note + height=900, + width=None, + autosize=True, + margin=dict(t=40, b=40, l=40, r=40), annotations=[ dict( x=0, @@ -296,79 +299,7 @@ def generate_dataset_sankey( }, ) - styled_html = f""" - -
Loading dataset flow...
-{html_content} - -""" - - out_path.write_text(styled_html, encoding="utf-8") + out_path.write_text(html_content, encoding="utf-8") return out_path From ae7abada3f16ba650fbde5756394026fbd451454 Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 23:06:42 +0200 Subject: [PATCH 27/30] renaming to better categorization --- docs/source/dataset_summary.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/dataset_summary.rst b/docs/source/dataset_summary.rst index 63abefeb..c3ef7a33 100644 --- a/docs/source/dataset_summary.rst +++ b/docs/source/dataset_summary.rst @@ -25,7 +25,7 @@ To leverage recent and ongoing advancements in large-scale computational methods .. include:: dataset_summary/table.rst - .. tab-item:: Participant KDE + .. tab-item:: Participant Distribution .. include:: dataset_summary/kde.rst @@ -33,7 +33,7 @@ To leverage recent and ongoing advancements in large-scale computational methods .. include:: dataset_summary/sankey.rst - .. tab-item:: Landscape + .. tab-item:: Scatter of Sample Size vs. Recording Duration .. include:: dataset_summary/bubble.rst From 2102e2c0fb724c0ed158ce83854828812d273dad Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 23:09:52 +0200 Subject: [PATCH 28/30] small note --- docs/source/index.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 01b8e41d..632007c6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,11 +20,14 @@ EEG Dash Homepage .. rst-class:: h4 text-center font-weight-light my-4 - The EEG-DaSh data archive will establish a data-sharing resource for MEEG (EEG, MEG) data, enabling large-scale computational advancements to preserve and share scientific data from publicly funded research for machine learning and deep learning applications. +.. rst-class:: text-center + +**Note:** The "DaSh" in EEG-DaSh stands for **Data Share**. + The EEG-DaSh data archive is a collaborative effort led by the University of California, San Diego (UCSD) and Ben-Gurion University of the Negev (BGU) and partially funded by the National Science Foundation (NSF). All are welcome to contribute to the https://github.com/sccn/EEGDash project. The archive is currently still in :bdg-danger:`beta testing` mode, so be kind. From ad00d264ff8fa8d08b561dae6705dea19c1a5414 Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 23:18:57 +0200 Subject: [PATCH 29/30] updating the table --- docs/prepare_summary_tables.py | 161 +++++++++++++++++++++++++- docs/source/dataset_summary/table.rst | 149 ------------------------ 2 files changed, 160 insertions(+), 150 deletions(-) diff --git a/docs/prepare_summary_tables.py b/docs/prepare_summary_tables.py index e1980c73..033824f3 100644 --- a/docs/prepare_summary_tables.py +++ b/docs/prepare_summary_tables.py @@ -1,4 +1,5 @@ import glob +import textwrap from argparse import ArgumentParser from datetime import datetime from pathlib import Path @@ -53,6 +54,163 @@ def wrap_dataset_name(name: str): }, } +DATA_TABLE_TEMPLATE = textwrap.dedent( + r""" + + + + + + + + + + + + + + + + + + +""" +) + def _tag_normalizer(kind: str): canonical = {k.lower(): v for k, v in DATASET_CANONICAL_MAP.get(kind, {}).items()} @@ -226,8 +384,9 @@ def main(source_dir: str, target_dir: str): escape=False, table_id="datasets-table", ) + html_table = DATA_TABLE_TEMPLATE.replace("", html_table) table_path = target_dir / "dataset_summary_table.html" - with open(table_path, "+w", encoding="utf-8") as f: + with open(table_path, "w", encoding="utf-8") as f: f.write(html_table) copyfile(table_path, STATIC_DATASET_DIR / table_path.name) diff --git a/docs/source/dataset_summary/table.rst b/docs/source/dataset_summary/table.rst index 542a87b5..b409b575 100644 --- a/docs/source/dataset_summary/table.rst +++ b/docs/source/dataset_summary/table.rst @@ -25,152 +25,3 @@ In addition, EEG-DaSh will incorporate a subset of the data converted from `NEMA Pathology, modality, and dataset type now surface as consistent color-coded tags so you can scan the table at a glance. - -.. raw:: html - - - - - - - - - - - - - - - - - From 638bd3d8b37ebe3cc2966634aa5750411228fe02 Mon Sep 17 00:00:00 2001 From: bruAristimunha Date: Wed, 1 Oct 2025 23:25:43 +0200 Subject: [PATCH 30/30] removing relative path --- docs/plot_dataset/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/plot_dataset/utils.py b/docs/plot_dataset/utils.py index fbce6502..2a518d69 100644 --- a/docs/plot_dataset/utils.py +++ b/docs/plot_dataset/utils.py @@ -99,7 +99,7 @@ def get_dataset_url(name: str) -> str: text = str(name).strip() if not text: return "" - return f"../../api/dataset/eegdash.dataset.{text.upper()}.html" + return f"api/dataset/eegdash.dataset.{text.upper()}.html" def ensure_directory(path: str | Path) -> Path: