spiraldb · mprammer · May 17, 2026 · May 17, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,23 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.5] - 2026-05-17
+
+### Fixed
+
+- **TUI Columns modal crash on slugs with duplicate column names.**
+  Eleven slugs ship parquet schemas with legitimately repeated
+  top-level column names — the `osmi-mental-health-in-tech-*` survey
+  series (2016 through 2023) repeats "Why or why not?" follow-ups
+  under each yes/no item, and `uci-spambase`, `uci-parkinsons`, and
+  `uk-price-paid` each have one or more repeated headers. The new
+  Columns modal used the bare column name as the Textual DataTable
+  row key, so the second occurrence crashed with `DuplicateKey`.
+  Repeated names are now suffixed with ` (2)`, ` (3)`, etc. for
+  display and lookup, and the by-name stats dict no longer silently
+  collapses duplicates onto the last entry. The underlying parquet's
+  column names are unchanged.
+
 ## [0.1.4] - 2026-05-17
 
 ### Added

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "raincloud"
-version = "0.1.4"
+version = "0.1.5"
 description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files."
 readme = "README.md"
 requires-python = ">=3.11"

diff --git a/scripts/pipeline/browse.py b/scripts/pipeline/browse.py
@@ -1022,6 +1022,23 @@ class _DatasetModal(ModalScreen):
     ]
 
 
+def _dedupe_stat_names(stats: list[dict]) -> list[dict]:
+    """Suffix repeated `name` values with ` (2)`, ` (3)`, etc. so each entry is
+    uniquely keyable. A suffix that equals a name already present in `stats` is
+    skipped, so a real column literally named `x (2)` can never be duplicated."""
+    taken = {s.get("name") for s in stats}  # original names a suffix must not reuse
+    seen: dict[str, int] = {}
+    out: list[dict] = []
+    for s in stats:
+        name = s.get("name")
+        count = seen.get(name, 0) + 1
+        while count > 1 and f"{name} ({count})" in taken:
+            count += 1
+        seen[name] = count
+        out.append(s if count == 1 else {**s, "name": f"{name} ({count})"})
+    return out
+
+
 class ColumnsModal(_DatasetModal):
     """Full per-column metadata for one slug. When the local parquet isn't
     built, falls back to docs/v1/snapshot.json so the modal can still show
@@ -1036,13 +1053,16 @@ def __init__(self, slug: str, spec: dict,
         super().__init__()
         self.slug = slug
         self.spec = spec
-        self.stats = stats
+        # Some slugs (osmi-* surveys, uci-spambase, uk-price-paid) carry
+        # legitimately duplicated top-level column names. Suffix repeats so
+        # the DataTable row key is unique and stats_by_name doesn't collapse.
+        self.stats = _dedupe_stat_names(stats) if stats is not None else None
         self.source = source  # "parquet" | "snapshot" | None
         # profile.json keyed-by-column-name. Empty when the profile stage
         # hasn't been run; the right detail pane then renders "no profile".
         self.profile_columns = (profile or {}).get("columns") or {}
         # parquet schema stats keyed by name for O(1) lookup from the detail pane.
-        self.stats_by_name = {s["name"]: s for s in (stats or [])}
+        self.stats_by_name = {s["name"]: s for s in (self.stats or [])}
 
     def compose(self) -> ComposeResult:
         suffix = " [dim](from snapshot)[/dim]" if self.source == "snapshot" else ""

diff --git a/tests/test_browse.py b/tests/test_browse.py
@@ -335,6 +335,37 @@ def test_columns_modal_renders_built_state():
     assert m.profile_columns == {}
 
 
+def test_columns_modal_dedupes_duplicate_column_names():
+    """Repeated top-level column names (as shipped by the osmi-* survey
+    slugs and a few others) must come out of ColumnsModal suffixed, so
+    every DataTable row key is unique and each occurrence keeps its own
+    stats_by_name entry instead of being overwritten by a later one."""
+    pytest.importorskip("textual")
+    from scripts.pipeline.browse import ColumnsModal
+
+    # (column name, length) in schema order; every other field is constant.
+    columns = [
+        ("Q1", 100), ("Why or why not?", 80), ("Q2", 90),
+        ("Why or why not?", 75), ("Why or why not?", 60),
+    ]
+    stats = [
+        {"name": name, "type": "string", "length": length,
+         "null_count": 0, "min": None, "max": None}
+        for name, length in columns
+    ]
+    modal = ColumnsModal("x", {"slug": "x"}, stats)
+    displayed = [entry["name"] for entry in modal.stats]
+    assert displayed == [
+        "Q1", "Why or why not?", "Q2",
+        "Why or why not? (2)", "Why or why not? (3)",
+    ]
+    # Each occurrence keeps its own lookup entry rather than the last one winning.
+    by_name = modal.stats_by_name
+    assert by_name["Why or why not?"]["length"] == 80
+    assert by_name["Why or why not? (2)"]["length"] == 75
+    assert by_name["Why or why not? (3)"]["length"] == 60
+
+
 def test_render_column_detail_dtype_shapes():
     """`_render_column_detail` produces shape-appropriate multi-line markup."""
     pytest.importorskip("textual")