Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,23 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.5] - 2026-05-17

### Fixed

- **TUI Columns modal crash on slugs with duplicate column names.**
Eleven slugs ship parquet schemas with legitimately repeated
top-level column names — the `osmi-mental-health-in-tech-*` survey
series (2016 through 2023) repeats "Why or why not?" follow-ups
under each yes/no item, and `uci-spambase`, `uci-parkinsons`, and
`uk-price-paid` each have one or more repeated headers. The new
Columns modal used the bare column name as the Textual DataTable
row key, so the second occurrence crashed with `DuplicateKey`.
Repeated names are now suffixed with ` (2)`, ` (3)`, etc. for
display and lookup, and the by-name stats dict no longer silently
collapses duplicate-named entries onto the last one. The underlying
parquet's column names are unchanged.

## [0.1.4] - 2026-05-17

### Added
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "raincloud"
version = "0.1.4"
version = "0.1.5"
description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files."
readme = "README.md"
requires-python = ">=3.11"
Expand Down
24 changes: 22 additions & 2 deletions scripts/pipeline/browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,23 @@ class _DatasetModal(ModalScreen):
]


def _dedupe_stat_names(stats: list[dict]) -> list[dict]:
"""Suffix repeated `name` values with ` (2)`, ` (3)`, etc. so each entry is uniquely keyable."""
seen: dict[str, int] = {}
out: list[dict] = []
for s in stats:
name = s.get("name")
count = seen.get(name, 0) + 1
seen[name] = count
if count == 1:
out.append(s)
else:
new = dict(s)
new["name"] = f"{name} ({count})"
out.append(new)
return out


class ColumnsModal(_DatasetModal):
"""Full per-column metadata for one slug. When the local parquet isn't
built, falls back to docs/v1/snapshot.json so the modal can still show
Expand All @@ -1036,13 +1053,16 @@ def __init__(self, slug: str, spec: dict,
super().__init__()
self.slug = slug
self.spec = spec
self.stats = stats
# Some slugs (osmi-* surveys, uci-spambase, uk-price-paid) carry
# legitimately duplicated top-level column names. Suffix repeats so
# the DataTable row key is unique and stats_by_name doesn't collapse.
self.stats = _dedupe_stat_names(stats) if stats is not None else None
self.source = source # "parquet" | "snapshot" | None
# profile.json keyed-by-column-name. Empty when the profile stage
# hasn't been run; the right detail pane then renders "no profile".
self.profile_columns = (profile or {}).get("columns") or {}
# parquet schema stats keyed by name for O(1) lookup from the detail pane.
self.stats_by_name = {s["name"]: s for s in (stats or [])}
self.stats_by_name = {s["name"]: s for s in (self.stats or [])}

def compose(self) -> ComposeResult:
suffix = " [dim](from snapshot)[/dim]" if self.source == "snapshot" else ""
Expand Down
31 changes: 31 additions & 0 deletions tests/test_browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,37 @@ def test_columns_modal_renders_built_state():
assert m.profile_columns == {}


def test_columns_modal_dedupes_duplicate_column_names():
    """Repeated top-level column names must not break the Columns modal.

    Some parquet schemas (e.g. the osmi-* survey series) reuse the same
    column name several times. The modal is expected to suffix the repeats
    so that every row key is distinct and `stats_by_name` retains one entry
    per occurrence instead of collapsing them onto the last one.
    """
    pytest.importorskip("textual")
    from scripts.pipeline.browse import ColumnsModal

    repeated = "Why or why not?"
    # (column name, length) pairs in schema order; `repeated` appears 3 times.
    columns = [
        ("Q1", 100),
        (repeated, 80),
        ("Q2", 90),
        (repeated, 75),
        (repeated, 60),
    ]
    stats = [
        {"name": column, "type": "string", "length": length,
         "null_count": 0, "min": None, "max": None}
        for column, length in columns
    ]

    modal = ColumnsModal("x", {"slug": "x"}, stats)

    shown_names = [entry["name"] for entry in modal.stats]
    assert shown_names == [
        "Q1", "Why or why not?", "Q2",
        "Why or why not? (2)", "Why or why not? (3)",
    ]
    # Each occurrence stays individually addressable by its display name.
    expected_lengths = (
        ("Why or why not?", 80),
        ("Why or why not? (2)", 75),
        ("Why or why not? (3)", 60),
    )
    for key, length in expected_lengths:
        assert modal.stats_by_name[key]["length"] == length


def test_render_column_detail_dtype_shapes():
"""`_render_column_detail` produces shape-appropriate multi-line markup."""
pytest.importorskip("textual")
Expand Down
Loading