From 3c2ecef0399ef05748aed44c427c6e453a2ea452 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 3 Jan 2026 13:12:15 -0800 Subject: [PATCH] Fix aggregation parsing for complex expressions - Replace greedy regex with sqlglot in Metric class to properly handle expressions like SUM(x) / SUM(y) without mangling them - Fix COUNT DISTINCT detection using isinstance(parsed.this, exp.Distinct) - Add expression metric support in SQL generator for metrics with inline aggregations (agg=None, type=None, sql=) - Fix dependency analyzer to skip resolution for expression metrics with inline aggregations - Fix cumulative metrics to properly resolve references to other measures and generate valid aliases Add kitchen sink tests using patterns from rill-examples to catch edge cases. --- sidemantic/adapters/rill.py | 42 +--- sidemantic/core/dependency_analyzer.py | 14 ++ sidemantic/core/metric.py | 76 ++++-- sidemantic/sql/generator.py | 59 ++++- tests/adapters/rill/test_kitchen_sink.py | 289 +++++++++++++++++++++++ tests/fixtures/rill/kitchen_sink.yaml | 189 +++++++++++++++ 6 files changed, 601 insertions(+), 68 deletions(-) create mode 100644 tests/adapters/rill/test_kitchen_sink.py create mode 100644 tests/fixtures/rill/kitchen_sink.yaml diff --git a/sidemantic/adapters/rill.py b/sidemantic/adapters/rill.py index e5650eb5..abc7e087 100644 --- a/sidemantic/adapters/rill.py +++ b/sidemantic/adapters/rill.py @@ -7,9 +7,7 @@ from pathlib import Path from typing import Any -import sqlglot import yaml -from sqlglot import expressions as exp from sidemantic.core.dimension import Dimension from sidemantic.core.metric import Metric @@ -203,46 +201,14 @@ def _parse_measure(self, measure_def: dict[str, Any]) -> Metric | None: # "simple" = basic aggregation (None type), "derived" = calculation using other measures metric_type = "derived" - # Use sqlglot to detect simple aggregations - agg_type = None - agg_sql = None - try: - parsed = sqlglot.parse_one(expression, read="duckdb") - - # Check if this is a simple aggregation function - if isinstance(parsed, (exp.Sum, exp.Avg, exp.Count, exp.Min, exp.Max)): - # Map sqlglot aggregation types to Sidemantic agg types - if isinstance(parsed, exp.Sum): - agg_type = "sum" - elif isinstance(parsed, exp.Avg): - agg_type = "avg" - elif isinstance(parsed, exp.Count): - if parsed.args.get("distinct"): - agg_type = "count_distinct" - else: - agg_type = "count" - elif isinstance(parsed, exp.Min): - agg_type = "min" - elif isinstance(parsed, exp.Max): - agg_type = "max" - - # Extract the aggregated column/expression - agg_arg = parsed.this - if agg_arg: - agg_sql = agg_arg.sql(dialect="duckdb") - elif isinstance(parsed, exp.Count): - # COUNT(*) case - agg_sql = None - except Exception: - # If parsing fails, treat as custom SQL expression - pass - + # Let the Metric class handle aggregation parsing via its model_validator. + # This properly handles complex expressions like SUM(x) / SUM(y) and + # COUNT(DISTINCT col) using sqlglot. 
return Metric( name=name, label=label, description=description, - agg=agg_type, - sql=agg_sql if agg_type else expression, + sql=expression, # Pass full expression, Metric will parse aggregations type=metric_type, value_format_name=value_format_name, window_order=window_order, diff --git a/sidemantic/core/dependency_analyzer.py b/sidemantic/core/dependency_analyzer.py index 6725ef0e..bc14eef6 100644 --- a/sidemantic/core/dependency_analyzer.py +++ b/sidemantic/core/dependency_analyzer.py @@ -56,6 +56,20 @@ def extract_metric_dependencies(metric_obj, graph=None, model_context=None) -> s deps.add(metric_obj.sql) return deps + # Check if this is an expression metric with inline aggregations + # (e.g., SUM(x) / SUM(y), COUNT(DISTINCT col) * 1.0) + # These don't have measure dependencies - the aggregations are inline + try: + parsed = sqlglot.parse_one(metric_obj.sql) + # Check if the expression contains any aggregation functions + agg_types = (exp.Sum, exp.Avg, exp.Count, exp.Min, exp.Max, exp.Median) + has_inline_agg = any(parsed.find_all(*agg_types)) + if has_inline_agg and not metric_obj.type: + # Expression metric with inline aggregations - no measure dependencies + return deps + except Exception: + pass + # Extract column references from expression refs = extract_column_references(metric_obj.sql) diff --git a/sidemantic/core/metric.py b/sidemantic/core/metric.py index 96f20ac5..cff73f5d 100644 --- a/sidemantic/core/metric.py +++ b/sidemantic/core/metric.py @@ -45,9 +45,11 @@ def handle_expr_and_parse_agg(cls, data): 1. Converts expr= to sql= for backwards compatibility 2. Parses aggregation functions from SQL (e.g., SUM(amount) -> agg=sum, sql=amount) - """ - import re + Uses sqlglot to properly parse expressions and handle nested parentheses. + Only extracts aggregation from SIMPLE expressions (single aggregation function). + Complex expressions like SUM(x) / SUM(y) are preserved as-is. + """ if isinstance(data, dict): # Step 1: Handle expr alias expr_val = data.get("expr") @@ -72,23 +74,61 @@ def handle_expr_and_parse_agg(cls, data): # Parse if sql is provided and agg is not set # Allow parsing for simple metrics (no type) OR cumulative metrics (to support AVG/COUNT windows) if sql_val and not agg_val and (not type_val or type_val == "cumulative"): - # Match aggregation functions at the start: SUM(expr), COUNT(expr), etc. 
- agg_pattern = r"^\s*(SUM|COUNT|AVG|MIN|MAX|MEDIAN|COUNT_DISTINCT)\s*\((.*)\)\s*$" - match = re.match(agg_pattern, sql_val, re.IGNORECASE) - - if match: - agg_func = match.group(1).lower() - inner_expr = match.group(2).strip() - - # Extract DISTINCT for COUNT(DISTINCT col) - if agg_func == "count": - distinct_match = re.match(r"^\s*DISTINCT\s+(.+)$", inner_expr, re.IGNORECASE) - if distinct_match: + try: + import sqlglot + from sqlglot import expressions as exp + + parsed = sqlglot.parse_one(sql_val, read="duckdb") + + # Only extract if the TOP-LEVEL expression is a simple aggregation + # This prevents breaking expressions like SUM(x) / SUM(y) + agg_map = { + exp.Sum: "sum", + exp.Avg: "avg", + exp.Min: "min", + exp.Max: "max", + exp.Median: "median", + } + + agg_func = None + inner_expr = None + + # Check for standard aggregations + for agg_class, agg_name in agg_map.items(): + if isinstance(parsed, agg_class): + agg_func = agg_name + if parsed.this: + inner_expr = parsed.this.sql(dialect="duckdb") + break + + # Handle COUNT specially (need to detect DISTINCT) + if isinstance(parsed, exp.Count): + # Check if the argument is a Distinct expression + if isinstance(parsed.this, exp.Distinct): agg_func = "count_distinct" - inner_expr = distinct_match.group(1).strip() - - data["agg"] = agg_func - data["sql"] = inner_expr + # Extract all expressions from inside Distinct + # e.g., COUNT(DISTINCT a, b) -> "a, b" + if parsed.this.expressions: + inner_expr = ", ".join(e.sql(dialect="duckdb") for e in parsed.this.expressions) + else: + inner_expr = parsed.this.sql(dialect="duckdb") + else: + agg_func = "count" + if parsed.this: + inner_expr = parsed.this.sql(dialect="duckdb") + # COUNT(*) case - inner_expr stays None + + if agg_func: + data["agg"] = agg_func + if inner_expr is not None: + data["sql"] = inner_expr + elif agg_func == "count": + # COUNT(*) - leave sql as None or "*" + data["sql"] = None + + except Exception: + # If sqlglot parsing fails, leave the expression as-is + pass return data diff --git a/sidemantic/sql/generator.py b/sidemantic/sql/generator.py index 2f8ff2c3..d3284ae0 100644 --- a/sidemantic/sql/generator.py +++ b/sidemantic/sql/generator.py @@ -1398,13 +1398,15 @@ def _build_main_select( # Complex metric types (derived, ratio) can be built inline # Note: cumulative, time_comparison, conversion are handled via special query generators # and won't appear in this code path - if measure.type in ["derived", "ratio"]: + # Also handle "expression metrics" - metrics with inline aggregations like SUM(x)/SUM(y) + is_expression_metric = not measure.type and not measure.agg and measure.sql + if measure.type in ["derived", "ratio"] or is_expression_metric: # Use complex metric builder metric_expr = self._build_metric_sql(measure, model_name) metric_expr = self._wrap_with_fill_nulls(metric_expr, measure) select_exprs.append(f"{metric_expr} AS {alias}") elif not measure.agg: - # Complex types that need special handling (shouldn't reach here normally) + # Unknown metric type that needs special handling raise ValueError( f"Metric '{measure.name}' with type '{measure.type}' cannot be queried directly. " f"Use generate() instead of _build_main_select() for this metric type." 
@@ -1736,7 +1738,12 @@ def _build_metric_sql(self, metric, model_context: str | None = None) -> str: # Check if this is a SQL expression metric (has inline aggregations) # These metrics already contain complete SQL and shouldn't have dependencies replaced - has_inline_agg = any(agg in formula.upper() for agg in ["COUNT(", "SUM(", "AVG(", "MIN(", "MAX("]) + try: + parsed = sqlglot.parse_one(formula, read=self.dialect) + agg_types = (exp.Sum, exp.Avg, exp.Count, exp.Min, exp.Max, exp.Median) + has_inline_agg = any(parsed.find_all(*agg_types)) + except Exception: + has_inline_agg = False if has_inline_agg: # This is a SQL expression metric with inline aggregations. @@ -2009,7 +2016,12 @@ def _generate_with_window_functions( cumulative_metrics.append(m) # Add the base measure/metric to base_metrics if metric.sql: - base_metrics.append(metric.sql) + base_ref = metric.sql + # Qualify unqualified references with the model name + if "." not in base_ref and "." in m: + model_name = m.split(".")[0] + base_ref = f"{model_name}.{base_ref}" + base_metrics.append(base_ref) elif metric and metric.type == "time_comparison": # Validate required fields if not metric.base_metric: @@ -2076,7 +2088,16 @@ def _generate_with_window_functions( # Add cumulative metrics with window functions for m in cumulative_metrics: - metric = self.graph.get_metric(m) + # Handle both qualified (model.measure) and unqualified references + if "." in m: + model_name, measure_name = m.split(".", 1) + model = self.graph.get_model(model_name) + metric = model.get_metric(measure_name) if model else None + # Use just the measure name as the alias (not model.measure) + metric_alias = measure_name + else: + metric = self.graph.get_metric(m) + metric_alias = m if not metric or (not metric.sql and not metric.window_expression): continue @@ -2107,7 +2128,7 @@ def _generate_with_window_functions( if metric.window_expression: order_col = time_dim frame = metric.window_frame or "ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW" - window_expr = f"{metric.window_expression} OVER (ORDER BY {order_col} {frame}) AS {m}" + window_expr = f"{metric.window_expression} OVER (ORDER BY {order_col} {frame}) AS {metric_alias}" select_exprs.append(window_expr) continue @@ -2118,8 +2139,22 @@ def _generate_with_window_functions( # It's a direct measure reference - extract just the measure name base_alias = base_ref.split(".")[1] else: - # It's a metric reference - check if it exists and get its underlying measure - base_metric = self.graph.get_metric(base_ref) + # It's an unqualified reference - check model first, then graph-level + base_metric = None + # Get model name from the cumulative metric reference + cum_model_name = m.split(".")[0] if "." in m else None + if cum_model_name: + cum_model = self.graph.get_model(cum_model_name) + if cum_model: + base_metric = cum_model.get_metric(base_ref) + + # Fallback to graph-level metric + if not base_metric: + try: + base_metric = self.graph.get_metric(base_ref) + except KeyError: + pass + if base_metric and base_metric.sql: # Use the underlying measure name if "." 
in base_metric.sql: @@ -2145,20 +2180,20 @@ def _generate_with_window_functions( grain = metric.grain_to_date partition = self._date_trunc(grain, time_dim) - window_expr = f"{agg_func}({base_col}) OVER (PARTITION BY {partition} ORDER BY {time_dim} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS {m}" + window_expr = f"{agg_func}({base_col}) OVER (PARTITION BY {partition} ORDER BY {time_dim} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS {metric_alias}" elif metric.window: # Parse window (e.g., "7 days") window_parts = metric.window.split() if len(window_parts) == 2: num, unit = window_parts # For date-based windows, use RANGE - window_expr = f"{agg_func}({base_col}) OVER (ORDER BY {time_dim} RANGE BETWEEN INTERVAL '{num} {unit}' PRECEDING AND CURRENT ROW) AS {m}" + window_expr = f"{agg_func}({base_col}) OVER (ORDER BY {time_dim} RANGE BETWEEN INTERVAL '{num} {unit}' PRECEDING AND CURRENT ROW) AS {metric_alias}" else: # Fallback to rows - window_expr = f"{agg_func}({base_col}) OVER (ORDER BY {time_dim} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS {m}" + window_expr = f"{agg_func}({base_col}) OVER (ORDER BY {time_dim} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS {metric_alias}" else: # Running total (unbounded window) - window_expr = f"{agg_func}({base_col}) OVER (ORDER BY {time_dim} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS {m}" + window_expr = f"{agg_func}({base_col}) OVER (ORDER BY {time_dim} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS {metric_alias}" select_exprs.append(window_expr) diff --git a/tests/adapters/rill/test_kitchen_sink.py b/tests/adapters/rill/test_kitchen_sink.py new file mode 100644 index 00000000..37c38fd9 --- /dev/null +++ b/tests/adapters/rill/test_kitchen_sink.py @@ -0,0 +1,289 @@ +"""Kitchen sink tests for Rill adapter using real-world patterns. + +These tests use expressions from rill-examples to find holes in the implementation. +The goal is to find bugs, not conform tests to bugs. 
+""" + +import tempfile +from pathlib import Path + +import pytest + +from sidemantic import SemanticLayer +from sidemantic.adapters.rill import RillAdapter + + +@pytest.fixture +def kitchen_sink_layer(): + """Load kitchen sink fixture.""" + adapter = RillAdapter() + graph = adapter.parse("tests/fixtures/rill/kitchen_sink.yaml") + layer = SemanticLayer() + layer.graph = graph + return layer + + +class TestKitchenSinkParsing: + """Test that the kitchen sink fixture parses correctly.""" + + def test_model_loads(self, kitchen_sink_layer): + """Verify the model loads.""" + assert "kitchen_sink" in kitchen_sink_layer.graph.models + + def test_dimension_count(self, kitchen_sink_layer): + """Verify all dimensions loaded.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + # 5 explicit dimensions + 1 auto-created timeseries + assert len(model.dimensions) >= 5 + + def test_metric_count(self, kitchen_sink_layer): + """Verify all metrics loaded.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + assert len(model.metrics) >= 20 + + +class TestSimpleAggregations: + """Test simple aggregation expressions.""" + + def test_count_star(self, kitchen_sink_layer): + """COUNT(*) should work.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.total_records"]) + assert "COUNT(" in sql.upper() + + def test_sum(self, kitchen_sink_layer): + """SUM(column) should work.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.total_revenue"]) + assert "SUM(" in sql.upper() + + def test_avg(self, kitchen_sink_layer): + """AVG(column) should work.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.avg_amount"]) + assert "AVG(" in sql.upper() + + def test_min(self, kitchen_sink_layer): + """MIN(column) should work.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.min_amount"]) + assert "MIN(" in sql.upper() + + def test_max(self, kitchen_sink_layer): + """MAX(column) should work.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.max_amount"]) + assert "MAX(" in sql.upper() + + def test_median(self, kitchen_sink_layer): + """MEDIAN(column) should work.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.median_duration"]) + assert "MEDIAN(" in sql.upper() + + +class TestCountDistinct: + """Test COUNT DISTINCT expressions.""" + + def test_count_distinct_simple(self, kitchen_sink_layer): + """COUNT(DISTINCT col) should produce valid SQL.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.unique_users"]) + # Should have COUNT and DISTINCT + assert "COUNT(" in sql.upper() + assert "DISTINCT" in sql.upper() + # Should NOT have broken SQL like "DISTINCT user_id AS" + assert "DISTINCT user_id AS" not in sql + + def test_count_distinct_with_function(self, kitchen_sink_layer): + """COUNT(DISTINCT CONCAT(...)) from rill-311-ops.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.unique_locations"]) + assert "COUNT(" in sql.upper() + assert "DISTINCT" in sql.upper() + assert "CONCAT" in sql.upper() + + +class TestArithmeticExpressions: + """Test expressions with arithmetic operators. + + These are the patterns that break with the greedy regex bug. + """ + + def test_division_of_sums(self, kitchen_sink_layer): + """SUM(x) / SUM(y) from rill-github-analytics. + + BUG: Greedy regex in Metric class matches first SUM( to last ), + extracting 'deletions) / SUM(changes' as the inner expression. 
+ """ + model = kitchen_sink_layer.graph.models["kitchen_sink"] + metric = model.get_metric("deletion_pct") + + # The expression should be preserved as-is (no agg extraction) + # OR properly parsed as a ratio + assert metric is not None + + # If parsed as expression without agg: + if metric.agg is None: + assert metric.sql == "SUM(deletions) / SUM(changes)" + else: + # Should NOT have broken SQL + assert ") / SUM(" not in (metric.sql or "") + + def test_subtraction_of_sums(self, kitchen_sink_layer): + """SUM(x) - SUM(y) from rill-cost-monitoring.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + metric = model.get_metric("net_revenue") + + assert metric is not None + if metric.agg is None: + assert metric.sql == "SUM(revenue) - SUM(cost)" + else: + assert ") - SUM(" not in (metric.sql or "") + + def test_division_of_counts(self, kitchen_sink_layer): + """COUNT(*) / COUNT(DISTINCT x) from rill-github-analytics.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + metric = model.get_metric("files_per_commit") + + assert metric is not None + if metric.agg is None: + assert "COUNT(*)" in metric.sql + assert "COUNT(DISTINCT" in metric.sql + else: + assert ") / COUNT(" not in (metric.sql or "") + + def test_sum_with_division(self, kitchen_sink_layer): + """SUM(x)/1000 from rill-openrtb-prog-ads.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + metric = model.get_metric("ad_spend") + + assert metric is not None + # Should either preserve full expression or work correctly + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.ad_spend"]) + # Should not error and should contain the division + assert "/1000" in sql or "/ 1000" in sql + + def test_ratio_with_multiplier(self, kitchen_sink_layer): + """SUM(x)*1.0/SUM(y)*1.0 from rill-app-engagement.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + metric = model.get_metric("conversion_rate") + + assert metric is not None + # Full expression should be preserved + if metric.agg is None: + assert "*1.0" in metric.sql + + def test_margin_calculation(self, kitchen_sink_layer): + """(SUM(x) - SUM(y))/SUM(x) from rill-cost-monitoring.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + metric = model.get_metric("margin") + + assert metric is not None + # Complex expression - should be preserved + if metric.agg is None: + assert "SUM(revenue)" in metric.sql + assert "SUM(cost)" in metric.sql + + +class TestConditionalAggregations: + """Test aggregations with CASE expressions.""" + + def test_count_with_case(self, kitchen_sink_layer): + """COUNT(CASE WHEN ... END) should work.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.completed_orders"]) + assert "COUNT(" in sql.upper() + assert "CASE" in sql.upper() + + def test_sum_with_case(self, kitchen_sink_layer): + """SUM(CASE WHEN ... 
END) should work.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.completed_revenue"]) + assert "SUM(" in sql.upper() + assert "CASE" in sql.upper() + + +class TestDerivedMetrics: + """Test derived/calculated metrics.""" + + def test_derived_metric_references(self, kitchen_sink_layer): + """Derived metric referencing other metrics.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + metric = model.get_metric("revenue_per_user") + + assert metric is not None + assert metric.type == "derived" + + def test_derived_metric_sql(self, kitchen_sink_layer): + """Derived metric should compile.""" + sql = kitchen_sink_layer.compile(metrics=["kitchen_sink.revenue_per_user"]) + # Should reference the base metrics + assert sql is not None + + +class TestWindowFunctions: + """Test window function metrics.""" + + def test_window_metric_parsing(self, kitchen_sink_layer): + """Window function metrics should parse.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + metric = model.get_metric("rolling_7day_revenue") + + assert metric is not None + assert metric.type == "cumulative" + + +class TestSQLGeneration: + """Test that generated SQL is syntactically valid.""" + + def test_all_metrics_compile(self, kitchen_sink_layer): + """All metrics should compile without errors.""" + model = kitchen_sink_layer.graph.models["kitchen_sink"] + + failed = [] + for metric in model.metrics: + try: + sql = kitchen_sink_layer.compile(metrics=[f"kitchen_sink.{metric.name}"]) + # Basic syntax check - should have SELECT + assert "SELECT" in sql.upper() + except Exception as e: + failed.append((metric.name, str(e))) + + if failed: + msg = "\n".join(f" {name}: {err}" for name, err in failed) + pytest.fail(f"Failed to compile metrics:\n{msg}") + + def test_sql_is_parseable(self, kitchen_sink_layer): + """Generated SQL should be parseable by sqlglot.""" + import sqlglot + + model = kitchen_sink_layer.graph.models["kitchen_sink"] + + failed = [] + for metric in model.metrics: + try: + sql = kitchen_sink_layer.compile(metrics=[f"kitchen_sink.{metric.name}"]) + # Strip the comment line + sql_no_comment = "\n".join(line for line in sql.split("\n") if not line.strip().startswith("--")) + sqlglot.parse_one(sql_no_comment, read="duckdb") + except Exception as e: + failed.append((metric.name, str(e))) + + if failed: + msg = "\n".join(f" {name}: {err}" for name, err in failed) + pytest.fail(f"Generated invalid SQL:\n{msg}") + + +class TestRoundtrip: + """Test export and re-import.""" + + def test_roundtrip_preserves_expressions(self, kitchen_sink_layer): + """Complex expressions should survive roundtrip.""" + adapter = RillAdapter() + + with tempfile.TemporaryDirectory() as tmpdir: + adapter.export(kitchen_sink_layer.graph, tmpdir) + graph2 = adapter.parse(Path(tmpdir) / "kitchen_sink.yaml") + + model1 = kitchen_sink_layer.graph.models["kitchen_sink"] + model2 = graph2.models["kitchen_sink"] + + # Check a complex metric survived + m1 = model1.get_metric("margin") + m2 = model2.get_metric("margin") + + assert m1 is not None + assert m2 is not None + # Both should have the full expression (not broken) + # Either both have agg=None with full SQL, or both properly decomposed diff --git a/tests/fixtures/rill/kitchen_sink.yaml b/tests/fixtures/rill/kitchen_sink.yaml new file mode 100644 index 00000000..c7434638 --- /dev/null +++ b/tests/fixtures/rill/kitchen_sink.yaml @@ -0,0 +1,189 @@ +# Kitchen sink Rill metrics view based on real rill-examples patterns +# Tests edge cases and complex expressions from: +# - 
rill-311-ops (MEDIAN, COUNT DISTINCT CONCAT) +# - rill-app-engagement (ratios with *1.0, count with space) +# - rill-cost-monitoring (arithmetic expressions, margin calculations) +# - rill-github-analytics (division ratios, count(*)) +# - rill-openrtb-prog-ads (complex nested arithmetic) + +type: metrics_view +name: kitchen_sink +display_name: Kitchen Sink Analytics +description: Comprehensive test of Rill adapter edge cases +model: kitchen_sink_model +timeseries: event_time +smallest_time_grain: hour +default_time_range: P1W + +dimensions: + # Simple column reference + - name: status + display_name: Status + description: Current status + column: status + + # Expression-based dimension (CASE) + - name: status_type + display_name: Status Type + expression: | + CASE + WHEN LOWER(status) IN ('open', 'new', 'in progress') THEN 'Active' + WHEN status IS NULL THEN 'Unknown' + ELSE 'Closed' + END + + # Concatenation in dimension + - name: full_location + display_name: Full Location + expression: CONCAT(city, ', ', state) + + # Boolean dimension + - name: is_merge_commit + display_name: Merge Commit + column: is_merge_commit + description: True if the commit is a merge commit + + # Nested CASE expression + - name: size_bucket + display_name: Size Bucket + expression: | + CASE + WHEN amount < 100 THEN 'small' + WHEN amount < 1000 THEN 'medium' + WHEN amount < 10000 THEN 'large' + ELSE 'enterprise' + END + +measures: + # Simple COUNT(*) + - name: total_records + display_name: Total Records + expression: COUNT(*) + format_preset: humanize + + # COUNT DISTINCT + - name: unique_users + display_name: Unique Users + expression: COUNT(DISTINCT user_id) + format_preset: humanize + + # COUNT DISTINCT with CONCAT (from rill-311-ops) + - name: unique_locations + display_name: Unique Locations + expression: COUNT(DISTINCT CONCAT(latitude, longitude)) + format_preset: humanize + + # MEDIAN aggregation (from rill-311-ops) + - name: median_duration + display_name: Median Duration (Hours) + expression: MEDIAN(duration_hours) + format_preset: humanize + + # Simple SUM + - name: total_revenue + display_name: Total Revenue + expression: SUM(revenue) + format_preset: currency_usd + + # SUM with division constant (from rill-openrtb-prog-ads) + - name: ad_spend + display_name: Advertising Spend + expression: SUM(media_spend_usd)/1000 + format_preset: currency_usd + + # Ratio with *1.0 pattern (from rill-app-engagement) + - name: conversion_rate + display_name: Conversion Rate + expression: SUM(clicks)*1.0/SUM(views)*1.0 + format_preset: percentage + + # Simple ratio (from rill-github-analytics) + - name: deletion_pct + display_name: Deletion Percentage + expression: SUM(deletions) / SUM(changes) + format_preset: percentage + + # Complex arithmetic (from rill-cost-monitoring) + - name: margin + display_name: Margin + expression: (SUM(revenue) - SUM(cost))/SUM(revenue) + format_preset: percentage + + # Net calculation + - name: net_revenue + display_name: Net Revenue + expression: SUM(revenue) - SUM(cost) + format_preset: currency_usd + + # Division in AVG context (from rill-openrtb-prog-ads) + - name: avg_bid_price + display_name: Average Bid Price + expression: SUM(bid_price_usd)*1.0/SUM(bid_cnt)/1000 + format_preset: currency_usd + + # Files per commit ratio (from rill-github-analytics) + - name: files_per_commit + display_name: Files per Commit + expression: COUNT(*) / COUNT(DISTINCT commit_hash) + format_preset: humanize + + # Count with space before parenthesis (from rill-app-engagement) + - name: unique_visitors + 
display_name: Unique Visitors + expression: count (distinct visitor_id) + format_preset: humanize + + # Completion rate with count distinct ratio + - name: completion_rate + display_name: Completion Rate + expression: count (distinct completed_id)*1.0/count (distinct started_id) *1.0 + format_preset: percentage + + # Derived measure referencing other measures + - name: revenue_per_user + display_name: Revenue per User + type: derived + expression: total_revenue / unique_users + format_preset: currency_usd + requires: + - total_revenue + - unique_users + + # Window function / rolling average + - name: rolling_7day_revenue + display_name: 7-Day Rolling Revenue + expression: AVG(total_revenue) + requires: + - total_revenue + window: + order: event_time + frame: RANGE BETWEEN INTERVAL 6 DAY PRECEDING AND CURRENT ROW + + # MIN/MAX aggregations + - name: min_amount + display_name: Minimum Amount + expression: MIN(amount) + format_preset: currency_usd + + - name: max_amount + display_name: Maximum Amount + expression: MAX(amount) + format_preset: currency_usd + + # AVG aggregation + - name: avg_amount + display_name: Average Amount + expression: AVG(amount) + format_preset: currency_usd + + # Conditional count (CASE in COUNT) + - name: completed_orders + display_name: Completed Orders + expression: COUNT(CASE WHEN status = 'completed' THEN 1 END) + format_preset: humanize + + # SUM with CASE + - name: completed_revenue + display_name: Completed Revenue + expression: SUM(CASE WHEN status = 'completed' THEN revenue ELSE 0 END) + format_preset: currency_usd
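
For reference, a minimal standalone sketch of the detection logic this patch adopts (illustrative only, not part of the diff; it assumes nothing beyond sqlglot being installed). The old greedy regex matched the first "SUM(" to the last ")", turning SUM(deletions) / SUM(changes) into agg=sum, sql="deletions) / SUM(changes"; parsing with sqlglot and inspecting only the top-level node avoids this, because the top-level node for a compound expression is Div/Sub/etc. rather than an aggregation:

    # Illustrative sketch, not part of the patch: mirrors the detection
    # added to Metric.handle_expr_and_parse_agg above.
    import sqlglot
    from sqlglot import expressions as exp

    samples = [
        "SUM(amount)",                    # simple -> agg="sum", sql="amount"
        "COUNT(DISTINCT user_id)",        # -> agg="count_distinct", sql="user_id"
        "SUM(deletions) / SUM(changes)",  # compound -> kept verbatim
    ]
    for raw in samples:
        parsed = sqlglot.parse_one(raw, read="duckdb")
        if isinstance(parsed, exp.Sum):
            # Top-level node is a simple aggregation: safe to decompose.
            print(raw, "=> sum of", parsed.this.sql(dialect="duckdb"))
        elif isinstance(parsed, exp.Count) and isinstance(parsed.this, exp.Distinct):
            # COUNT(DISTINCT a, b): the arguments live on the Distinct node.
            inner = ", ".join(e.sql(dialect="duckdb") for e in parsed.this.expressions)
            print(raw, "=> count_distinct of", inner)
        else:
            # Top-level node is Div here, so the expression is preserved
            # as-is and handled as an expression metric with inline aggregations.
            print(raw, "=> kept verbatim:", parsed.sql(dialect="duckdb"))

Running the sketch decomposes the first two samples and leaves the third untouched, which is exactly the behavior the kitchen sink tests above assert for deletion_pct, net_revenue, and files_per_commit.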