From dfdd9cf98f0b9faa4807fd025bd1438fd80315c8 Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Mon, 25 Aug 2025 12:06:37 +0200 Subject: [PATCH 01/12] Add pretty printer for plan and expr --- examples/builder_example.py | 2 + src/substrait/builders/display.py | 569 ++++++++++++++++++++++++++++++ 2 files changed, 571 insertions(+) create mode 100644 src/substrait/builders/display.py diff --git a/examples/builder_example.py b/examples/builder_example.py index 0a9dd0d..af8828f 100644 --- a/examples/builder_example.py +++ b/examples/builder_example.py @@ -2,6 +2,7 @@ from substrait.builders.extended_expression import column, scalar_function, literal from substrait.builders.type import i64, boolean, struct, named_struct from substrait.extension_registry import ExtensionRegistry +from substrait.builders.display import pretty_print_plan registry = ExtensionRegistry(load_default_extensions=True) @@ -22,6 +23,7 @@ table = project(table, expressions=[column("id")]) print(table(registry)) +pretty_print_plan(table(registry), use_colors=True) """ extension_uris { diff --git a/src/substrait/builders/display.py b/src/substrait/builders/display.py new file mode 100644 index 0000000..15db9b1 --- /dev/null +++ b/src/substrait/builders/display.py @@ -0,0 +1,569 @@ +""" +Substrait Plan Pretty Printer + +This module provides a concise pretty printer for Substrait plans and expressions +in a readable format using indentation, -> characters, and colors. +""" + +import substrait.gen.proto.algebra_pb2 as stalg +import substrait.gen.proto.plan_pb2 as stp +import substrait.gen.proto.type_pb2 as stt +import substrait.gen.proto.extended_expression_pb2 as stee + + +# ANSI color codes +class Colors: + RESET = "\033[0m" + BOLD = "\033[1m" + RED = "\033[91m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + BLUE = "\033[94m" + MAGENTA = "\033[95m" + CYAN = "\033[96m" + GRAY = "\033[90m" + WHITE = "\033[97m" + + +class PlanPrinter: + """Concise pretty printer for Substrait plans and expressions""" + + def __init__( + self, indent_size: int = 2, show_metadata: bool = False, use_colors: bool = None + ): + self.indent_size = indent_size + self.show_metadata = show_metadata + self.schema_names = [] # Track current schema column names + + # Auto-detect color support if not specified + if use_colors is None: + self.use_colors = self._detect_color_support() + else: + self.use_colors = use_colors + + def _detect_color_support(self) -> bool: + """Detect if the terminal supports colors""" + import os + import sys + + # Check if we're in a terminal + if not hasattr(sys.stdout, "isatty") or not sys.stdout.isatty(): + return False + + # Check environment variables + if "NO_COLOR" in os.environ: + return False + + # Check if we're in a dumb terminal + term = os.environ.get("TERM", "") + if term == "dumb": + return False + + # Check if we're in Windows Command Prompt (which has limited color support) + if os.name == "nt" and "ANSICON" not in os.environ: + return False + + return True + + def _color(self, text: str, color: str) -> str: + """Apply color to text if colors are enabled""" + if self.use_colors: + return f"{color}{text}{Colors.RESET}" + return text + + def _indent_prefix(self, depth: int) -> str: + """Generate indentation prefix with -> character""" + if depth == 0: + return "" + return f"{Colors.GRAY if self.use_colors else ''}->{Colors.RESET if self.use_colors else ''}" + + def _get_indent_with_arrow(self, depth: int) -> str: + """Get proper indentation with arrow for alignment""" + base_indent = " " * (depth * self.indent_size) + if depth == 0: + return base_indent + # Ensure consistent spacing: base_indent + arrow + space + return f"{base_indent}{self._indent_prefix(depth)} " + + def _resolve_field_name(self, field_index: int) -> str: + """Resolve a field index to a column name if available""" + if 0 <= field_index < len(self.schema_names): + return self.schema_names[field_index] + return f"field_{field_index}" + + def print_plan(self, plan: stp.Plan) -> None: + """Print a Substrait plan to a string""" + output = self.stringify_plan(plan) + print(output) + + def print_expression(self, expression: stalg.Expression) -> None: + """Print a Substrait expression to a string""" + output = self.stringify_expression(expression) + print(output) + + def stringify_plan(self, plan: stp.Plan) -> str: + """Stringify a Substrait plan""" + import io + + stream = io.StringIO() + self._stream_plan(plan, stream, 0) + return stream.getvalue() + + def stringify_expression(self, expression: stalg.Expression) -> str: + """Stringify a Substrait expression""" + import io + + stream = io.StringIO() + self._stream_expression(expression, stream, 0) + return stream.getvalue() + + def _stream_plan(self, plan: stp.Plan, stream, depth: int): + """Print a plan concisely""" + indent = " " * (depth * self.indent_size) + + # Print relations (the main content) + for i, rel in enumerate(plan.relations): + if i > 0: + stream.write(f"{indent}{self._indent_prefix(depth)}\n") + self._stream_relation(rel, stream, depth) + + def _stream_relation(self, rel: stp.PlanRel, stream, depth: int): + """Print a plan relation concisely""" + indent = " " * (depth * self.indent_size) + + if rel.HasField("root"): + self._stream_rel_root(rel.root, stream, depth) + elif rel.HasField("rel"): + self._stream_rel(rel.rel, stream, depth) + + def _stream_rel_root(self, root: stalg.RelRoot, stream, depth: int): + """Print a relation root concisely""" + indent = " " * (depth * self.indent_size) + + # Show output names if they exist + if root.names: + stream.write( + f"{indent}{self._color('output:', Colors.CYAN)} {self._color(list(root.names), Colors.YELLOW)}\n" + ) + + # Print the input relation + self._stream_rel(root.input, stream, depth) + + def _stream_rel(self, rel: stalg.Rel, stream, depth: int): + """Print a relation concisely""" + indent = " " * (depth * self.indent_size) + + if rel.HasField("read"): + self._stream_read_rel(rel.read, stream, depth) + elif rel.HasField("filter"): + self._stream_filter_rel(rel.filter, stream, depth) + elif rel.HasField("project"): + self._stream_project_rel(rel.project, stream, depth) + elif rel.HasField("aggregate"): + self._stream_aggregate_rel(rel.aggregate, stream, depth) + elif rel.HasField("sort"): + self._stream_sort_rel(rel.sort, stream, depth) + elif rel.HasField("join"): + self._stream_join_rel(rel.join, stream, depth) + elif rel.HasField("cross"): + self._stream_cross_rel(rel.cross, stream, depth) + elif rel.HasField("fetch"): + self._stream_fetch_rel(rel.fetch, stream, depth) + else: + stream.write(f"{indent}\n") + + def _stream_read_rel(self, read: stalg.ReadRel, stream, depth: int): + """Print a read relation concisely""" + indent = " " * (depth * self.indent_size) + + if read.HasField("named_table"): + table_names = list(read.named_table.names) + stream.write( + f"{indent}{self._color('read:', Colors.GREEN)} {self._color(table_names[0] if len(table_names) == 1 else table_names, Colors.YELLOW)}\n" + ) + + if read.HasField("base_schema"): + # Capture schema names for field resolution + self.schema_names = list(read.base_schema.names) + if self.show_metadata: + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('schema:', Colors.BLUE)} {self._color(self.schema_names, Colors.YELLOW)}\n" + ) + + def _stream_filter_rel(self, filter_rel: stalg.FilterRel, stream, depth: int): + """Print a filter relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}{self._color('filter', Colors.RED)}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('input:', Colors.BLUE)}\n" + ) + self._stream_rel(filter_rel.input, stream, depth + 1) + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('condition:', Colors.BLUE)}\n" + ) + self._stream_expression(filter_rel.condition, stream, depth + 1) + + def _stream_project_rel(self, project: stalg.ProjectRel, stream, depth: int): + """Print a project relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}{self._color('project', Colors.MAGENTA)}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('input:', Colors.BLUE)}\n" + ) + self._stream_rel(project.input, stream, depth + 1) + + if project.expressions: + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('expr', Colors.BLUE)}({self._color(len(project.expressions), Colors.YELLOW)}):\n" + ) + for i, expr in enumerate(project.expressions): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'expr', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" + ) + self._stream_expression(expr, stream, depth + 2) + + def _stream_aggregate_rel(self, aggregate: stalg.AggregateRel, stream, depth: int): + """Print an aggregate relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}{self._color('aggregate', Colors.CYAN)}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('input:', Colors.BLUE)}\n" + ) + self._stream_rel(aggregate.input, stream, depth + 1) + + if aggregate.groupings: + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('groups:', Colors.BLUE)} {self._color(len(aggregate.groupings), Colors.YELLOW)}\n" + ) + + if aggregate.measures: + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('measures:', Colors.BLUE)} {self._color(len(aggregate.measures), Colors.YELLOW)}\n" + ) + + def _stream_sort_rel(self, sort: stalg.SortRel, stream, depth: int): + """Print a sort relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write( + f"{indent}{self._color('sort:', Colors.YELLOW)} {self._color(f'{len(sort.sorts)} fields', Colors.YELLOW)}\n" + ) + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('input:', Colors.BLUE)}\n" + ) + self._stream_rel(sort.input, stream, depth + 1) + + def _stream_join_rel(self, join: stalg.JoinRel, stream, depth: int): + """Print a join relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write( + f"{indent}{self._color('join:', Colors.GREEN)} {self._color(join.type, Colors.YELLOW)}\n" + ) + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('left:', Colors.BLUE)}\n" + ) + self._stream_rel(join.left, stream, depth + 1) + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('right:', Colors.BLUE)}\n" + ) + self._stream_rel(join.right, stream, depth + 1) + + if join.HasField("expression"): + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('on:', Colors.BLUE)}\n" + ) + self._stream_expression(join.expression, stream, depth + 1) + + def _stream_cross_rel(self, cross: stalg.CrossRel, stream, depth: int): + """Print a cross relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}{self._color('cross', Colors.GREEN)}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('left:', Colors.BLUE)}\n" + ) + self._stream_rel(cross.left, stream, depth + 1) + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('right:', Colors.BLUE)}\n" + ) + self._stream_rel(cross.right, stream, depth + 1) + + def _stream_fetch_rel(self, fetch: stalg.FetchRel, stream, depth: int): + """Print a fetch relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write( + f"{indent}{self._color('fetch:', Colors.YELLOW)} {self._color(f'offset={fetch.offset}, count={fetch.count}', Colors.YELLOW)}\n" + ) + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('input:', Colors.BLUE)}\n" + ) + self._stream_rel(fetch.input, stream, depth + 1) + + def _stream_expression(self, expression: stalg.Expression, stream, depth: int): + """Print an expression concisely""" + indent = " " * (depth * self.indent_size) + + if expression.HasField("literal"): + self._stream_literal(expression.literal, stream, depth) + elif expression.HasField("selection"): + self._stream_selection(expression.selection, stream, depth) + elif expression.HasField("scalar_function"): + self._stream_scalar_function(expression.scalar_function, stream, depth) + elif expression.HasField("cast"): + self._stream_cast(expression.cast, stream, depth) + elif expression.HasField("if_then"): + self._stream_if_then(expression.if_then, stream, depth) + elif expression.HasField("window_function"): + self._stream_window_function(expression.window_function, stream, depth) + else: + stream.write(f"{indent}\n") + + def _stream_literal(self, literal: stalg.Expression.Literal, stream, depth: int): + """Print a literal concisely""" + indent = " " * (depth * self.indent_size) + + if literal.HasField("boolean"): + stream.write(f"{indent}literal: {literal.boolean}\n") + elif literal.HasField("i32"): + stream.write(f"{indent}literal: {literal.i32}\n") + elif literal.HasField("i64"): + stream.write(f"{indent}literal: {literal.i64}\n") + elif literal.HasField("fp32"): + stream.write(f"{indent}literal: {literal.fp32}\n") + elif literal.HasField("fp64"): + stream.write(f"{indent}literal: {literal.fp64}\n") + elif literal.HasField("string"): + stream.write(f'{indent}literal: "{literal.string}"\n') + elif literal.HasField("date"): + stream.write(f"{indent}literal: date={literal.date}\n") + elif literal.HasField("timestamp"): + stream.write(f"{indent}literal: timestamp={literal.timestamp}\n") + else: + stream.write(f"{indent}literal: \n") + + def _stream_selection( + self, selection: stalg.Expression.FieldReference, stream, depth: int + ): + """Print a field reference concisely""" + indent = " " * (depth * self.indent_size) + + if selection.HasField("direct_reference"): + if selection.direct_reference.HasField("struct_field"): + field_idx = selection.direct_reference.struct_field.field + column_name = self._resolve_field_name(field_idx) + stream.write(f"{indent}field: {column_name}\n") + else: + stream.write(f"{indent}field: direct\n") + else: + stream.write(f"{indent}field: root\n") + + def _stream_scalar_function( + self, func: stalg.Expression.ScalarFunction, stream, depth: int + ): + """Print a scalar function concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}function: {func.function_reference}\n") + + # Print function arguments + if func.arguments: + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('args', Colors.BLUE)}({self._color(len(func.arguments), Colors.YELLOW)}):\n" + ) + for i, arg in enumerate(func.arguments): + # Get the argument content as a string without newlines + arg_content = self._get_function_argument_string(arg) + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]: {arg_content}\n" + ) + + # Print function options if present + if func.options: + stream.write(f"{indent}| options: {len(func.options)}\n") + for i, option in enumerate(func.options): + stream.write(f"{indent}| | {i}: {option.name}={option.value}\n") + + # Print output type if present + if func.HasField("output_type"): + stream.write( + f"{indent}| output_type: {self._type_to_string(func.output_type)}\n" + ) + + def _stream_cast(self, cast: stalg.Expression.Cast, stream, depth: int): + """Print a cast expression concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}cast\n") + stream.write(f"{indent}| input:\n") + self._stream_expression(cast.input, stream, depth + 1) + stream.write(f"{indent}| to: {self._type_to_string(cast.type)}\n") + + def _stream_if_then(self, if_then: stalg.Expression.IfThen, stream, depth: int): + """Print an if-then expression concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}if_then\n") + if if_then.ifs: + stream.write(f"{indent}| if:\n") + self._stream_expression(if_then.ifs[0].if_, stream, depth + 1) + stream.write(f"{indent}| then:\n") + self._stream_expression(if_then.ifs[0].then, stream, depth + 1) + + if if_then.HasField("else_"): + stream.write(f"{indent}| else:\n") + self._stream_expression(if_then.else_, stream, depth + 1) + + def _stream_window_function( + self, func: stalg.Expression.WindowFunction, stream, depth: int + ): + """Print a window function concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}window_function: {func.function_reference}\n") + if func.arguments: + stream.write(f"{indent}| args: {len(func.arguments)}\n") + for i, arg in enumerate(func.arguments): + stream.write(f"{indent}| | {i}:\n") + self._stream_expression(arg, stream, depth + 2) + + def _get_function_argument_string(self, arg) -> str: + """Get function argument content as a string without newlines""" + if hasattr(arg, "value") and arg.HasField("value"): + if arg.value.HasField("literal"): + # For simple literals, return the value directly + if arg.value.literal.HasField("boolean"): + return f"literal: {arg.value.literal.boolean}" + elif arg.value.literal.HasField("i32"): + return f"literal: {arg.value.literal.i32}" + elif arg.value.literal.HasField("i64"): + return f"literal: {arg.value.literal.i64}" + elif arg.value.literal.HasField("fp32"): + return f"literal: {arg.value.literal.fp32}" + elif arg.value.literal.HasField("fp64"): + return f"literal: {arg.value.literal.fp64}" + elif arg.value.literal.HasField("string"): + return f'literal: "{arg.value.literal.string}"' + elif arg.value.literal.HasField("date"): + return f"literal: date={arg.value.literal.date}" + elif arg.value.literal.HasField("timestamp"): + return f"literal: timestamp={arg.value.literal.timestamp}" + else: + return "literal: " + elif arg.value.HasField("selection"): + # For simple field references, return the column name if available + if arg.value.selection.HasField("direct_reference"): + if arg.value.selection.direct_reference.HasField("struct_field"): + field_idx = ( + arg.value.selection.direct_reference.struct_field.field + ) + column_name = self._resolve_field_name(field_idx) + return f"field: {column_name}" + else: + return "field: direct" + else: + return "field: root" + elif arg.value.HasField("scalar_function"): + return f"function: {arg.value.scalar_function.function_reference}" + elif arg.value.HasField("enum"): + return f"enum: {arg.value.enum}" + else: + return "" + else: + return "" + + def _stream_function_argument(self, arg, stream, depth: int): + """Print a function argument concisely""" + indent = " " * (depth * self.indent_size) + + # FunctionArgument can have different types of values + if hasattr(arg, "value") and arg.HasField("value"): + if arg.value.HasField("literal"): + # For simple literals, put the value on the same line + if arg.value.literal.HasField("boolean"): + stream.write(f"{indent}literal: {arg.value.literal.boolean}\n") + elif arg.value.literal.HasField("i32"): + stream.write(f"{indent}literal: {arg.value.literal.i32}\n") + elif arg.value.literal.HasField("i64"): + stream.write(f"{indent}literal: {arg.value.literal.i64}\n") + elif arg.value.literal.HasField("fp32"): + stream.write(f"{indent}literal: {arg.value.literal.fp32}\n") + elif arg.value.literal.HasField("fp64"): + stream.write(f"{indent}literal: {arg.value.literal.fp64}\n") + elif arg.value.literal.HasField("string"): + stream.write(f'{indent}literal: "{arg.value.literal.string}"\n') + elif arg.value.literal.HasField("date"): + stream.write(f"{indent}literal: date={arg.value.literal.date}\n") + elif arg.value.literal.HasField("timestamp"): + stream.write( + f"{indent}literal: timestamp={arg.value.literal.timestamp}\n" + ) + else: + stream.write(f"{indent}literal: \n") + elif arg.value.HasField("selection"): + # For simple field references, put the column name on the same line + if arg.value.selection.HasField("direct_reference"): + if arg.value.selection.direct_reference.HasField("struct_field"): + field_idx = ( + arg.value.selection.direct_reference.struct_field.field + ) + column_name = self._resolve_field_name(field_idx) + stream.write(f"{indent}field: {column_name}\n") + else: + stream.write(f"{indent}field: direct\n") + else: + stream.write(f"{indent}field: root\n") + elif arg.value.HasField("scalar_function"): + self._stream_scalar_function(arg.value.scalar_function, stream, depth) + elif arg.value.HasField("enum"): + stream.write(f"{indent}enum: {arg.value.enum}\n") + else: + stream.write(f"{indent}\n") + else: + stream.write(f"{indent}\n") + + def _type_to_string(self, type_info: stt.Type) -> str: + """Convert a type to a concise string representation""" + if type_info.HasField("bool"): + return f"bool({type_info.bool.nullability})" + elif type_info.HasField("i32"): + return f"i32({type_info.i32.nullability})" + elif type_info.HasField("i64"): + return f"i64({type_info.i64.nullability})" + elif type_info.HasField("fp32"): + return f"fp32({type_info.fp32.nullability})" + elif type_info.HasField("fp64"): + return f"fp64({type_info.fp64.nullability})" + elif type_info.HasField("string"): + return f"string({type_info.string.nullability})" + elif type_info.HasField("struct"): + return f"struct({len(type_info.struct.types)} fields)" + else: + return "" + + +# Convenience functions for easy usage +def pretty_print_plan( + plan: stp.Plan, + indent_size: int = 2, + show_metadata: bool = False, + use_colors: bool = False, +) -> None: + """Convenience function to print a Substrait plan concisely""" + printer = PlanPrinter(indent_size, show_metadata, use_colors) + printer.print_plan(plan) + + +def pretty_print_expression( + expression: stalg.Expression, + indent_size: int = 2, + show_metadata: bool = False, + use_colors: bool = False, +) -> None: + """Convenience function to print a Substrait expression concisely""" + printer = PlanPrinter(indent_size, show_metadata, use_colors) + printer.stringify_expression(expression) From 703fdc6f8faa5526004507635f5ae684d483586e Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Mon, 25 Aug 2025 13:07:32 +0200 Subject: [PATCH 02/12] Add Literal Map pretty print, add more advanced example --- examples/builder_example.py | 481 +++++++++++++++++++++++------- src/substrait/builders/display.py | 124 +++++++- 2 files changed, 494 insertions(+), 111 deletions(-) diff --git a/examples/builder_example.py b/examples/builder_example.py index af8828f..6cc1c77 100644 --- a/examples/builder_example.py +++ b/examples/builder_example.py @@ -1,109 +1,102 @@ -from substrait.builders.plan import read_named_table, project, filter -from substrait.builders.extended_expression import column, scalar_function, literal -from substrait.builders.type import i64, boolean, struct, named_struct +from substrait.builders.plan import ( + read_named_table, + project, + filter, + sort, + fetch, +) +from substrait.builders.extended_expression import ( + column, + scalar_function, + literal, +) +from substrait.builders.type import i64, boolean, struct, named_struct, fp64, string from substrait.extension_registry import ExtensionRegistry -from substrait.builders.display import pretty_print_plan +from substrait.builders.display import pretty_print_plan, pretty_print_expression +import substrait.gen.proto.algebra_pb2 as stalg registry = ExtensionRegistry(load_default_extensions=True) -ns = named_struct( - names=["id", "is_applicable"], struct=struct(types=[i64(nullable=False), boolean()]) -) -table = read_named_table("example_table", ns) -table = filter(table, expression=column("is_applicable")) -table = filter( - table, - expression=scalar_function( - "functions_comparison.yaml", - "lt", - expressions=[column("id"), literal(100, i64())], - ), -) -table = project(table, expressions=[column("id")]) - -print(table(registry)) -pretty_print_plan(table(registry), use_colors=True) - -""" -extension_uris { - extension_uri_anchor: 13 - uri: "functions_comparison.yaml" -} -extensions { - extension_function { - extension_uri_reference: 13 - function_anchor: 495 - name: "lt" +def basic_example(): + ns = named_struct( + names=["id", "is_applicable"], + struct=struct(types=[i64(nullable=False), boolean()]), + ) + + table = read_named_table("example_table", ns) + table = filter(table, expression=column("is_applicable")) + table = filter( + table, + expression=scalar_function( + "functions_comparison.yaml", + "lt", + expressions=[column("id"), literal(100, i64())], + ), + ) + table = project(table, expressions=[column("id")]) + + print(table(registry)) + pretty_print_plan(table(registry), use_colors=True) + + """ + extension_uris { + extension_uri_anchor: 13 + uri: "functions_comparison.yaml" + } + extensions { + extension_function { + extension_uri_reference: 13 + function_anchor: 495 + name: "lt" + } } -} -relations { - root { - input { - project { - common { - emit { - output_mapping: 2 + relations { + root { + input { + project { + common { + emit { + output_mapping: 2 + } } - } - input { - filter { - input { - filter { - input { - read { - common { - direct { + input { + filter { + input { + filter { + input { + read { + common { + direct { + } } - } - base_schema { - names: "id" - names: "is_applicable" - struct { - types { - i64 { - nullability: NULLABILITY_REQUIRED + base_schema { + names: "id" + names: "is_applicable" + struct { + types { + i64 { + nullability: NULLABILITY_REQUIRED + } } - } - types { - bool { - nullability: NULLABILITY_NULLABLE + types { + bool { + nullability: NULLABILITY_NULLABLE + } } + nullability: NULLABILITY_NULLABLE } - nullability: NULLABILITY_NULLABLE } - } - named_table { - names: "example_table" - } - } - } - condition { - selection { - direct_reference { - struct_field { - field: 1 + named_table { + names: "example_table" } } - root_reference { - } } - } - } - } - condition { - scalar_function { - function_reference: 495 - output_type { - bool { - nullability: NULLABILITY_NULLABLE - } - } - arguments { - value { + condition { selection { direct_reference { struct_field { + field: 1 } } root_reference { @@ -111,31 +104,313 @@ } } } - arguments { - value { - literal { - i64: 100 - nullable: true + } + condition { + scalar_function { + function_reference: 495 + output_type { + bool { + nullability: NULLABILITY_NULLABLE + } + } + arguments { + value { + selection { + direct_reference { + struct_field { + } + } + root_reference { + } + } + } + } + arguments { + value { + literal { + i64: 100 + nullable: true + } } } } } } } - } - expressions { - selection { - direct_reference { - struct_field { + expressions { + selection { + direct_reference { + struct_field { + } + } + root_reference { } - } - root_reference { } } } } + names: "id" } - names: "id" } -} -""" + """ + + +def advanced_example(): + print("=== Simple Example ===") + # Simple example (original) + ns = named_struct( + names=["id", "is_applicable"], + struct=struct(types=[i64(nullable=False), boolean()]), + ) + + table = read_named_table("example_table", ns) + table = filter(table, expression=column("is_applicable")) + table = filter( + table, + expression=scalar_function( + "functions_comparison.yaml", + "lt", + expressions=[column("id"), literal(100, i64())], + ), + ) + table = project(table, expressions=[column("id")]) + + print("Simple filtered table:") + pretty_print_plan(table(registry), use_colors=True) + + print("\n" + "=" * 50 + "\n") + + print("=== Scalar Function with Options Example ===") + # Example with scalar function that has options + users_schema = named_struct( + names=["user_id", "name", "age", "salary"], + struct=struct( + types=[ + i64(nullable=False), # user_id + string(nullable=False), # name + i64(nullable=False), # age + fp64(nullable=False), # salary + ] + ), + ) + + users = read_named_table("users", users_schema) + + # Filter users with age > 25 + adult_users = filter( + users, + expression=scalar_function( + "functions_comparison.yaml", + "gt", + expressions=[column("age"), literal(25, i64())], + ), + ) + + # Project with calculated fields + enriched_users = project( + adult_users, + expressions=[ + column("user_id"), + column("name"), + column("age"), + column("salary"), + # Add a calculated field (this would show function options if available) + scalar_function( + "functions_arithmetic.yaml", + "multiply", + expressions=[column("salary"), literal(1.1, fp64())], + alias="salary_with_bonus", + ), + ], + ) + + print("Users with age > 25 and calculated salary:") + pretty_print_plan(enriched_users(registry), use_colors=True) + + print("\n" + "=" * 50 + "\n") + + print("=== Sort and Fetch Example ===") + # Example with sort and fetch operations + orders_schema = named_struct( + names=["order_id", "amount", "status"], + struct=struct( + types=[ + i64(nullable=False), # order_id + fp64(nullable=False), # amount + string(nullable=False), # status + ] + ), + ) + + orders = read_named_table("orders", orders_schema) + + # Filter orders with amount > 50 + high_value_orders = filter( + orders, + expression=scalar_function( + "functions_comparison.yaml", + "gt", + expressions=[column("amount"), literal(50.0, fp64())], + ), + ) + + # Sort by amount descending + sorted_orders = sort( + high_value_orders, + expressions=[ + ( + column("amount"), + stalg.SortField.SORT_DIRECTION_DESC_NULLS_LAST, + ) # descending order + ], + ) + + # Limit to top 5 results + final_result = fetch( + sorted_orders, offset=literal(0, i64()), count=literal(5, i64()) + ) + + print("Top 5 high-value orders sorted by amount:") + pretty_print_plan(final_result(registry), use_colors=True) + + print("\n" + "=" * 50 + "\n") + + +def expression_only_example(): + print("=== Expression-Only Example ===") + # Show complex expression structure + complex_expr = scalar_function( + "functions_arithmetic.yaml", + "multiply", + expressions=[ + scalar_function( + "functions_arithmetic.yaml", + "add", + expressions=[ + column("base_salary"), + scalar_function( + "functions_arithmetic.yaml", + "multiply", + expressions=[ + column("base_salary"), + literal(0.15, fp64()), # 15% bonus + ], + ), + ], + ), + scalar_function( + "functions_arithmetic.yaml", + "subtract", + expressions=[ + literal(1.0, fp64()), + literal(0.25, fp64()), # 25% tax rate + ], + ), + ], + ) + + print("Complex salary calculation expression:") + # Create a simple plan to wrap the expression + dummy_schema = named_struct( + names=["base_salary"], struct=struct(types=[fp64(nullable=False)]) + ) + dummy_table = read_named_table("dummy", dummy_schema) + dummy_plan = project(dummy_table, expressions=[complex_expr]) + pretty_print_plan(dummy_plan(registry), use_colors=True) + + print("\n" + "=" * 50 + "\n") + + print("=== Manual Map Literal Example ===") + # Example with manually constructed map literal + + # Create a map literal manually + map_literal_expr = stalg.Expression( + literal=stalg.Expression.Literal( + map=stalg.Expression.Literal.Map( + key_values=[ + stalg.Expression.Literal.Map.KeyValue( + key=stalg.Expression.Literal(string="batch_size"), + value=stalg.Expression.Literal(i64=1000), + ), + stalg.Expression.Literal.Map.KeyValue( + key=stalg.Expression.Literal(string="timeout"), + value=stalg.Expression.Literal(i64=30), + ), + stalg.Expression.Literal.Map.KeyValue( + key=stalg.Expression.Literal(string="retry_count"), + value=stalg.Expression.Literal(i64=3), + ), + ] + ) + ) + ) + print("Simple map literal:") + print(f" Type: {type(map_literal_expr)}") + print(f" Has literal field: {map_literal_expr.HasField('literal')}") + if map_literal_expr.HasField("literal"): + print(f" Has map field: {map_literal_expr.literal.HasField('map')}") + if map_literal_expr.literal.HasField("map"): + print( + f" Map key_values count: {len(map_literal_expr.literal.map.key_values)}" + ) + + print("\nTesting pretty_print_expression:") + pretty_print_expression(map_literal_expr, use_colors=True) + + # Create a nested map literal to test recursion + nested_map_expr = stalg.Expression( + literal=stalg.Expression.Literal( + map=stalg.Expression.Literal.Map( + key_values=[ + stalg.Expression.Literal.Map.KeyValue( + key=stalg.Expression.Literal(string="config"), + value=stalg.Expression.Literal( + map=stalg.Expression.Literal.Map( + key_values=[ + stalg.Expression.Literal.Map.KeyValue( + key=stalg.Expression.Literal( + string="threshold" + ), + value=stalg.Expression.Literal(i64=50), + ), + stalg.Expression.Literal.Map.KeyValue( + key=stalg.Expression.Literal(string="enabled"), + value=stalg.Expression.Literal(boolean=True), + ), + ] + ) + ), + ), + stalg.Expression.Literal.Map.KeyValue( + key=stalg.Expression.Literal(string="version"), + value=stalg.Expression.Literal(string="1.0.0"), + ), + ] + ) + ) + ) + + print("\nNested map literal:") + print(f" Type: {type(nested_map_expr)}") + print(f" Has literal field: {nested_map_expr.HasField('literal')}") + if nested_map_expr.HasField("literal"): + print(f" Has map field: {nested_map_expr.literal.HasField('map')}") + if nested_map_expr.literal.HasField("map"): + print( + f" Map key_values count: {len(nested_map_expr.literal.map.key_values)}" + ) + # Check if it has nested maps + for i, kv in enumerate(nested_map_expr.literal.map.key_values): + if kv.value.HasField("map"): + print( + f" Key {i} has nested map with {len(kv.value.map.key_values)} entries" + ) + + print("\nTesting pretty_print_expression with nested map:") + pretty_print_expression(nested_map_expr, use_colors=True) + + +if __name__ == "__main__": + basic_example() + advanced_example() + expression_only_example() diff --git a/src/substrait/builders/display.py b/src/substrait/builders/display.py index 15db9b1..09901e7 100644 --- a/src/substrait/builders/display.py +++ b/src/substrait/builders/display.py @@ -343,6 +343,9 @@ def _stream_literal(self, literal: stalg.Expression.Literal, stream, depth: int) stream.write(f"{indent}literal: date={literal.date}\n") elif literal.HasField("timestamp"): stream.write(f"{indent}literal: timestamp={literal.timestamp}\n") + elif literal.HasField("map"): + stream.write(f"{indent}literal: map\n") + self._stream_map_literal(literal.map, stream, depth + 1) else: stream.write(f"{indent}literal: \n") @@ -376,17 +379,36 @@ def _stream_scalar_function( f"{self._get_indent_with_arrow(depth + 1)}{self._color('args', Colors.BLUE)}({self._color(len(func.arguments), Colors.YELLOW)}):\n" ) for i, arg in enumerate(func.arguments): - # Get the argument content as a string without newlines - arg_content = self._get_function_argument_string(arg) - stream.write( - f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]: {arg_content}\n" - ) + # Check if this is a nested scalar function + if ( + hasattr(arg, "value") + and arg.HasField("value") + and arg.value.HasField("scalar_function") + ): + # Recursively expand nested scalar functions + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" + ) + self._stream_scalar_function( + arg.value.scalar_function, stream, depth + 3 + ) + else: + # For simple arguments, show on the same line + arg_content = self._get_function_argument_string(arg) + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]: {arg_content}\n" + ) # Print function options if present if func.options: stream.write(f"{indent}| options: {len(func.options)}\n") for i, option in enumerate(func.options): - stream.write(f"{indent}| | {i}: {option.name}={option.value}\n") + # Handle preference as a list + if hasattr(option, "preference") and option.preference: + pref_str = f"[{', '.join(str(p) for p in option.preference)}]" + else: + pref_str = "[]" + stream.write(f"{indent}| | {i}: {option.name}={pref_str}\n") # Print output type if present if func.HasField("output_type"): @@ -452,6 +474,9 @@ def _get_function_argument_string(self, arg) -> str: return f"literal: date={arg.value.literal.date}" elif arg.value.literal.HasField("timestamp"): return f"literal: timestamp={arg.value.literal.timestamp}" + elif arg.value.literal.HasField("map"): + # For maps, we'll handle them specially in the main printing + return "" else: return "literal: " elif arg.value.HasField("selection"): @@ -468,7 +493,9 @@ def _get_function_argument_string(self, arg) -> str: else: return "field: root" elif arg.value.HasField("scalar_function"): - return f"function: {arg.value.scalar_function.function_reference}" + # For nested scalar functions, we'll handle them specially in the main printing + # Return a placeholder that indicates it needs recursive expansion + return "" elif arg.value.HasField("enum"): return f"enum: {arg.value.enum}" else: @@ -502,6 +529,10 @@ def _stream_function_argument(self, arg, stream, depth: int): stream.write( f"{indent}literal: timestamp={arg.value.literal.timestamp}\n" ) + elif arg.value.literal.HasField("map"): + # Handle map literals with proper indentation + stream.write(f"{indent}literal: map\n") + self._stream_map_literal(arg.value.literal.map, stream, depth + 1) else: stream.write(f"{indent}literal: \n") elif arg.value.HasField("selection"): @@ -526,6 +557,83 @@ def _stream_function_argument(self, arg, stream, depth: int): else: stream.write(f"{indent}\n") + def _stream_map_literal( + self, map_literal: stalg.Expression.Literal.Map, stream, depth: int + ): + """Print a map literal with proper indentation""" + indent = " " * (depth * self.indent_size) + + if map_literal.key_values: + stream.write( + f"{indent}-> {self._color('key_values', Colors.BLUE)}({self._color(len(map_literal.key_values), Colors.YELLOW)}):\n" + ) + for i, kv in enumerate(map_literal.key_values): + stream.write( + f"{indent} -> {self._color('key_values', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" + ) + stream.write( + f"{indent} -> {self._color('key', Colors.BLUE)}: {self._color(kv.key.string, Colors.GREEN)}\n" + ) + stream.write(f"{indent} -> {self._color('value', Colors.BLUE)}:\n") + self._stream_literal_value(kv.value, stream, depth + 2) + else: + stream.write(f"{indent}-> {self._color('empty map', Colors.YELLOW)}\n") + + def _stream_literal_value( + self, literal: stalg.Expression.Literal, stream, depth: int + ): + """Print a literal value with proper indentation""" + indent = " " * (depth * self.indent_size) + + if literal.HasField("boolean"): + stream.write( + f"{indent}{self._color('boolean', Colors.BLUE)}: {self._color(literal.boolean, Colors.GREEN)}\n" + ) + elif literal.HasField("i32"): + stream.write( + f"{indent}{self._color('i32', Colors.BLUE)}: {self._color(literal.i32, Colors.GREEN)}\n" + ) + elif literal.HasField("i64"): + stream.write( + f"{indent}{self._color('i64', Colors.BLUE)}: {self._color(literal.i64, Colors.GREEN)}\n" + ) + elif literal.HasField("fp32"): + stream.write( + f"{indent}{self._color('fp32', Colors.BLUE)}: {self._color(literal.fp32, Colors.GREEN)}\n" + ) + elif literal.HasField("fp64"): + stream.write( + f"{indent}{self._color('fp64', Colors.BLUE)}: {self._color(literal.fp64, Colors.GREEN)}\n" + ) + elif literal.HasField("string"): + stream.write( + f"{indent}{self._color('string', Colors.BLUE)}: {self._color(f'"{literal.string}"', Colors.GREEN)}\n" + ) + elif literal.HasField("date"): + stream.write( + f"{indent}{self._color('date', Colors.BLUE)}: {self._color(literal.date, Colors.GREEN)}\n" + ) + elif literal.HasField("timestamp"): + stream.write( + f"{indent}{self._color('timestamp', Colors.BLUE)}: {self._color(literal.timestamp, Colors.GREEN)}\n" + ) + elif literal.HasField("map"): + # Recursively handle nested maps + stream.write(f"{indent}{self._color('map', Colors.BLUE)}:\n") + self._stream_map_literal(literal.map, stream, depth + 1) + elif literal.HasField("list"): + # Handle list literals + stream.write( + f"{indent}{self._color('list', Colors.BLUE)}({self._color(len(literal.list.values), Colors.YELLOW)}):\n" + ) + for i, item in enumerate(literal.list.values): + stream.write(f"{indent} -> {self._color(f'{i}', Colors.CYAN)}:\n") + self._stream_literal_value(item, stream, depth + 2) + else: + stream.write( + f"{indent}{self._color('', Colors.RED)}\n" + ) + def _type_to_string(self, type_info: stt.Type) -> str: """Convert a type to a concise string representation""" if type_info.HasField("bool"): @@ -566,4 +674,4 @@ def pretty_print_expression( ) -> None: """Convenience function to print a Substrait expression concisely""" printer = PlanPrinter(indent_size, show_metadata, use_colors) - printer.stringify_expression(expression) + printer.print_expression(expression) From 9cb346d98382b20a086254543f72f90303aaed60 Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Mon, 25 Aug 2025 13:17:45 +0200 Subject: [PATCH 03/12] replace old | indent with -> indent --- src/substrait/builders/display.py | 38 +++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/substrait/builders/display.py b/src/substrait/builders/display.py index 09901e7..15e2c7a 100644 --- a/src/substrait/builders/display.py +++ b/src/substrait/builders/display.py @@ -401,19 +401,23 @@ def _stream_scalar_function( # Print function options if present if func.options: - stream.write(f"{indent}| options: {len(func.options)}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('options', Colors.BLUE)}: {self._color(len(func.options), Colors.YELLOW)}\n" + ) for i, option in enumerate(func.options): # Handle preference as a list if hasattr(option, "preference") and option.preference: pref_str = f"[{', '.join(str(p) for p in option.preference)}]" else: pref_str = "[]" - stream.write(f"{indent}| | {i}: {option.name}={pref_str}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'{i}', Colors.CYAN)}: {option.name}={pref_str}\n" + ) # Print output type if present if func.HasField("output_type"): stream.write( - f"{indent}| output_type: {self._type_to_string(func.output_type)}\n" + f"{self._get_indent_with_arrow(depth + 1)}{self._color('output_type', Colors.BLUE)}: {self._type_to_string(func.output_type)}\n" ) def _stream_cast(self, cast: stalg.Expression.Cast, stream, depth: int): @@ -421,9 +425,13 @@ def _stream_cast(self, cast: stalg.Expression.Cast, stream, depth: int): indent = " " * (depth * self.indent_size) stream.write(f"{indent}cast\n") - stream.write(f"{indent}| input:\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('input', Colors.BLUE)}:\n" + ) self._stream_expression(cast.input, stream, depth + 1) - stream.write(f"{indent}| to: {self._type_to_string(cast.type)}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('to', Colors.BLUE)}: {self._type_to_string(cast.type)}\n" + ) def _stream_if_then(self, if_then: stalg.Expression.IfThen, stream, depth: int): """Print an if-then expression concisely""" @@ -431,13 +439,19 @@ def _stream_if_then(self, if_then: stalg.Expression.IfThen, stream, depth: int): stream.write(f"{indent}if_then\n") if if_then.ifs: - stream.write(f"{indent}| if:\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('if', Colors.BLUE)}:\n" + ) self._stream_expression(if_then.ifs[0].if_, stream, depth + 1) - stream.write(f"{indent}| then:\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('then', Colors.BLUE)}:\n" + ) self._stream_expression(if_then.ifs[0].then, stream, depth + 1) if if_then.HasField("else_"): - stream.write(f"{indent}| else:\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('else', Colors.BLUE)}:\n" + ) self._stream_expression(if_then.else_, stream, depth + 1) def _stream_window_function( @@ -448,9 +462,13 @@ def _stream_window_function( stream.write(f"{indent}window_function: {func.function_reference}\n") if func.arguments: - stream.write(f"{indent}| args: {len(func.arguments)}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('args', Colors.BLUE)}: {self._color(len(func.arguments), Colors.YELLOW)}\n" + ) for i, arg in enumerate(func.arguments): - stream.write(f"{indent}| | {i}:\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'{i}', Colors.CYAN)}:\n" + ) self._stream_expression(arg, stream, depth + 2) def _get_function_argument_string(self, arg) -> str: From 99152420b0de7dd71e5fd6946947777dc4c4fadf Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Mon, 25 Aug 2025 13:20:18 +0200 Subject: [PATCH 04/12] Fix recursion instead of placeholder --- src/substrait/builders/display.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/substrait/builders/display.py b/src/substrait/builders/display.py index 15e2c7a..0c20b87 100644 --- a/src/substrait/builders/display.py +++ b/src/substrait/builders/display.py @@ -393,11 +393,11 @@ def _stream_scalar_function( arg.value.scalar_function, stream, depth + 3 ) else: - # For simple arguments, show on the same line - arg_content = self._get_function_argument_string(arg) + # Always show the full recursive output for all argument types stream.write( - f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]: {arg_content}\n" + f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" ) + self._stream_function_argument(arg, stream, depth + 3) # Print function options if present if func.options: From 1070cb1a862cde68ed5d3884e1371e27fc718f5e Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Mon, 25 Aug 2025 14:41:52 +0200 Subject: [PATCH 05/12] Fix CI lint & format --- src/substrait/builders/display.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/substrait/builders/display.py b/src/substrait/builders/display.py index 0c20b87..3d7b3e7 100644 --- a/src/substrait/builders/display.py +++ b/src/substrait/builders/display.py @@ -129,7 +129,6 @@ def _stream_plan(self, plan: stp.Plan, stream, depth: int): def _stream_relation(self, rel: stp.PlanRel, stream, depth: int): """Print a plan relation concisely""" - indent = " " * (depth * self.indent_size) if rel.HasField("root"): self._stream_rel_root(rel.root, stream, depth) @@ -220,7 +219,7 @@ def _stream_project_rel(self, project: stalg.ProjectRel, stream, depth: int): ) for i, expr in enumerate(project.expressions): stream.write( - f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'expr', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" + f"{self._get_indent_with_arrow(depth + 2)}{self._color('expr', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" ) self._stream_expression(expr, stream, depth + 2) @@ -387,7 +386,7 @@ def _stream_scalar_function( ): # Recursively expand nested scalar functions stream.write( - f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" + f"{self._get_indent_with_arrow(depth + 2)}{self._color('args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" ) self._stream_scalar_function( arg.value.scalar_function, stream, depth + 3 @@ -395,7 +394,7 @@ def _stream_scalar_function( else: # Always show the full recursive output for all argument types stream.write( - f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" + f"{self._get_indent_with_arrow(depth + 2)}{self._color('args', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" ) self._stream_function_argument(arg, stream, depth + 3) From 2afdf8b1866da8f534209fac2a9e0e6cba42cdd1 Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Tue, 26 Aug 2025 10:13:10 +0200 Subject: [PATCH 06/12] Move display to utils folder;move existing utils to folder as init module to keep architcture retro compatible --- examples/builder_example.py | 2 +- src/substrait/{utils.py => utils/__init__.py} | 4 ++++ src/substrait/{builders => utils}/display.py | 0 3 files changed, 5 insertions(+), 1 deletion(-) rename src/substrait/{utils.py => utils/__init__.py} (97%) rename src/substrait/{builders => utils}/display.py (100%) diff --git a/examples/builder_example.py b/examples/builder_example.py index 6cc1c77..ced6e7e 100644 --- a/examples/builder_example.py +++ b/examples/builder_example.py @@ -12,7 +12,7 @@ ) from substrait.builders.type import i64, boolean, struct, named_struct, fp64, string from substrait.extension_registry import ExtensionRegistry -from substrait.builders.display import pretty_print_plan, pretty_print_expression +from substrait.utils.display import pretty_print_plan, pretty_print_expression import substrait.gen.proto.algebra_pb2 as stalg registry = ExtensionRegistry(load_default_extensions=True) diff --git a/src/substrait/utils.py b/src/substrait/utils/__init__.py similarity index 97% rename from src/substrait/utils.py rename to src/substrait/utils/__init__.py index bd84381..0a8e5b4 100644 --- a/src/substrait/utils.py +++ b/src/substrait/utils/__init__.py @@ -1,3 +1,7 @@ +""" +Utility and debugging functions for Substrait. +""" + import substrait.gen.proto.type_pb2 as stp import substrait.gen.proto.extensions.extensions_pb2 as ste from typing import Iterable diff --git a/src/substrait/builders/display.py b/src/substrait/utils/display.py similarity index 100% rename from src/substrait/builders/display.py rename to src/substrait/utils/display.py From 0b0b59229ce0585b5f6ec95e9c7346c810140d51 Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Tue, 26 Aug 2025 10:28:55 +0200 Subject: [PATCH 07/12] lint --- src/substrait/utils/display.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/substrait/utils/display.py b/src/substrait/utils/display.py index 3d7b3e7..2a0a2ea 100644 --- a/src/substrait/utils/display.py +++ b/src/substrait/utils/display.py @@ -8,7 +8,6 @@ import substrait.gen.proto.algebra_pb2 as stalg import substrait.gen.proto.plan_pb2 as stp import substrait.gen.proto.type_pb2 as stt -import substrait.gen.proto.extended_expression_pb2 as stee # ANSI color codes From 67d7ebae6c5579b476a1d1bc427f53513ac1de19 Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Thu, 28 Aug 2025 07:41:45 +0000 Subject: [PATCH 08/12] Add single extension rel, multi extenion rel --- src/substrait/utils/display.py | 80 ++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/substrait/utils/display.py b/src/substrait/utils/display.py index 3d7b3e7..a2fc51e 100644 --- a/src/substrait/utils/display.py +++ b/src/substrait/utils/display.py @@ -168,6 +168,10 @@ def _stream_rel(self, rel: stalg.Rel, stream, depth: int): self._stream_cross_rel(rel.cross, stream, depth) elif rel.HasField("fetch"): self._stream_fetch_rel(rel.fetch, stream, depth) + elif rel.HasField("extension_single"): + self._stream_extension_single_rel(rel.extension_single, stream, depth) + elif rel.HasField("extension_multi"): + self._stream_extension_multi_rel(rel.extension_multi, stream, depth) else: stream.write(f"{indent}\n") @@ -303,6 +307,82 @@ def _stream_fetch_rel(self, fetch: stalg.FetchRel, stream, depth: int): ) self._stream_rel(fetch.input, stream, depth + 1) + def _stream_extension_single_rel( + self, extension: stalg.ExtensionSingleRel, stream, depth: int + ): + """Print an extension single relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}{self._color('extension_single', Colors.MAGENTA)}\n") + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('input:', Colors.BLUE)}\n" + ) + self._stream_rel(extension.input, stream, depth + 1) + + if extension.HasField("detail"): + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('detail:', Colors.BLUE)}\n" + ) + # Try to unpack and display the detail if it's an Expression + try: + from google.protobuf import any_pb2 + + detail = extension.detail + if detail.type_url and detail.value: + # Try to unpack as Expression + expression = stalg.Expression() + detail.Unpack(expression) + self._stream_expression(expression, stream, depth + 2) + else: + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}\n" + ) + except Exception: + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}\n" + ) + + def _stream_extension_multi_rel( + self, extension: stalg.ExtensionMultiRel, stream, depth: int + ): + """Print an extension multi relation concisely""" + indent = " " * (depth * self.indent_size) + + stream.write(f"{indent}{self._color('extension_multi', Colors.MAGENTA)}\n") + + if extension.inputs: + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('inputs', Colors.BLUE)}({self._color(len(extension.inputs), Colors.YELLOW)}):\n" + ) + for i, input_rel in enumerate(extension.inputs): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color('input', Colors.BLUE)}[{self._color(f'{i}', Colors.CYAN)}]:\n" + ) + self._stream_rel(input_rel, stream, depth + 3) + + if extension.HasField("detail"): + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('detail:', Colors.BLUE)}\n" + ) + # Try to unpack and display the detail if it's an Expression + try: + from google.protobuf import any_pb2 + + detail = extension.detail + if detail.type_url and detail.value: + # Try to unpack as Expression + expression = stalg.Expression() + detail.Unpack(expression) + self._stream_expression(expression, stream, depth + 2) + else: + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}\n" + ) + except Exception: + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}\n" + ) + def _stream_expression(self, expression: stalg.Expression, stream, depth: int): """Print an expression concisely""" indent = " " * (depth * self.indent_size) From fb6a2cf7ab2b4ed8539c9c05038e91b3c8b08f64 Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Thu, 28 Aug 2025 08:34:50 +0000 Subject: [PATCH 09/12] Add virtual table & struct values --- src/substrait/utils/display.py | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/src/substrait/utils/display.py b/src/substrait/utils/display.py index f123e9d..42ee73f 100644 --- a/src/substrait/utils/display.py +++ b/src/substrait/utils/display.py @@ -183,6 +183,29 @@ def _stream_read_rel(self, read: stalg.ReadRel, stream, depth: int): stream.write( f"{indent}{self._color('read:', Colors.GREEN)} {self._color(table_names[0] if len(table_names) == 1 else table_names, Colors.YELLOW)}\n" ) + elif read.HasField("virtual_table"): + stream.write( + f"{indent}{self._color('read:', Colors.GREEN)} {self._color('virtual_table', Colors.YELLOW)}\n" + ) + if read.virtual_table.values: + stream.write( + f"{self._get_indent_with_arrow(depth + 1)}{self._color('values:', Colors.BLUE)} {self._color(len(read.virtual_table.values), Colors.YELLOW)}\n" + ) + # Show the actual values, not just count + for i, value in enumerate(read.virtual_table.values): + # Handle struct values properly + if hasattr(value, "fields"): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}value[{i}]: " + ) + self._stream_struct_literal( + value, stream, depth + 2, inline=True + ) + else: + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}value[{i}]: " + ) + self._stream_literal_value(value, stream, depth + 2) if read.HasField("base_schema"): # Capture schema names for field resolution @@ -730,6 +753,53 @@ def _stream_literal_value( f"{indent}{self._color('', Colors.RED)}\n" ) + def _stream_struct_literal( + self, struct_literal, stream, depth: int, inline: bool = False + ): + """Print a struct literal value with proper indentation""" + if inline: + # When inline, don't add extra indentation since we're already on the same line + indent = "" + else: + indent = " " * (depth * self.indent_size) + + if hasattr(struct_literal, "fields") and struct_literal.fields: + stream.write(f"{indent}{self._color('struct', Colors.BLUE)}\n") + for i, field in enumerate(struct_literal.fields): + # Show field index + stream.write(f"{self._get_indent_with_arrow(depth + 1)}field[{i}]:\n") + # Show the actual field value with proper indentation + if hasattr(field, "i64"): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color('i64', Colors.BLUE)}: {self._color(field.i64, Colors.GREEN)}\n" + ) + elif hasattr(field, "fp64"): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color('fp64', Colors.BLUE)}: {self._color(field.fp64, Colors.GREEN)}\n" + ) + elif hasattr(field, "fp32"): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color('fp32', Colors.BLUE)}: {self._color(field.fp32, Colors.GREEN)}\n" + ) + elif hasattr(field, "i32"): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color('i32', Colors.BLUE)}: {self._color(field.i32, Colors.GREEN)}\n" + ) + elif hasattr(field, "string"): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color('string', Colors.BLUE)}: {self._color(f'"{field.string}"', Colors.GREEN)}\n" + ) + elif hasattr(field, "boolean"): + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color('boolean', Colors.BLUE)}: {self._color(field.boolean, Colors.GREEN)}\n" + ) + else: + stream.write( + f"{self._get_indent_with_arrow(depth + 2)}{self._color('', Colors.RED)}\n" + ) + else: + stream.write(f"{indent}{self._color('empty_struct', Colors.YELLOW)}\n") + def _type_to_string(self, type_info: stt.Type) -> str: """Convert a type to a concise string representation""" if type_info.HasField("bool"): From abb01e4822a15f1f5da935c391da76c856e01110 Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Thu, 28 Aug 2025 14:05:03 +0200 Subject: [PATCH 10/12] fix unused import --- src/substrait/utils/display.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/substrait/utils/display.py b/src/substrait/utils/display.py index 42ee73f..aae0ad6 100644 --- a/src/substrait/utils/display.py +++ b/src/substrait/utils/display.py @@ -347,8 +347,6 @@ def _stream_extension_single_rel( ) # Try to unpack and display the detail if it's an Expression try: - from google.protobuf import any_pb2 - detail = extension.detail if detail.type_url and detail.value: # Try to unpack as Expression @@ -388,8 +386,6 @@ def _stream_extension_multi_rel( ) # Try to unpack and display the detail if it's an Expression try: - from google.protobuf import any_pb2 - detail = extension.detail if detail.type_url and detail.value: # Try to unpack as Expression From dfa053c4992d43f10d30143c40081427704212ef Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Mon, 1 Sep 2025 16:56:58 +0200 Subject: [PATCH 11/12] hasattr -> HasField --- src/substrait/utils/display.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/substrait/utils/display.py b/src/substrait/utils/display.py index aae0ad6..eeff803 100644 --- a/src/substrait/utils/display.py +++ b/src/substrait/utils/display.py @@ -765,27 +765,27 @@ def _stream_struct_literal( # Show field index stream.write(f"{self._get_indent_with_arrow(depth + 1)}field[{i}]:\n") # Show the actual field value with proper indentation - if hasattr(field, "i64"): + if field.HasField("i64"): stream.write( f"{self._get_indent_with_arrow(depth + 2)}{self._color('i64', Colors.BLUE)}: {self._color(field.i64, Colors.GREEN)}\n" ) - elif hasattr(field, "fp64"): + elif field.HasField("fp64"): stream.write( f"{self._get_indent_with_arrow(depth + 2)}{self._color('fp64', Colors.BLUE)}: {self._color(field.fp64, Colors.GREEN)}\n" ) - elif hasattr(field, "fp32"): + elif field.HasField("fp32"): stream.write( f"{self._get_indent_with_arrow(depth + 2)}{self._color('fp32', Colors.BLUE)}: {self._color(field.fp32, Colors.GREEN)}\n" ) - elif hasattr(field, "i32"): + elif field.HasField("i32"): stream.write( f"{self._get_indent_with_arrow(depth + 2)}{self._color('i32', Colors.BLUE)}: {self._color(field.i32, Colors.GREEN)}\n" ) - elif hasattr(field, "string"): + elif field.HasField("string"): stream.write( f"{self._get_indent_with_arrow(depth + 2)}{self._color('string', Colors.BLUE)}: {self._color(f'"{field.string}"', Colors.GREEN)}\n" ) - elif hasattr(field, "boolean"): + elif field.HasField("boolean"): stream.write( f"{self._get_indent_with_arrow(depth + 2)}{self._color('boolean', Colors.BLUE)}: {self._color(field.boolean, Colors.GREEN)}\n" ) From e9886d5c0fc84510abc4cfc3c1ef151b11997c10 Mon Sep 17 00:00:00 2001 From: Regis Caillaud Date: Sat, 6 Sep 2025 02:40:32 +0200 Subject: [PATCH 12/12] Fix simply typo --- src/substrait/utils/display.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substrait/utils/display.py b/src/substrait/utils/display.py index eeff803..0b5d3c1 100644 --- a/src/substrait/utils/display.py +++ b/src/substrait/utils/display.py @@ -566,7 +566,7 @@ def _stream_window_function( stream.write( f"{self._get_indent_with_arrow(depth + 2)}{self._color(f'{i}', Colors.CYAN)}:\n" ) - self._stream_expression(arg, stream, depth + 2) + self._stream_function_argument(arg, stream, depth + 2) def _get_function_argument_string(self, arg) -> str: """Get function argument content as a string without newlines"""