From aad096b1befaf5bc9cde174c1f53424b3b8a958a Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Thu, 15 Jun 2023 10:08:32 -0400 Subject: [PATCH] feat: package extension yamls --- .gitattributes | 1 + gen_proto.sh | 8 + src/substrait/extensions/extension_types.yaml | 10 + .../functions_aggregate_approx.yaml | 18 + .../functions_aggregate_generic.yaml | 37 + .../extensions/functions_arithmetic.yaml | 1588 +++++++++++++++++ .../functions_arithmetic_decimal.yaml | 151 ++ .../extensions/functions_boolean.yaml | 140 ++ .../extensions/functions_comparison.yaml | 216 +++ .../extensions/functions_datetime.yaml | 690 +++++++ .../extensions/functions_logarithmic.yaml | 147 ++ .../extensions/functions_rounding.yaml | 270 +++ src/substrait/extensions/functions_set.yaml | 27 + .../extensions/functions_string.yaml | 1397 +++++++++++++++ src/substrait/extensions/type_variations.yaml | 25 + src/substrait/extensions/unknown.yaml | 66 + 16 files changed, 4791 insertions(+) create mode 100644 src/substrait/extensions/extension_types.yaml create mode 100644 src/substrait/extensions/functions_aggregate_approx.yaml create mode 100644 src/substrait/extensions/functions_aggregate_generic.yaml create mode 100644 src/substrait/extensions/functions_arithmetic.yaml create mode 100644 src/substrait/extensions/functions_arithmetic_decimal.yaml create mode 100644 src/substrait/extensions/functions_boolean.yaml create mode 100644 src/substrait/extensions/functions_comparison.yaml create mode 100644 src/substrait/extensions/functions_datetime.yaml create mode 100644 src/substrait/extensions/functions_logarithmic.yaml create mode 100644 src/substrait/extensions/functions_rounding.yaml create mode 100644 src/substrait/extensions/functions_set.yaml create mode 100644 src/substrait/extensions/functions_string.yaml create mode 100644 src/substrait/extensions/type_variations.yaml create mode 100644 src/substrait/extensions/unknown.yaml diff --git a/.gitattributes b/.gitattributes index 439fb8f..7928937 
100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ src/substrait/gen/** linguist-generated=true +src/substrait/extensions/** linguist-generated=true diff --git a/gen_proto.sh b/gen_proto.sh index 5d61caa..04939b1 100755 --- a/gen_proto.sh +++ b/gen_proto.sh @@ -7,6 +7,7 @@ submodule_dir=./third_party/substrait src_dir="$submodule_dir"/proto tmp_dir=./buf_work_dir dest_dir=./src/substrait/gen +extension_dir=./src/substrait/extensions # Prefix the protobuf files with a unique configuration to prevent namespace conflicts # with other substrait packages. Save output to the work dir. @@ -19,5 +20,12 @@ rm -rf "$dest_dir" buf generate protol --in-place --create-package --python-out "$dest_dir" buf +# Remove the old extension files +rm -rf "$extension_dir" + +# Copy over new yaml files +cp -fr "$submodule_dir"/extensions "$extension_dir" +find "$extension_dir" -type f -exec chmod u+rw {} + + # Remove the temporary work dir rm -rf "$tmp_dir" diff --git a/src/substrait/extensions/extension_types.yaml b/src/substrait/extensions/extension_types.yaml new file mode 100644 index 0000000..e03073c --- /dev/null +++ b/src/substrait/extensions/extension_types.yaml @@ -0,0 +1,10 @@ +--- +types: + - name: point + structure: + latitude: i32 + longitude: i32 + - name: line + structure: + start: point + end: point diff --git a/src/substrait/extensions/functions_aggregate_approx.yaml b/src/substrait/extensions/functions_aggregate_approx.yaml new file mode 100644 index 0000000..c77caec --- /dev/null +++ b/src/substrait/extensions/functions_aggregate_approx.yaml @@ -0,0 +1,18 @@ +%YAML 1.2 +--- +aggregate_functions: + - name: "approx_count_distinct" + description: >- + Calculates the approximate number of rows that contain distinct values of the expression argument using + HyperLogLog. This function provides an alternative to the COUNT (DISTINCT expression) function, which + returns the exact number of rows that contain distinct values of an expression. 
APPROX_COUNT_DISTINCT + processes large amounts of data significantly faster than COUNT, with negligible deviation from the exact + result. + impls: + - args: + - name: x + value: any + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: binary + return: i64 diff --git a/src/substrait/extensions/functions_aggregate_generic.yaml b/src/substrait/extensions/functions_aggregate_generic.yaml new file mode 100644 index 0000000..4d891e9 --- /dev/null +++ b/src/substrait/extensions/functions_aggregate_generic.yaml @@ -0,0 +1,37 @@ +%YAML 1.2 +--- +aggregate_functions: + - name: "count" + description: Count a set of values + impls: + - args: + - name: x + value: any + options: + overflow: + values: [SILENT, SATURATE, ERROR] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64 + return: i64 + - name: "count" + description: "Count a set of records (not field referenced)" + impls: + - options: + overflow: + values: [SILENT, SATURATE, ERROR] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64 + return: i64 + - name: "any_value" + description: > + Selects an arbitrary value from a group of values. + + If the input is empty, the function returns null. + impls: + - args: + - name: x + value: any + nullability: DECLARED_OUTPUT + return: any? diff --git a/src/substrait/extensions/functions_arithmetic.yaml b/src/substrait/extensions/functions_arithmetic.yaml new file mode 100644 index 0000000..61573e8 --- /dev/null +++ b/src/substrait/extensions/functions_arithmetic.yaml @@ -0,0 +1,1588 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: "add" + description: "Add two values." 
+ impls: + - args: + - name: x + value: i8 + - name: y + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i8 + - args: + - name: x + value: i16 + - name: y + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i16 + - args: + - name: x + value: i32 + - name: y + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i32 + - args: + - value: i64 + - value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i64 + - args: + - name: x + value: fp32 + - name: y + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp32 + - args: + - name: x + value: fp64 + - name: y + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "subtract" + description: "Subtract one value from another." + impls: + - args: + - name: x + value: i8 + - name: y + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i8 + - args: + - name: x + value: i16 + - name: y + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i16 + - args: + - name: x + value: i32 + - name: y + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i32 + - args: + - name: x + value: i64 + - name: y + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i64 + - args: + - name: x + value: fp32 + - name: y + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp32 + - args: + - name: x + value: fp64 + - name: y + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "multiply" + description: "Multiply two values." 
+ impls: + - args: + - name: x + value: i8 + - name: y + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i8 + - args: + - name: x + value: i16 + - name: y + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i16 + - args: + - name: x + value: i32 + - name: y + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i32 + - args: + - name: x + value: i64 + - name: y + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i64 + - args: + - name: x + value: fp32 + - name: y + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp32 + - args: + - name: x + value: fp64 + - name: y + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "divide" + description: > + Divide x by y. In the case of integer division, partial values are truncated (i.e. rounded towards 0). + The `on_division_by_zero` option governs behavior in cases where y is 0 and x is not 0. + `LIMIT` means positive or negative infinity (depending on the sign of x and y). + If x and y are both 0 or both +/-infinity, behavior will be governed by `on_domain_error`. 
+ impls: + - args: + - name: x + value: i8 + - name: y + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i8 + - args: + - name: x + value: i16 + - name: y + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i16 + - args: + - name: x + value: i32 + - name: y + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i32 + - args: + - name: x + value: i64 + - name: y + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i64 + - args: + - name: x + value: fp32 + - name: y + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_division_by_zero: + values: [ LIMIT, NAN, ERROR ] + return: fp32 + - args: + - name: x + value: fp64 + - name: y + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_division_by_zero: + values: [ LIMIT, NAN, ERROR ] + return: fp64 + - + name: "negate" + description: "Negation of the value" + impls: + - args: + - name: x + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i8 + - args: + - name: x + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i16 + - args: + - name: x + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i32 + - args: + - name: x + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i64 + - args: + - name: x + value: fp32 + return: fp32 + - args: + - name: x + value: fp64 + return: fp64 + - + name: "modulus" + description: "Get the remainder when dividing one value by another." 
+ impls: + - args: + - name: x + value: i8 + - name: y + value: i8 + return: i8 + - args: + - name: x + value: i16 + - name: y + value: i16 + return: i16 + - args: + - name: x + value: i32 + - name: y + value: i32 + return: i32 + - args: + - name: x + value: i64 + - name: y + value: i64 + return: i64 + - + name: "power" + description: "Take the power with x as the base and y as exponent." + impls: + - args: + - name: x + value: i64 + - name: y + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i64 + - args: + - name: x + value: fp32 + - name: y + value: fp32 + return: fp32 + - args: + - name: x + value: fp64 + - name: y + value: fp64 + return: fp64 + - + name: "sqrt" + description: "Square root of the value" + impls: + - args: + - name: x + value: i64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - + name: "exp" + description: "The mathematical constant e, raised to the power of the value." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "cos" + description: "Get the cosine of a value in radians." 
+ impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "sin" + description: "Get the sine of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "tan" + description: "Get the tangent of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "cosh" + description: "Get the hyperbolic cosine of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "sinh" + description: "Get the hyperbolic sine of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "tanh" + description: "Get the hyperbolic tangent of a value in radians." 
+ impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "acos" + description: "Get the arccosine of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - + name: "asin" + description: "Get the arcsine of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - + name: "atan" + description: "Get the arctangent of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "acosh" + description: "Get the hyperbolic arccosine of a value in radians." 
+ impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - + name: "asinh" + description: "Get the hyperbolic arcsine of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + return: fp64 + - + name: "atanh" + description: "Get the hyperbolic arctangent of a value in radians." + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - + name: "atan2" + description: "Get the arctangent of values given as x/y pairs." + impls: + - args: + - name: x + value: fp32 + - name: y + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - args: + - name: x + value: fp64 + - name: y + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + return: fp64 + - + name: "abs" + description: > + Calculate the absolute value of the argument. + + Integer values allow the specification of overflow behavior to handle the + unevenness of the twos complement, e.g. 
Int8 range [-128 : 127]. + impls: + - args: + - name: x + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i8 + - args: + - name: x + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i16 + - args: + - name: x + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i32 + - args: + - name: x + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i64 + - args: + - name: x + value: fp32 + return: fp32 + - args: + - name: x + value: fp64 + return: fp64 + - + name: "sign" + description: > + Return the signedness of the argument. + + Integer values return signedness with the same type as the input. + Possible return values are [-1, 0, 1] + + Floating point values return signedness with the same type as the input. + Possible return values are [-1.0, -0.0, 0.0, 1.0, NaN] + impls: + - args: + - name: x + value: i8 + return: i8 + - args: + - name: x + value: i16 + return: i16 + - args: + - name: x + value: i32 + return: i32 + - args: + - name: x + value: i64 + return: i64 + - args: + - name: x + value: fp32 + return: fp32 + - args: + - name: x + value: fp64 + return: fp64 + - + name: "factorial" + description: > + Return the factorial of a given integer input. + + The factorial of 0! is 1 by convention. + + Negative inputs will raise an error. + impls: + - args: + - value: i32 + name: "n" + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i32 + - args: + - value: i64 + name: "n" + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: i64 + - + name: "bitwise_not" + description: > + Return the bitwise NOT result for one integer input. 
+ + impls: + - args: + - name: x + value: i8 + return: i8 + - args: + - name: x + value: i16 + return: i16 + - args: + - name: x + value: i32 + return: i32 + - args: + - name: x + value: i64 + return: i64 + - + name: "bitwise_and" + description: > + Return the bitwise AND result for two integer inputs. + + impls: + - args: + - name: x + value: i8 + - name: y + value: i8 + return: i8 + - args: + - name: x + value: i16 + - name: y + value: i16 + return: i16 + - args: + - name: x + value: i32 + - name: y + value: i32 + return: i32 + - args: + - name: x + value: i64 + - name: y + value: i64 + return: i64 + - + name: "bitwise_or" + description: > + Return the bitwise OR result for two given integer inputs. + + impls: + - args: + - name: x + value: i8 + - name: y + value: i8 + return: i8 + - args: + - name: x + value: i16 + - name: y + value: i16 + return: i16 + - args: + - name: x + value: i32 + - name: y + value: i32 + return: i32 + - args: + - name: x + value: i64 + - name: y + value: i64 + return: i64 + - + name: "bitwise_xor" + description: > + Return the bitwise XOR result for two integer inputs. + + impls: + - args: + - name: x + value: i8 + - name: y + value: i8 + return: i8 + - args: + - name: x + value: i16 + - name: y + value: i16 + return: i16 + - args: + - name: x + value: i32 + - name: y + value: i32 + return: i32 + - args: + - name: x + value: i64 + - name: y + value: i64 + return: i64 + +aggregate_functions: + - name: "sum" + description: Sum a set of values. The sum of zero elements yields null. + impls: + - args: + - name: x + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64? + return: i64? + - args: + - name: x + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64? + return: i64? 
+ - args: + - name: x + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64? + return: i64? + - args: + - name: x + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64? + return: i64? + - args: + - name: x + value: fp32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: fp64? + return: fp64? + - args: + - name: x + value: fp64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: fp64? + return: fp64? + - name: "sum0" + description: > + Sum a set of values. The sum of zero elements yields zero. + + Null values are ignored. + impls: + - args: + - name: x + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64 + return: i64 + - args: + - name: x + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64 + return: i64 + - args: + - name: x + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64 + return: i64 + - args: + - name: x + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64 + return: i64 + - args: + - name: x + value: fp32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: fp64 + return: fp64 + - args: + - name: x + value: fp64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: fp64 + return: fp64 + - name: "avg" + 
description: Average a set of values. For integral types, this truncates partial values. + impls: + - args: + - name: x + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "STRUCT" + return: i8? + - args: + - name: x + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "STRUCT" + return: i16? + - args: + - name: x + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "STRUCT" + return: i32? + - args: + - name: x + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "STRUCT" + return: i64? + - args: + - name: x + value: fp32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "STRUCT" + return: fp32? + - args: + - name: x + value: fp64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "STRUCT" + return: fp64? + - name: "min" + description: Min a set of values. + impls: + - args: + - name: x + value: i8 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i8? + return: i8? + - args: + - name: x + value: i16 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i16? + return: i16? + - args: + - name: x + value: i32 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i32? + return: i32? + - args: + - name: x + value: i64 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64? + return: i64? + - args: + - name: x + value: fp32 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: fp32? + return: fp32? 
+ - args: + - name: x + value: fp64 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: fp64? + return: fp64? + - name: "max" + description: Max a set of values. + impls: + - args: + - name: x + value: i8 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i8? + return: i8? + - args: + - name: x + value: i16 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i16? + return: i16? + - args: + - name: x + value: i32 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i32? + return: i32? + - args: + - name: x + value: i64 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64? + return: i64? + - args: + - name: x + value: fp32 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: fp32? + return: fp32? + - args: + - name: x + value: fp64 + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: fp64? + return: fp64? + - name: "product" + description: Product of a set of values. Returns 1 for empty input. 
+ impls: + - args: + - name: x + value: i8 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: MIRROR + decomposable: MANY + intermediate: i64 + return: i8 + - args: + - name: x + value: i16 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: MIRROR + decomposable: MANY + intermediate: i64 + return: i16 + - args: + - name: x + value: i32 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: MIRROR + decomposable: MANY + intermediate: i64 + return: i32 + - args: + - name: x + value: i64 + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: MIRROR + decomposable: MANY + intermediate: i64 + return: i64 + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: MIRROR + decomposable: MANY + intermediate: fp64 + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: MIRROR + decomposable: MANY + intermediate: fp64 + return: fp64 + - name: "std_dev" + description: Calculates standard-deviation for a set of values. + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + distribution: + values: [ SAMPLE, POPULATION] + nullability: DECLARED_OUTPUT + return: fp32? + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + distribution: + values: [ SAMPLE, POPULATION] + nullability: DECLARED_OUTPUT + return: fp64? + - name: "variance" + description: Calculates variance for a set of values. + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + distribution: + values: [ SAMPLE, POPULATION] + nullability: DECLARED_OUTPUT + return: fp32? 
+ - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + distribution: + values: [ SAMPLE, POPULATION] + nullability: DECLARED_OUTPUT + return: fp64? + - name: "corr" + description: > + Calculates the value of Pearson's correlation coefficient between `x` and `y`. + If there is no input, null is returned. + impls: + - args: + - name: x + value: fp32 + - name: y + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + return: fp32? + - args: + - name: x + value: fp64 + - name: y + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + return: fp64? + - name: "mode" + description: > + Calculates mode for a set of values. + If there is no input, null is returned. + impls: + - args: + - name: x + value: i8 + nullability: DECLARED_OUTPUT + return: i8? + - args: + - name: x + value: i16 + nullability: DECLARED_OUTPUT + return: i16? + - args: + - name: x + value: i32 + nullability: DECLARED_OUTPUT + return: i32? + - args: + - name: x + value: i64 + nullability: DECLARED_OUTPUT + return: i64? + - args: + - name: x + value: fp32 + nullability: DECLARED_OUTPUT + return: fp32? + - args: + - name: x + value: fp64 + nullability: DECLARED_OUTPUT + return: fp64? + - name: "median" + description: > + Calculate the median for a set of values. + + Returns null if applied to zero records. For the integer implementations, + the rounding option determines how the median should be rounded if it ends + up midway between two values. For the floating point implementations, + they specify the usual floating point rounding mode. 
+ impls: + - args: + - name: precision + description: > + Based on required operator performance and configured optimizations + on saving memory bandwidth, the precision of the end result can be + the highest possible accuracy or an approximation. + + - EXACT: provides the exact result, rounded if needed according + to the rounding option. + - APPROXIMATE: provides only an estimate; the result must lie + between the minimum and maximum values in the input + (inclusive), but otherwise the accuracy is left up to the + consumer. + options: [ EXACT, APPROXIMATE ] + - name: x + value: i8 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + return: i8? + - args: + - name: precision + description: > + Based on required operator performance and configured optimizations + on saving memory bandwidth, the precision of the end result can be + the highest possible accuracy or an approximation. + + - EXACT: provides the exact result, rounded if needed according + to the rounding option. + - APPROXIMATE: provides only an estimate; the result must lie + between the minimum and maximum values in the input + (inclusive), but otherwise the accuracy is left up to the + consumer. + options: [ EXACT, APPROXIMATE ] + - name: x + value: i16 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + return: i16? + - args: + - name: precision + description: > + Based on required operator performance and configured optimizations + on saving memory bandwidth, the precision of the end result can be + the highest possible accuracy or an approximation. + + - EXACT: provides the exact result, rounded if needed according + to the rounding option. + - APPROXIMATE: provides only an estimate; the result must lie + between the minimum and maximum values in the input + (inclusive), but otherwise the accuracy is left up to the + consumer. 
+ options: [ EXACT, APPROXIMATE ] + - name: x + value: i32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + return: i32? + - args: + - name: precision + description: > + Based on required operator performance and configured optimizations + on saving memory bandwidth, the precision of the end result can be + the highest possible accuracy or an approximation. + + - EXACT: provides the exact result, rounded if needed according + to the rounding option. + - APPROXIMATE: provides only an estimate; the result must lie + between the minimum and maximum values in the input + (inclusive), but otherwise the accuracy is left up to the + consumer. + options: [ EXACT, APPROXIMATE ] + - name: x + value: i64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + return: i64? + - args: + - name: precision + description: > + Based on required operator performance and configured optimizations + on saving memory bandwidth, the precision of the end result can be + the highest possible accuracy or an approximation. + + - EXACT: provides the exact result, rounded if needed according + to the rounding option. + - APPROXIMATE: provides only an estimate; the result must lie + between the minimum and maximum values in the input + (inclusive), but otherwise the accuracy is left up to the + consumer. + options: [ EXACT, APPROXIMATE ] + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + return: fp32? + - args: + - name: precision + description: > + Based on required operator performance and configured optimizations + on saving memory bandwidth, the precision of the end result can be + the highest possible accuracy or an approximation. + + - EXACT: provides the exact result, rounded if needed according + to the rounding option. 
+ - APPROXIMATE: provides only an estimate; the result must lie + between the minimum and maximum values in the input + (inclusive), but otherwise the accuracy is left up to the + consumer. + options: [ EXACT, APPROXIMATE ] + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + return: fp64? + - name: "quantile" + description: > + Calculates quantiles for a set of values. + + This function will divide the aggregated values (passed via the + distribution argument) over N equally-sized bins, where N is passed + via a constant argument. It will then return the values at the + boundaries of these bins in list form. If the input is appropriately + sorted, this computes the quantiles of the distribution. + + The function can optionally return the first and/or last element of + the input, as specified by the `boundaries` argument. If the input is + appropriately sorted, this will thus be the minimum and/or maximum + values of the distribution. + + When the boundaries do not lie exactly on elements of the incoming + distribution, the function will interpolate between the two nearby + elements. If the interpolated value cannot be represented exactly, + the `rounding` option controls how the value should be selected or + computed. + + The function fails and returns null in the following cases: + - `n` is null or less than one; + - any value in `distribution` is null. + + The function returns an empty list if `n` equals 1 and `boundaries` is + set to `NEITHER`. + + impls: + - args: + - name: boundaries + description: > + Which boundaries to include. For NEITHER, the output will have + n-1 elements, for MINIMUM and MAXIMUM it will have n elements, + and for BOTH it will have n+1 elements. 
+ options: [ NEITHER, MINIMUM, MAXIMUM, BOTH ] + - name: precision + description: > + Based on required operator performance and configured optimizations + on saving memory bandwidth, the precision of the end result can be + the highest possible accuracy or an approximation. + + - EXACT: provides the exact result, rounded if needed according + to the rounding option. + - APPROXIMATE: provides only an estimate; the result must lie + between the minimum and maximum values in the input + (inclusive), but otherwise the accuracy is left up to the + consumer. + options: [ EXACT, APPROXIMATE ] + - value: i64 + constant: true + name: n + description: > + A positive integer which defines the number of quantile + partitions. + - value: any + name: distribution + description: > + The data for which the quantiles should be computed. + options: + rounding: + description: > + When a boundary is computed to lie somewhere between two values, + and this value cannot be exactly represented, this specifies how + to round it. For floating point numbers, it specifies the IEEE + 754 rounding mode (as it does for all other floating point + operations). For integer types: + + - TIE_TO_EVEN: round to nearest value; if exactly halfway, tie + to the even option. + - TIE_AWAY_FROM_ZERO: round to nearest value; if exactly + halfway, tie away from zero. + - TRUNCATE: always round toward zero. + - CEILING: always round toward positive infinity. + - FLOOR: always round toward negative infinity. + + For non-numeric types, the behavior is the same as for integer + types, but applied to the index of the value in distribution. + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + nullability: DECLARED_OUTPUT + ordered: true + return: LIST?<any> + +window_functions: + - name: "row_number" + description: "the number of the current row within its partition." + impls: + - args: [] + nullability: DECLARED_OUTPUT + decomposable: NONE + return: i64?
+ window_type: PARTITION + - name: "rank" + description: "the rank of the current row, with gaps." + impls: + - args: [] + nullability: DECLARED_OUTPUT + decomposable: NONE + return: i64? + window_type: PARTITION + - name: "dense_rank" + description: "the rank of the current row, without gaps." + impls: + - args: [] + nullability: DECLARED_OUTPUT + decomposable: NONE + return: i64? + window_type: PARTITION + - name: "percent_rank" + description: "the relative rank of the current row." + impls: + - args: [] + nullability: DECLARED_OUTPUT + decomposable: NONE + return: fp64? + window_type: PARTITION + - name: "cume_dist" + description: "the cumulative distribution." + impls: + - args: [] + nullability: DECLARED_OUTPUT + decomposable: NONE + return: fp64? + window_type: PARTITION + - name: "ntile" + description: "Return an integer ranging from 1 to the argument value, dividing the partition as equally as possible." + impls: + - args: + - name: x + value: i32 + nullability: DECLARED_OUTPUT + decomposable: NONE + return: i32? + window_type: PARTITION + - args: + - name: x + value: i64 + nullability: DECLARED_OUTPUT + decomposable: NONE + return: i64? + window_type: PARTITION diff --git a/src/substrait/extensions/functions_arithmetic_decimal.yaml b/src/substrait/extensions/functions_arithmetic_decimal.yaml new file mode 100644 index 0000000..0fc4caa --- /dev/null +++ b/src/substrait/extensions/functions_arithmetic_decimal.yaml @@ -0,0 +1,151 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: "add" + description: "Add two decimal values." + impls: + - args: + - name: x + value: decimal<P1,S1> + - name: y + value: decimal<P2,S2> + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: |- + init_scale = max(S1,S2) + init_prec = init_scale + max(P1 - S1, P2 - S2) + 1 + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec, 38) + scale_after_borrow = max(init_scale - delta, min_scale) + scale = init_prec > 38 ?
scale_after_borrow : init_scale + DECIMAL<prec, scale> + - + name: "subtract" + impls: + - args: + - name: x + value: decimal<P1,S1> + - name: y + value: decimal<P2,S2> + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: |- + init_scale = max(S1,S2) + init_prec = init_scale + max(P1 - S1, P2 - S2) + 1 + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec, 38) + scale_after_borrow = max(init_scale - delta, min_scale) + scale = init_prec > 38 ? scale_after_borrow : init_scale + DECIMAL<prec, scale> + - + name: "multiply" + impls: + - args: + - name: x + value: decimal<P1,S1> + - name: y + value: decimal<P2,S2> + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: |- + init_scale = S1 + S2 + init_prec = P1 + P2 + 1 + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec, 38) + scale_after_borrow = max(init_scale - delta, min_scale) + scale = init_prec > 38 ? scale_after_borrow : init_scale + DECIMAL<prec, scale> + - + name: "divide" + impls: + - args: + - name: x + value: decimal<P1,S1> + - name: y + value: decimal<P2,S2> + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: |- + init_scale = max(6, S1 + P2 + 1) + init_prec = P1 - S1 + P2 + init_scale + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec, 38) + scale_after_borrow = max(init_scale - delta, min_scale) + scale = init_prec > 38 ? scale_after_borrow : init_scale + DECIMAL<prec, scale> + - + name: "modulus" + impls: + - args: + - name: x + value: decimal<P1,S1> + - name: y + value: decimal<P2,S2> + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + return: |- + init_scale = max(S1,S2) + init_prec = min(P1 - S1, P2 - S2) + init_scale + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec, 38) + scale_after_borrow = max(init_scale - delta, min_scale) + scale = init_prec > 38 ? scale_after_borrow : init_scale + DECIMAL<prec, scale> +aggregate_functions: + - name: "sum" + description: Sum a set of values.
+ impls: + - args: + - name: x + value: "DECIMAL<P,S>" + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "DECIMAL?<38,S>" + return: "DECIMAL?<38,S>" + - name: "avg" + description: Average a set of values. + impls: + - args: + - name: x + value: "DECIMAL<P,S>" + options: + overflow: + values: [ SILENT, SATURATE, ERROR ] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "STRUCT<DECIMAL<38,S>,i64>" + return: "DECIMAL<38,S>" + - name: "min" + description: Min a set of values. + impls: + - args: + - name: x + value: "DECIMAL<P,S>" + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "DECIMAL?<P,S>" + return: "DECIMAL?<P,S>" + - name: "max" + description: Max a set of values. + impls: + - args: + - name: x + value: "DECIMAL<P,S>" + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: "DECIMAL?<P,S>" + return: "DECIMAL?<P,S>" diff --git a/src/substrait/extensions/functions_boolean.yaml b/src/substrait/extensions/functions_boolean.yaml new file mode 100644 index 0000000..22ae296 --- /dev/null +++ b/src/substrait/extensions/functions_boolean.yaml @@ -0,0 +1,140 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: or + description: > + The boolean `or` using Kleene logic. + + This function behaves as follows with nulls: + + true or null = true + + null or true = true + + false or null = null + + null or false = null + + null or null = null + + In other words, in this context a null value really means "unknown", and + an unknown value `or` true is always true. + + Behavior for 0 or 1 inputs is as follows: + or() -> false + or(x) -> x + impls: + - args: + - value: boolean? + name: a + variadic: + min: 0 + return: boolean? + - + name: and + description: > + The boolean `and` using Kleene logic.
+ + This function behaves as follows with nulls: + + true and null = null + + null and true = null + + false and null = false + + null and false = false + + null and null = null + + In other words, in this context a null value really means "unknown", and + an unknown value `and` false is always false. + + Behavior for 0 or 1 inputs is as follows: + and() -> true + and(x) -> x + impls: + - args: + - value: boolean? + name: a + variadic: + min: 0 + return: boolean? + - + name: and_not + description: > + The boolean `and` of one value and the negation of the other using Kleene logic. + + This function behaves as follows with nulls: + + true and not null = null + + null and not false = null + + false and not null = false + + null and not true = false + + null and not null = null + + In other words, in this context a null value really means "unknown", and + an unknown value `and not` true is always false, as is false `and not` an + unknown value. + impls: + - args: + - value: boolean? + name: a + - value: boolean? + name: b + return: boolean? + - + name: xor + description: > + The boolean `xor` of two values using Kleene logic. + + When a null is encountered in either input, a null is output. + impls: + - args: + - value: boolean? + name: a + - value: boolean? + name: b + return: boolean? + - + name: not + description: > + The `not` of a boolean value. + + When a null is input, a null is output. + impls: + - args: + - value: boolean? + name: a + return: boolean? + +aggregate_functions: + - + name: "bool_and" + description: > + If any value in the input is false, false is returned. If the input is + empty or only contains nulls, null is returned. Otherwise, true is + returned. + impls: + - args: + - value: boolean + name: a + nullability: DECLARED_OUTPUT + return: boolean? + - + name: "bool_or" + description: > + If any value in the input is true, true is returned. If the input is + empty or only contains nulls, null is returned. Otherwise, false is + returned. 
+ impls: + - args: + - value: boolean + name: a + nullability: DECLARED_OUTPUT + return: boolean? diff --git a/src/substrait/extensions/functions_comparison.yaml b/src/substrait/extensions/functions_comparison.yaml new file mode 100644 index 0000000..7d11f3c --- /dev/null +++ b/src/substrait/extensions/functions_comparison.yaml @@ -0,0 +1,216 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: "not_equal" + description: > + Whether two values are not_equal. + + `not_equal(x, y) := (x != y)` + + If either/both of `x` and `y` are `null`, `null` is returned. + impls: + - args: + - value: any1 + name: x + - value: any1 + name: y + return: BOOLEAN + - + name: "equal" + description: > + Whether two values are equal. + + `equal(x, y) := (x == y)` + + If either/both of `x` and `y` are `null`, `null` is returned. + impls: + - args: + - value: any1 + name: x + - value: any1 + name: y + return: BOOLEAN + - + name: "is_not_distinct_from" + description: > + Whether two values are equal. + + This function treats `null` values as comparable, so + + `is_not_distinct_from(null, null) == True` + + This is in contrast to `equal`, in which `null` values do not compare. + impls: + - args: + - value: any1 + name: x + - value: any1 + name: y + return: BOOLEAN + - + name: "lt" + description: > + Less than. + + lt(x, y) := (x < y) + + If either/both of `x` and `y` are `null`, `null` is returned. + impls: + - args: + - value: any1 + name: x + - value: any1 + name: y + return: BOOLEAN + - + name: "gt" + description: > + Greater than. + + gt(x, y) := (x > y) + + If either/both of `x` and `y` are `null`, `null` is returned. + impls: + - args: + - value: any1 + name: x + - value: any1 + name: y + return: BOOLEAN + - + name: "lte" + description: > + Less than or equal to. + + lte(x, y) := (x <= y) + + If either/both of `x` and `y` are `null`, `null` is returned. 
+ impls: + - args: + - value: any1 + name: x + - value: any1 + name: y + return: BOOLEAN + - + name: "gte" + description: > + Greater than or equal to. + + gte(x, y) := (x >= y) + + If either/both of `x` and `y` are `null`, `null` is returned. + impls: + - args: + - value: any1 + name: x + - value: any1 + name: y + return: BOOLEAN + - + name: "between" + description: >- + Whether the `expression` is greater than or equal to `low` and less than or equal to `high`. + + `expression` BETWEEN `low` AND `high` + + If `low`, `high`, or `expression` are `null`, `null` is returned. + impls: + - args: + - value: any1 + name: expression + description: The expression to test for in the range defined by `low` and `high`. + - value: any1 + name: low + description: The value to check if greater than or equal to. + - value: any1 + name: high + description: The value to check if less than or equal to. + return: BOOLEAN + - + name: "is_null" + description: Whether a value is null. NaN is not null. + impls: + - args: + - value: any1 + name: x + return: BOOLEAN + nullability: DECLARED_OUTPUT + - + name: "is_not_null" + description: Whether a value is not null. NaN is not null. + impls: + - args: + - value: any1 + name: x + return: BOOLEAN + nullability: DECLARED_OUTPUT + - + name: "is_nan" + description: > + Whether a value is not a number. + + If `x` is `null`, `null` is returned. + impls: + - args: + - value: fp32 + name: x + return: BOOLEAN + - args: + - value: fp64 + name: x + return: BOOLEAN + - + name: "is_finite" + description: > + Whether a value is finite (neither infinite nor NaN). + + If `x` is `null`, `null` is returned. + impls: + - args: + - value: fp32 + name: x + return: BOOLEAN + - args: + - value: fp64 + name: x + return: BOOLEAN + - + name: "is_infinite" + description: > + Whether a value is infinite. + + If `x` is `null`, `null` is returned. 
+ impls: + - args: + - value: fp32 + name: x + return: BOOLEAN + - args: + - value: fp64 + name: x + return: BOOLEAN + - + name: "nullif" + description: If two values are equal, return null. Otherwise, return the first value. + impls: + - args: + - value: any1 + name: x + - value: any1 + name: y + return: any1 + - + name: "coalesce" + description: >- + Evaluate arguments from left to right and return the first argument that is not null. Once + a non-null argument is found, the remaining arguments are not evaluated. + + If all arguments are null, return null. + impls: + - args: + - value: any1 + variadic: + min: 2 + return: any1 diff --git a/src/substrait/extensions/functions_datetime.yaml b/src/substrait/extensions/functions_datetime.yaml new file mode 100644 index 0000000..60e563f --- /dev/null +++ b/src/substrait/extensions/functions_datetime.yaml @@ -0,0 +1,690 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: extract + description: >- + Extract portion of a date/time value. + * YEAR Return the year. + * ISO_YEAR Return the ISO 8601 week-numbering year. First week of an ISO year has the majority (4 or more) of + its days in January. + * US_YEAR Return the US epidemiological year. First week of US epidemiological year has the majority (4 or more) + of its days in January. Last week of US epidemiological year has the year's last Wednesday in it. US + epidemiological week starts on Sunday. + * QUARTER Return the number of the quarter within the year. January 1 through March 31 map to the first quarter, + April 1 through June 30 map to the second quarter, etc. + * MONTH Return the number of the month within the year. + * DAY Return the number of the day within the month. + * DAY_OF_YEAR Return the number of the day within the year. January 1 maps to the first day, February 1 maps to + the thirty-second day, etc. + * MONDAY_DAY_OF_WEEK Return the number of the day within the week, from Monday (first day) to Sunday (seventh + day). 
+ * SUNDAY_DAY_OF_WEEK Return the number of the day within the week, from Sunday (first day) to Saturday (seventh + day). + * MONDAY_WEEK Return the number of the week within the year. First week starts on first Monday of January. + * SUNDAY_WEEK Return the number of the week within the year. First week starts on first Sunday of January. + * ISO_WEEK Return the number of the ISO week within the ISO year. First ISO week has the majority (4 or more) + of its days in January. ISO week starts on Monday. + * US_WEEK Return the number of the US week within the US year. First US week has the majority (4 or more) of + its days in January. US week starts on Sunday. + * HOUR Return the hour (0-23). + * MINUTE Return the minute (0-59). + * SECOND Return the second (0-59). + * MILLISECOND Return number of milliseconds since the last full second. + * MICROSECOND Return number of microseconds since the last full millisecond. + * SUBSECOND Return number of microseconds since the last full second of the given timestamp. + * UNIX_TIME Return number of seconds that have elapsed since 1970-01-01 00:00:00 UTC, ignoring leap seconds. + * TIMEZONE_OFFSET Return number of seconds of timezone offset to UTC. + + The range of values returned for QUARTER, MONTH, DAY, DAY_OF_YEAR, MONDAY_DAY_OF_WEEK, SUNDAY_DAY_OF_WEEK, + MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, and US_WEEK depends on whether counting starts at 1 or 0. This is governed + by the indexing option. 
+ + When indexing is ONE: + * QUARTER returns values in range 1-4 + * MONTH returns values in range 1-12 + * DAY returns values in range 1-31 + * DAY_OF_YEAR returns values in range 1-366 + * MONDAY_DAY_OF_WEEK and SUNDAY_DAY_OF_WEEK return values in range 1-7 + * MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, and US_WEEK return values in range 1-53 + + When indexing is ZERO: + * QUARTER returns values in range 0-3 + * MONTH returns values in range 0-11 + * DAY returns values in range 0-30 + * DAY_OF_YEAR returns values in range 0-365 + * MONDAY_DAY_OF_WEEK and SUNDAY_DAY_OF_WEEK return values in range 0-6 + * MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, and US_WEEK return values in range 0-52 + + The indexing option must be specified when the component is QUARTER, MONTH, DAY, DAY_OF_YEAR, + MONDAY_DAY_OF_WEEK, SUNDAY_DAY_OF_WEEK, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, or US_WEEK. The + indexing option cannot be specified when the component is YEAR, ISO_YEAR, US_YEAR, HOUR, MINUTE, SECOND, + MILLISECOND, MICROSECOND, SUBSECOND, UNIX_TIME, or TIMEZONE_OFFSET. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. + impls: + - args: + - name: component + options: [ YEAR, ISO_YEAR, US_YEAR, HOUR, MINUTE, SECOND, + MILLISECOND, MICROSECOND, SUBSECOND, UNIX_TIME, TIMEZONE_OFFSET ] + description: The part of the value to extract. + - name: x + value: timestamp_tz + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: i64 + - args: + - name: component + options: [ YEAR, ISO_YEAR, US_YEAR, HOUR, MINUTE, SECOND, + MILLISECOND, MICROSECOND, SUBSECOND, UNIX_TIME ] + description: The part of the value to extract. + - name: x + value: timestamp + return: i64 + - args: + - name: component + options: [ YEAR, ISO_YEAR, US_YEAR, UNIX_TIME ] + description: The part of the value to extract. 
+ - name: x + value: date + return: i64 + - args: + - name: component + options: [ HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, SUBSECOND ] + description: The part of the value to extract. + - name: x + value: time + return: i64 + - args: + - name: component + options: [ QUARTER, MONTH, DAY, DAY_OF_YEAR, MONDAY_DAY_OF_WEEK, + SUNDAY_DAY_OF_WEEK, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, US_WEEK ] + description: The part of the value to extract. + - name: indexing + options: [ ONE, ZERO ] + description: Start counting from 1 or 0. + - name: x + value: timestamp_tz + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: i64 + - args: + - name: component + options: [ QUARTER, MONTH, DAY, DAY_OF_YEAR, MONDAY_DAY_OF_WEEK, + SUNDAY_DAY_OF_WEEK, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, US_WEEK ] + description: The part of the value to extract. + - name: indexing + options: [ ONE, ZERO ] + description: Start counting from 1 or 0. + - name: x + value: timestamp + return: i64 + - args: + - name: component + options: [ QUARTER, MONTH, DAY, DAY_OF_YEAR, MONDAY_DAY_OF_WEEK, + SUNDAY_DAY_OF_WEEK, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, US_WEEK ] + description: The part of the value to extract. + - name: indexing + options: [ ONE, ZERO ] + description: Start counting from 1 or 0. + - name: x + value: date + return: i64 + - + name: "extract_boolean" + description: >- + Extract boolean values of a date/time value. + * IS_LEAP_YEAR Return true if year of the given value is a leap year and false otherwise. + * IS_DST Return true if DST (Daylight Savings Time) is observed at the given value + in the given timezone. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. + impls: + - args: + - name: component + options: [ IS_LEAP_YEAR ] + description: The part of the value to extract. 
+ - name: x + value: timestamp + return: boolean + - args: + - name: component + options: [ IS_LEAP_YEAR, IS_DST ] + description: The part of the value to extract. + - name: x + value: timestamp_tz + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: boolean + - args: + - name: component + options: [ IS_LEAP_YEAR ] + description: The part of the value to extract. + - name: x + value: date + return: boolean + - + name: "add" + description: >- + Add an interval to a date/time type. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. + impls: + - args: + - name: x + value: timestamp + - name: y + value: interval_year + return: timestamp + - args: + - name: x + value: timestamp_tz + - name: y + value: interval_year + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: timestamp_tz + - args: + - name: x + value: date + - name: y + value: interval_year + return: timestamp + - args: + - name: x + value: timestamp + - name: y + value: interval_day + return: timestamp + - args: + - name: x + value: timestamp_tz + - name: y + value: interval_day + return: timestamp_tz + - args: + - name: x + value: date + - name: y + value: interval_day + return: timestamp + - + name: "add_intervals" + description: Add two intervals together. + impls: + - args: + - name: x + value: interval_day + - name: y + value: interval_day + return: interval_day + - args: + - name: x + value: interval_year + - name: y + value: interval_year + return: interval_year + - + name: "subtract" + description: >- + Subtract an interval from a date/time type. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. 
+ impls: + - args: + - name: x + value: timestamp + - name: y + value: interval_year + return: timestamp + - args: + - name: x + value: timestamp_tz + - name: y + value: interval_year + return: timestamp_tz + - args: + - name: x + value: timestamp_tz + - name: y + value: interval_year + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: timestamp_tz + - args: + - name: x + value: date + - name: y + value: interval_year + return: date + - args: + - name: x + value: timestamp + - name: y + value: interval_day + return: timestamp + - args: + - name: x + value: timestamp_tz + - name: y + value: interval_day + return: timestamp_tz + - args: + - name: x + value: date + - name: y + value: interval_day + return: date + - + name: "lte" + description: less than or equal to + impls: + - args: + - name: x + value: timestamp + - name: y + value: timestamp + return: boolean + - args: + - name: x + value: timestamp_tz + - name: y + value: timestamp_tz + return: boolean + - args: + - name: x + value: date + - name: y + value: date + return: boolean + - args: + - name: x + value: interval_day + - name: y + value: interval_day + return: boolean + - args: + - name: x + value: interval_year + - name: y + value: interval_year + return: boolean + - + name: "lt" + description: less than + impls: + - args: + - name: x + value: timestamp + - name: y + value: timestamp + return: boolean + - args: + - name: x + value: timestamp_tz + - name: y + value: timestamp_tz + return: boolean + - args: + - name: x + value: date + - name: y + value: date + return: boolean + - args: + - name: x + value: interval_day + - name: y + value: interval_day + return: boolean + - args: + - name: x + value: interval_year + - name: y + value: interval_year + return: boolean + - + name: "gte" + description: greater than or equal to + impls: + - args: + - name: x + value: timestamp + - name: y + value: timestamp + return: boolean + - args: + - name: x + value: timestamp_tz + - 
name: y + value: timestamp_tz + return: boolean + - args: + - name: x + value: date + - name: y + value: date + return: boolean + - args: + - name: x + value: interval_day + - name: y + value: interval_day + return: boolean + - args: + - name: x + value: interval_year + - name: y + value: interval_year + return: boolean + - + name: "gt" + description: greater than + impls: + - args: + - name: x + value: timestamp + - name: y + value: timestamp + return: boolean + - args: + - name: x + value: timestamp_tz + - name: y + value: timestamp_tz + return: boolean + - args: + - name: x + value: date + - name: y + value: date + return: boolean + - args: + - name: x + value: interval_day + - name: y + value: interval_day + return: boolean + - args: + - name: x + value: interval_year + - name: y + value: interval_year + return: boolean + - + name: "assume_timezone" + description: >- + Convert local timestamp to UTC-relative timestamp_tz using given local time's timezone. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. + impls: + - args: + - name: x + value: timestamp + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: timestamp_tz + - args: + - name: x + value: date + - name: timezone + description: Timezone string from IANA tzdb. Returned timestamp_tz will have time set to 00:00:00. + value: string + return: timestamp_tz + - + name: "local_timestamp" + description: >- + Convert UTC-relative timestamp_tz to local timestamp using given local time's timezone. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. + impls: + - args: + - name: x + value: timestamp_tz + - name: timezone + description: Timezone string from IANA tzdb. 
+ value: string + return: timestamp + - + name: "strptime_time" + description: >- + Parse string into time using provided format, + see https://man7.org/linux/man-pages/man3/strptime.3.html for reference. + impls: + - args: + - name: time_string + value: string + - name: format + value: string + return: time + - + name: "strptime_date" + description: >- + Parse string into date using provided format, + see https://man7.org/linux/man-pages/man3/strptime.3.html for reference. + impls: + - args: + - name: date_string + value: string + - name: format + value: string + return: date + - + name: "strptime_timestamp" + description: >- + Parse string into timestamp using provided format, + see https://man7.org/linux/man-pages/man3/strptime.3.html for reference. + If timezone is present in timestamp and provided as parameter an error is thrown. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is supplied as parameter and present in the parsed string the parsed timezone is used. + If parameter supplied timezone is invalid an error is thrown. + impls: + - args: + - name: timestamp_string + value: string + - name: format + value: string + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: timestamp_tz + - args: + - name: timestamp_string + value: string + - name: format + value: string + return: timestamp_tz + - + name: "strftime" + description: >- + Convert timestamp/date/time to string using provided format, + see https://man7.org/linux/man-pages/man3/strftime.3.html for reference. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. 
+ impls: + - args: + - name: x + value: timestamp + - name: format + value: string + return: string + - args: + - name: x + value: timestamp_tz + - name: format + value: string + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: string + - args: + - name: x + value: date + - name: format + value: string + return: string + - args: + - name: x + value: time + - name: format + value: string + return: string + - + name: "round_temporal" + description: >- + Round a given timestamp/date/time to a multiple of a time unit. If the given timestamp is not already an + exact multiple from the origin in the given timezone, the resulting point is chosen as one of the + two nearest multiples. Which of these is chosen is governed by rounding: FLOOR means to use the earlier + one, CEIL means to use the later one, ROUND_TIE_DOWN means to choose the nearest and tie to the + earlier one if equidistant, ROUND_TIE_UP means to choose the nearest and tie to the later one if + equidistant. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. + impls: + - args: + - name: x + value: timestamp + - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: multiple + value: i64 + - name: origin + value: timestamp + return: timestamp + - args: + - name: x + value: timestamp_tz + - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: multiple + value: i64 + - name: timezone + description: Timezone string from IANA tzdb. 
+ value: string + - name: origin + value: timestamp_tz + return: timestamp_tz + - args: + - name: x + value: date + - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY ] + - name: multiple + value: i64 + - name: origin + value: date + return: date + - args: + - name: x + value: time + - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: multiple + value: i64 + - name: origin + value: time + return: time + - + name: "round_calendar" + description: >- + Round a given timestamp/date/time to a multiple of a time unit. If the given timestamp is not already an + exact multiple from the last origin unit in the given timezone, the resulting point is chosen as one of the + two nearest multiples. Which of these is chosen is governed by rounding: FLOOR means to use the earlier + one, CEIL means to use the later one, ROUND_TIE_DOWN means to choose the nearest and tie to the + earlier one if equidistant, ROUND_TIE_UP means to choose the nearest and tie to the later one if + equidistant. + + Timezone strings must be as defined by IANA timezone database (https://www.iana.org/time-zones). + Examples: "Pacific/Marquesas", "Etc/GMT+1". + If timezone is invalid an error is thrown. 
+ + impls: + - args: + - name: x + value: timestamp + - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: origin + options: [ YEAR, MONTH, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, + US_WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND ] + - name: multiple + value: i64 + return: timestamp + - args: + - name: x + value: timestamp_tz + - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: origin + options: [ YEAR, MONTH, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, + US_WEEK, DAY, HOUR, MINUTE, SECOND, MILLISECOND ] + - name: multiple + value: i64 + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: timestamp_tz + - args: + - name: x + value: date + - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ YEAR, MONTH, WEEK, DAY ] + - name: origin + options: [ YEAR, MONTH, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, US_WEEK, DAY ] + - name: multiple + value: i64 + - name: origin + value: date + return: date + - args: + - name: x + value: time + - name: rounding + options: [ FLOOR, CEIL, ROUND_TIE_DOWN, ROUND_TIE_UP ] + - name: unit + options: [ DAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND ] + - name: origin + options: [ DAY, HOUR, MINUTE, SECOND, MILLISECOND ] + - name: multiple + value: i64 + - name: origin + value: time + return: time diff --git a/src/substrait/extensions/functions_logarithmic.yaml b/src/substrait/extensions/functions_logarithmic.yaml new file mode 100644 index 0000000..f4b8acc --- /dev/null +++ b/src/substrait/extensions/functions_logarithmic.yaml @@ -0,0 +1,147 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: "ln" + description: "Natural logarithm of the value" + impls: + - args: + - name: x + value: fp32 + options: + 
rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp64 + - + name: "log10" + description: "Logarithm to base 10 of the value" + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp64 + - + name: "log2" + description: "Logarithm to base 2 of the value" + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp64 + - + name: "logb" + description: > + Logarithm of the value with the given base + + logb(x, b) => log_{b} (x) + impls: + - args: + - value: fp32 + name: "x" + description: "The number `x` to compute the logarithm of" + - value: fp32 + name: "base" + description: "The logarithm base `b` to use" + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, 
ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp32 + - args: + - value: fp64 + name: "x" + description: "The number `x` to compute the logarithm of" + - value: fp64 + name: "base" + description: "The logarithm base `b` to use" + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp64 + - + name: "log1p" + description: > + Natural logarithm (base e) of 1 + x + + log1p(x) => log(1+x) + impls: + - args: + - name: x + value: fp32 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp32 + - args: + - name: x + value: fp64 + options: + rounding: + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR ] + on_domain_error: + values: [ NAN, ERROR ] + on_log_zero: + values: [NAN, ERROR, MINUS_INFINITY] + return: fp64 diff --git a/src/substrait/extensions/functions_rounding.yaml b/src/substrait/extensions/functions_rounding.yaml new file mode 100644 index 0000000..09309f2 --- /dev/null +++ b/src/substrait/extensions/functions_rounding.yaml @@ -0,0 +1,270 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: "ceil" + description: > + Rounding to the ceiling of the value `x`. + impls: + - args: + - value: fp32 + name: "x" + return: fp32 + - args: + - value: fp64 + name: "x" + return: fp64 + - + name: "floor" + description: > + Rounding to the floor of the value `x`. + impls: + - args: + - value: fp32 + name: "x" + return: fp32 + - args: + - value: fp64 + name: "x" + return: fp64 + - + name: "round" + description: > + Rounding the value `x` to `s` decimal places. + impls: + - args: + - value: i8 + name: "x" + description: > + Numerical expression to be rounded. + - value: i32 + name: "s" + description: > + Number of decimal places to be rounded to. 
+ + When `s` is a positive number, nothing will happen + since `x` is an integer value. + + When `s` is a negative number, the rounding is + performed to the nearest multiple of `10^(-s)`. + options: + rounding: + description: > + When a boundary is computed to lie somewhere between two values, + and this value cannot be exactly represented, this specifies how + to round it. + + - TIE_TO_EVEN: round to nearest value; if exactly halfway, tie + to the even option. + - TIE_AWAY_FROM_ZERO: round to nearest value; if exactly + halfway, tie away from zero. + - TRUNCATE: always round toward zero. + - CEILING: always round toward positive infinity. + - FLOOR: always round toward negative infinity. + - AWAY_FROM_ZERO: round negative values with FLOOR rule, round positive values with CEILING rule + - TIE_DOWN: round ties with FLOOR rule + - TIE_UP: round ties with CEILING rule + - TIE_TOWARDS_ZERO: round ties with TRUNCATE rule + - TIE_TO_ODD: round to nearest value; if exactly halfway, tie + to the odd option. + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR, + AWAY_FROM_ZERO, TIE_DOWN, TIE_UP, TIE_TOWARDS_ZERO, TIE_TO_ODD ] + nullability: DECLARED_OUTPUT + return: i8? + - args: + - value: i16 + name: "x" + description: > + Numerical expression to be rounded. + - value: i32 + name: "s" + description: > + Number of decimal places to be rounded to. + + When `s` is a positive number, nothing will happen + since `x` is an integer value. + + When `s` is a negative number, the rounding is + performed to the nearest multiple of `10^(-s)`. + options: + rounding: + description: > + When a boundary is computed to lie somewhere between two values, + and this value cannot be exactly represented, this specifies how + to round it. + + - TIE_TO_EVEN: round to nearest value; if exactly halfway, tie + to the even option. + - TIE_AWAY_FROM_ZERO: round to nearest value; if exactly + halfway, tie away from zero. + - TRUNCATE: always round toward zero. 
+ - CEILING: always round toward positive infinity. + - FLOOR: always round toward negative infinity. + - AWAY_FROM_ZERO: round negative values with FLOOR rule, round positive values with CEILING rule + - TIE_DOWN: round ties with FLOOR rule + - TIE_UP: round ties with CEILING rule + - TIE_TOWARDS_ZERO: round ties with TRUNCATE rule + - TIE_TO_ODD: round to nearest value; if exactly halfway, tie + to the odd option. + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR, + AWAY_FROM_ZERO, TIE_DOWN, TIE_UP, TIE_TOWARDS_ZERO, TIE_TO_ODD ] + nullability: DECLARED_OUTPUT + return: i16? + - args: + - value: i32 + name: "x" + description: > + Numerical expression to be rounded. + - value: i32 + name: "s" + description: > + Number of decimal places to be rounded to. + + When `s` is a positive number, nothing will happen + since `x` is an integer value. + + When `s` is a negative number, the rounding is + performed to the nearest multiple of `10^(-s)`. + options: + rounding: + description: > + When a boundary is computed to lie somewhere between two values, + and this value cannot be exactly represented, this specifies how + to round it. + + - TIE_TO_EVEN: round to nearest value; if exactly halfway, tie + to the even option. + - TIE_AWAY_FROM_ZERO: round to nearest value; if exactly + halfway, tie away from zero. + - TRUNCATE: always round toward zero. + - CEILING: always round toward positive infinity. + - FLOOR: always round toward negative infinity. + - AWAY_FROM_ZERO: round negative values with FLOOR rule, round positive values with CEILING rule + - TIE_DOWN: round ties with FLOOR rule + - TIE_UP: round ties with CEILING rule + - TIE_TOWARDS_ZERO: round ties with TRUNCATE rule + - TIE_TO_ODD: round to nearest value; if exactly halfway, tie + to the odd option. + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR, + AWAY_FROM_ZERO, TIE_DOWN, TIE_UP, TIE_TOWARDS_ZERO, TIE_TO_ODD ] + nullability: DECLARED_OUTPUT + return: i32? 
+ - args: + - value: i64 + name: "x" + description: > + Numerical expression to be rounded. + - value: i32 + name: "s" + description: > + Number of decimal places to be rounded to. + + When `s` is a positive number, nothing will happen + since `x` is an integer value. + + When `s` is a negative number, the rounding is + performed to the nearest multiple of `10^(-s)`. + options: + rounding: + description: > + When a boundary is computed to lie somewhere between two values, + and this value cannot be exactly represented, this specifies how + to round it. + + - TIE_TO_EVEN: round to nearest value; if exactly halfway, tie + to the even option. + - TIE_AWAY_FROM_ZERO: round to nearest value; if exactly + halfway, tie away from zero. + - TRUNCATE: always round toward zero. + - CEILING: always round toward positive infinity. + - FLOOR: always round toward negative infinity. + - AWAY_FROM_ZERO: round negative values with FLOOR rule, round positive values with CEILING rule + - TIE_DOWN: round ties with FLOOR rule + - TIE_UP: round ties with CEILING rule + - TIE_TOWARDS_ZERO: round ties with TRUNCATE rule + - TIE_TO_ODD: round to nearest value; if exactly halfway, tie + to the odd option. + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR, + AWAY_FROM_ZERO, TIE_DOWN, TIE_UP, TIE_TOWARDS_ZERO, TIE_TO_ODD ] + nullability: DECLARED_OUTPUT + return: i64? + - args: + - value: fp32 + name: "x" + description: > + Numerical expression to be rounded. + - value: i32 + name: "s" + description: > + Number of decimal places to be rounded to. + + When `s` is a positive number, the rounding + is performed to a `s` number of decimal places. + + When `s` is a negative number, the rounding is + performed to the left side of the decimal point + as specified by `s`. + options: + rounding: + description: > + When a boundary is computed to lie somewhere between two values, + and this value cannot be exactly represented, this specifies how + to round it. 
+ + - TIE_TO_EVEN: round to nearest value; if exactly halfway, tie + to the even option. + - TIE_AWAY_FROM_ZERO: round to nearest value; if exactly + halfway, tie away from zero. + - TRUNCATE: always round toward zero. + - CEILING: always round toward positive infinity. + - FLOOR: always round toward negative infinity. + - AWAY_FROM_ZERO: round negative values with FLOOR rule, round positive values with CEILING rule + - TIE_DOWN: round ties with FLOOR rule + - TIE_UP: round ties with CEILING rule + - TIE_TOWARDS_ZERO: round ties with TRUNCATE rule + - TIE_TO_ODD: round to nearest value; if exactly halfway, tie + to the odd option. + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR, + AWAY_FROM_ZERO, TIE_DOWN, TIE_UP, TIE_TOWARDS_ZERO, TIE_TO_ODD ] + nullability: DECLARED_OUTPUT + return: fp32? + - args: + - value: fp64 + name: "x" + description: > + Numerical expression to be rounded. + - value: i32 + name: "s" + description: > + Number of decimal places to be rounded to. + + When `s` is a positive number, the rounding + is performed to a `s` number of decimal places. + + When `s` is a negative number, the rounding is + performed to the left side of the decimal point + as specified by `s`. + options: + rounding: + description: > + When a boundary is computed to lie somewhere between two values, + and this value cannot be exactly represented, this specifies how + to round it. + + - TIE_TO_EVEN: round to nearest value; if exactly halfway, tie + to the even option. + - TIE_AWAY_FROM_ZERO: round to nearest value; if exactly + halfway, tie away from zero. + - TRUNCATE: always round toward zero. + - CEILING: always round toward positive infinity. + - FLOOR: always round toward negative infinity. 
+ - AWAY_FROM_ZERO: round negative values with FLOOR rule, round positive values with CEILING rule + - TIE_DOWN: round ties with FLOOR rule + - TIE_UP: round ties with CEILING rule + - TIE_TOWARDS_ZERO: round ties with TRUNCATE rule + - TIE_TO_ODD: round to nearest value; if exactly halfway, tie + to the odd option. + values: [ TIE_TO_EVEN, TIE_AWAY_FROM_ZERO, TRUNCATE, CEILING, FLOOR, + AWAY_FROM_ZERO, TIE_DOWN, TIE_UP, TIE_TOWARDS_ZERO, TIE_TO_ODD ] + nullability: DECLARED_OUTPUT + return: fp64? diff --git a/src/substrait/extensions/functions_set.yaml b/src/substrait/extensions/functions_set.yaml new file mode 100644 index 0000000..ce02bf3 --- /dev/null +++ b/src/substrait/extensions/functions_set.yaml @@ -0,0 +1,27 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: "index_in" + description: > + Checks the membership of a value in a list of values + + Returns the first 0-based index value of some input `T` if `T` is equal to + any element in `List`. Returns `NULL` if not found. + + If `T` is `NULL`, returns `NULL`. + + If `T` is `NaN`: + - Returns 0-based index of `NaN` in `List` (default) + - Returns `NULL` (if `NAN_IS_NOT_NAN` is specified) + impls: + - args: + - name: x + value: T + - name: y + value: List + options: + nan_equality: + values: [ NAN_IS_NAN, NAN_IS_NOT_NAN ] + nullability: DECLARED_OUTPUT + return: int64? diff --git a/src/substrait/extensions/functions_string.yaml b/src/substrait/extensions/functions_string.yaml new file mode 100644 index 0000000..11f2d18 --- /dev/null +++ b/src/substrait/extensions/functions_string.yaml @@ -0,0 +1,1397 @@ +%YAML 1.2 +--- +scalar_functions: + - + name: concat + description: Concatenate strings. + impls: + - args: + - value: "varchar" + name: "input" + variadic: + min: 1 + return: "varchar" + - args: + - value: "string" + name: "input" + variadic: + min: 1 + return: "string" + - + name: like + description: >- + Are two strings like each other. + + The `case_sensitivity` option applies to the `match` argument. 
+ impls: + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "varchar" + name: "match" + description: The string to match against the input string. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "match" + description: The string to match against the input string. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - + name: substring + description: >- + Extract a substring of a specified `length` starting from position `start`. + A `start` value of 1 refers to the first character of the string. + impls: + - args: + - value: "varchar" + name: "input" + - value: i32 + name: "start" + - value: i32 + name: "length" + return: "varchar" + - args: + - value: "string" + name: "input" + - value: i32 + name: "start" + - value: i32 + name: "length" + return: "string" + - args: + - value: "fixedchar" + name: "input" + - value: i32 + name: "start" + - value: i32 + name: "length" + return: "string" + - + name: regexp_match_substring + description: >- + Extract a substring that matches the given regular expression pattern. The regular expression + pattern should follow the International Components for Unicode implementation + (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The occurrence of the + pattern to be extracted is specified using the `occurrence` argument. Specifying `1` means + the first occurrence will be extracted, `2` means the second occurrence, and so on. + The `occurrence` argument should be a positive non-zero integer. The number of characters + from the beginning of the string to begin starting to search for pattern matches can be + specified using the `position` argument.
Specifying `1` means to search for matches + starting at the first character of the input string, `2` means the second character, and so + on. The `position` argument should be a positive non-zero integer. The regular + expression capture group can be specified using the `group` argument. Specifying `0` + will return the substring matching the full regular expression. Specifying `1` will + return the substring matching only the first capture group, and so on. The `group` + argument should be a non-negative integer. + + The `case_sensitivity` option specifies case-sensitive or case-insensitive matching. + Enabling the `multiline` option will treat the input string as multiple lines. This makes + the `^` and `$` characters match at the beginning and end of any line, instead of just the + beginning and end of the input string. Enabling the `dotall` option makes the `.` character + match line terminator characters in a string. + + Behavior is undefined if the regex fails to compile, the occurrence value is out of range, + the position value is out of range, or the group value is out of range. 
+ impls: + - args: + - value: "varchar" + name: "input" + - value: "varchar" + name: "pattern" + - value: i64 + name: "position" + - value: i64 + name: "occurrence" + - value: i64 + name: "group" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: "varchar" + - args: + - value: "string" + name: "input" + - value: "string" + name: "pattern" + - value: i64 + name: "position" + - value: i64 + name: "occurrence" + - value: i64 + name: "group" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: "string" + - + name: regexp_match_substring_all + description: >- + Extract all substrings that match the given regular expression pattern. This will return a + list of extracted strings with one value for each occurrence of a match. The regular expression + pattern should follow the International Components for Unicode implementation + (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The number of characters + from the beginning of the string to begin starting to search for pattern matches can be + specified using the `position` argument. Specifying `1` means to search for matches + starting at the first character of the input string, `2` means the second character, and so + on. The `position` argument should be a positive non-zero integer. The regular + expression capture group can be specified using the `group` argument. Specifying `0` + will return substrings matching the full regular expression. Specifying `1` will return + substrings matching only the first capture group, and so on. The `group` argument should + be a non-negative integer. 
+ + The `case_sensitivity` option specifies case-sensitive or case-insensitive matching. + Enabling the `multiline` option will treat the input string as multiple lines. This makes + the `^` and `$` characters match at the beginning and end of any line, instead of just the + beginning and end of the input string. Enabling the `dotall` option makes the `.` character + match line terminator characters in a string. + + Behavior is undefined if the regex fails to compile, the position value is out of range, + or the group value is out of range. + impls: + - args: + - value: "varchar" + name: "input" + - value: "varchar" + name: "pattern" + - value: i64 + name: "position" + - value: i64 + name: "group" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: "List<varchar>" + - args: + - value: "string" + name: "input" + - value: "string" + name: "pattern" + - value: i64 + name: "position" + - value: i64 + name: "group" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: "List<string>" + - + name: starts_with + description: >- + Whether the `input` string starts with the `substring`. + + The `case_sensitivity` option applies to the `substring` argument. + impls: + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for.
+ options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. 
+ - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - + name: ends_with + description: >- + Whether the `input` string ends with the `substring`. + + The `case_sensitivity` option applies to the `substring` argument. + impls: + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for.
+ options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - + name: contains + description: >- + Whether the `input` string contains the `substring`. + + The `case_sensitivity` option applies to the `substring` argument. + impls: + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. 
+ options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "BOOLEAN" + - + name: strpos + description: >- + Return the position of the first occurrence of a string in another string. 
The first + character of the string is at position 1. If no occurrence is found, 0 is returned. + + The `case_sensitivity` option applies to the `substring` argument. + impls: + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: i64 + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: i64 + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to search for. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: i64 + - + name: regexp_strpos + description: >- + Return the position of an occurrence of the given regular expression pattern in a + string. The first character of the string is at position 1. The regular expression pattern + should follow the International Components for Unicode implementation + (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The number of characters + from the beginning of the string to begin starting to search for pattern matches can be + specified using the `position` argument. Specifying `1` means to search for matches + starting at the first character of the input string, `2` means the second character, and so + on. The `position` argument should be a positive non-zero integer. Which occurrence to + return the position of is specified using the `occurrence` argument. Specifying `1` means + the position of the first occurrence will be returned, `2` means the position of the second + occurrence, and so on.
The `occurrence` argument should be a positive non-zero integer. If + no occurrence is found, 0 is returned. + + The `case_sensitivity` option specifies case-sensitive or case-insensitive matching. + Enabling the `multiline` option will treat the input string as multiple lines. This makes + the `^` and `$` characters match at the beginning and end of any line, instead of just the + beginning and end of the input string. Enabling the `dotall` option makes the `.` character + match line terminator characters in a string. + + Behavior is undefined if the regex fails to compile, the occurrence value is out of range, or + the position value is out of range. + impls: + - args: + - value: "varchar" + name: "input" + - value: "varchar" + name: "pattern" + - value: i64 + name: "position" + - value: i64 + name: "occurrence" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: i64 + - args: + - value: "string" + name: "input" + - value: "string" + name: "pattern" + - value: i64 + name: "position" + - value: i64 + name: "occurrence" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: i64 + - + name: count_substring + description: >- + Return the number of non-overlapping occurrences of a substring in an input string. + + The `case_sensitivity` option applies to the `substring` argument. + impls: + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "substring" + description: The substring to count. 
+ options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: i64 + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "varchar" + name: "substring" + description: The substring to count. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: i64 + - args: + - value: "fixedchar" + name: "input" + description: The input string. + - value: "fixedchar" + name: "substring" + description: The substring to count. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: i64 + - + name: regexp_count_substring + description: >- + Return the number of non-overlapping occurrences of a regular expression pattern in an input + string. The regular expression pattern should follow the International Components for + Unicode implementation (https://unicode-org.github.io/icu/userguide/strings/regexp.html). + The number of characters from the beginning of the string to begin starting to search for + pattern matches can be specified using the `position` argument. Specifying `1` means to + search for matches starting at the first character of the input string, `2` means the + second character, and so on. The `position` argument should be a positive non-zero integer. + + The `case_sensitivity` option specifies case-sensitive or case-insensitive matching. + Enabling the `multiline` option will treat the input string as multiple lines. This makes + the `^` and `$` characters match at the beginning and end of any line, instead of just the + beginning and end of the input string. Enabling the `dotall` option makes the `.` character + match line terminator characters in a string. + + Behavior is undefined if the regex fails to compile or the position value is out of range. 
+ impls: + - args: + - value: "string" + name: "input" + - value: "string" + name: "pattern" + - value: i64 + name: "position" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: i64 + - args: + - value: "varchar" + name: "input" + - value: "varchar" + name: "pattern" + - value: i64 + name: "position" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: i64 + - args: + - value: "fixedchar" + name: "input" + - value: "fixedchar" + name: "pattern" + - value: i64 + name: "position" + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: i64 + - + name: replace + description: >- + Replace all occurrences of the substring with the replacement string. + + The `case_sensitivity` option applies to the `substring` argument. + impls: + - args: + - value: "string" + name: "input" + description: Input string. + - value: "string" + name: "substring" + description: The substring to replace. + - value: "string" + name: "replacement" + description: The replacement string. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "string" + - args: + - value: "varchar" + name: "input" + description: Input string. + - value: "varchar" + name: "substring" + description: The substring to replace. + - value: "varchar" + name: "replacement" + description: The replacement string. 
+ options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + return: "varchar" + - + name: concat_ws + description: Concatenate strings together separated by a separator. + impls: + - args: + - value: "string" + name: "separator" + description: Character to separate strings by. + - value: "string" + name: "string_arguments" + description: Strings to be concatenated. + variadic: + min: 1 + return: "string" + - args: + - value: "varchar" + name: "separator" + description: Character to separate strings by. + - value: "varchar" + name: "string_arguments" + description: Strings to be concatenated. + variadic: + min: 1 + return: "varchar" + - + name: repeat + description: Repeat a string `count` number of times. + impls: + - args: + - value: "string" + name: "input" + - value: i64 + name: "count" + return: "string" + - args: + - value: "varchar" + - value: i64 + name: "input" + - value: i64 + name: "count" + return: "varchar" + - + name: reverse + description: Returns the string in reverse order. + impls: + - args: + - value: "string" + name: "input" + return: "string" + - args: + - value: "varchar" + name: "input" + return: "varchar" + - args: + - value: "fixedchar" + name: "input" + return: "fixedchar" + - + name: replace_slice + description: >- + Replace a slice of the input string. A specified 'length' of characters will be deleted from + the input string beginning at the 'start' position and will be replaced by a new string. A + start value of 1 indicates the first character of the input string. If start is negative + or zero, or greater than the length of the input string, a null string is returned. If 'length' + is negative, a null string is returned. If 'length' is zero, inserting of the new string + occurs at the specified 'start' position and no characters are deleted. If 'length' is + greater than the input string, deletion will occur up to the last character of the input string. 
+ impls: + - args: + - value: "string" + name: "input" + description: Input string. + - value: i64 + name: "start" + description: The position in the string to start deleting/inserting characters. + - value: i64 + name: "length" + description: The number of characters to delete from the input string. + - value: "string" + name: "replacement" + description: The new string to insert at the start position. + return: "string" + - args: + - value: "varchar" + name: "input" + description: Input string. + - value: i64 + name: "start" + description: The position in the string to start deleting/inserting characters. + - value: i64 + name: "length" + description: The number of characters to delete from the input string. + - value: "varchar" + name: "replacement" + description: The new string to insert at the start position. + return: "varchar" + - + name: lower + description: >- + Transform the string to lower case characters. Implementation should follow the utf8_unicode_ci + collations according to the Unicode Collation Algorithm described at http://www.unicode.org/reports/tr10/. + impls: + - args: + - value: "string" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "string" + - args: + - value: "varchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "varchar" + - args: + - value: "fixedchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "fixedchar" + - + name: upper + description: >- + Transform the string to upper case characters. Implementation should follow the utf8_unicode_ci + collations according to the Unicode Collation Algorithm described at http://www.unicode.org/reports/tr10/. 
+ impls: + - args: + - value: "string" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "string" + - args: + - value: "varchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "varchar" + - args: + - value: "fixedchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "fixedchar" + - + name: swapcase + description: >- + Transform the string's lowercase characters to uppercase and uppercase characters to + lowercase. Implementation should follow the utf8_unicode_ci collations according to the + Unicode Collation Algorithm described at http://www.unicode.org/reports/tr10/. + impls: + - args: + - value: "string" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "string" + - args: + - value: "varchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "varchar" + - args: + - value: "fixedchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "fixedchar" + - + name: capitalize + description: >- + Capitalize the first character of the input string. Implementation should follow the + utf8_unicode_ci collations according to the Unicode Collation Algorithm described at + http://www.unicode.org/reports/tr10/. + impls: + - args: + - value: "string" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "string" + - args: + - value: "varchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "varchar" + - args: + - value: "fixedchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "fixedchar" + - + name: title + description: >- + Converts the input string into titlecase. Capitalize the first character of each word in the + input string except for articles (a, an, the). 
Implementation should follow the + utf8_unicode_ci collations according to the Unicode Collation Algorithm described at + http://www.unicode.org/reports/tr10/. + impls: + - args: + - value: "string" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "string" + - args: + - value: "varchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "varchar" + - args: + - value: "fixedchar" + name: "input" + options: + char_set: + values: [ UTF8, ASCII_ONLY ] + return: "fixedchar" + - + name: char_length + description: >- + Return the number of characters in the input string. The length includes trailing spaces. + impls: + - args: + - value: "string" + name: "input" + return: i64 + - args: + - value: "varchar" + name: "input" + return: i64 + - args: + - value: "fixedchar" + name: "input" + return: i64 + - + name: bit_length + description: Return the number of bits in the input string. + impls: + - args: + - value: "string" + name: "input" + return: i64 + - args: + - value: "varchar" + name: "input" + return: i64 + - args: + - value: "fixedchar" + name: "input" + return: i64 + - + name: octet_length + description: Return the number of bytes in the input string. + impls: + - args: + - value: "string" + name: "input" + return: i64 + - args: + - value: "varchar" + name: "input" + return: i64 + - args: + - value: "fixedchar" + name: "input" + return: i64 + - + name: regexp_replace + description: >- + Search a string for a substring that matches a given regular expression pattern and replace + it with a replacement string. The regular expression pattern should follow the + International Components for Unicode implementation + (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The occurrence of the pattern to be replaced is + specified using the `occurrence` argument. Specifying `1` means only the first occurrence + will be replaced, `2` means the second occurrence, and so on.
Specifying `0` means all + occurrences will be replaced. The number of characters from the beginning of the string to + begin starting to search for pattern matches can be specified using the `position` argument. + Specifying `1` means to search for matches starting at the first character of the input + string, `2` means the second character, and so on. The `position` argument should be a + positive non-zero integer. The replacement string can capture groups using numbered + backreferences. + + The `case_sensitivity` option specifies case-sensitive or case-insensitive matching. + Enabling the `multiline` option will treat the input string as multiple lines. This makes + the `^` and `$` characters match at the beginning and end of any line, instead of just the + beginning and end of the input string. Enabling the `dotall` option makes the `.` character + match line terminator characters in a string. + + Behavior is undefined if the regex fails to compile, the replacement contains an illegal + back-reference, the occurrence value is out of range, or the position value is out of range. + impls: + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "pattern" + description: The regular expression to search for within the input string. + - value: "string" + name: "replacement" + description: The replacement string. + - value: i64 + name: "position" + description: The position to start the search. + - value: i64 + name: "occurrence" + description: Which occurrence of the match to replace. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: "string" + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "varchar" + name: "pattern" + description: The regular expression to search for within the input string. 
+ - value: "varchar" + name: "replacement" + description: The replacement string. + - value: i64 + name: "position" + description: The position to start the search. + - value: i64 + name: "occurrence" + description: Which occurrence of the match to replace. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: "varchar" + - + name: ltrim + description: >- + Remove any occurrence of the characters from the left side of the string. + If no characters are specified, spaces are removed. + impls: + - args: + - value: "varchar" + name: "input" + description: "The string to remove characters from." + - value: "varchar" + name: "characters" + description: "The set of characters to remove." + return: "varchar" + - args: + - value: "string" + name: "input" + description: "The string to remove characters from." + - value: "string" + name: "characters" + description: "The set of characters to remove." + return: "string" + - + name: rtrim + description: >- + Remove any occurrence of the characters from the right side of the string. + If no characters are specified, spaces are removed. + impls: + - args: + - value: "varchar" + name: "input" + description: "The string to remove characters from." + - value: "varchar" + name: "characters" + description: "The set of characters to remove." + return: "varchar" + - args: + - value: "string" + name: "input" + description: "The string to remove characters from." + - value: "string" + name: "characters" + description: "The set of characters to remove." + return: "string" + - + name: trim + description: >- + Remove any occurrence of the characters from the left and right sides of + the string. If no characters are specified, spaces are removed. + impls: + - args: + - value: "varchar" + name: "input" + description: "The string to remove characters from." 
+ - value: "varchar" + name: "characters" + description: "The set of characters to remove." + return: "varchar" + - args: + - value: "string" + name: "input" + description: "The string to remove characters from." + - value: "string" + name: "characters" + description: "The set of characters to remove." + return: "string" + - + name: lpad + description: >- + Left-pad the input string with the string of 'characters' until the specified length of the + string has been reached. If the input string is longer than 'length', remove characters from + the right-side to shorten it to 'length' characters. If the string of 'characters' is longer + than the remaining 'length' needed to be filled, only pad until 'length' has been reached. + If 'characters' is not specified, the default value is a single space. + impls: + - args: + - value: "varchar" + name: "input" + description: "The string to pad." + - value: i32 + name: "length" + description: "The length of the output string." + - value: "varchar" + name: "characters" + description: "The string of characters to use for padding." + return: "varchar" + - args: + - value: "string" + name: "input" + description: "The string to pad." + - value: i32 + name: "length" + description: "The length of the output string." + - value: "string" + name: "characters" + description: "The string of characters to use for padding." + return: "string" + - + name: rpad + description: >- + Right-pad the input string with the string of 'characters' until the specified length of the + string has been reached. If the input string is longer than 'length', remove characters from + the left-side to shorten it to 'length' characters. If the string of 'characters' is longer + than the remaining 'length' needed to be filled, only pad until 'length' has been reached. + If 'characters' is not specified, the default value is a single space. + impls: + - args: + - value: "varchar" + name: "input" + description: "The string to pad." 
+ - value: i32 + name: "length" + description: "The length of the output string." + - value: "varchar" + name: "characters" + description: "The string of characters to use for padding." + return: "varchar" + - args: + - value: "string" + name: "input" + description: "The string to pad." + - value: i32 + name: "length" + description: "The length of the output string." + - value: "string" + name: "characters" + description: "The string of characters to use for padding." + return: "string" + - + name: center + description: >- + Center the input string by padding the sides with a single `character` until the specified + `length` of the string has been reached. By default, if the `length` will be reached with + an uneven number of padding, the extra padding will be applied to the right side. + The side with extra padding can be controlled with the `padding` option. + + Behavior is undefined if the number of characters passed to the `character` argument is not 1. + impls: + - args: + - value: "varchar" + name: "input" + description: "The string to pad." + - value: i32 + name: "length" + description: "The length of the output string." + - value: "varchar" + name: "character" + description: "The character to use for padding." + options: + padding: + values: [ RIGHT, LEFT ] + return: "varchar" + - args: + - value: "string" + name: "input" + description: "The string to pad." + - value: i32 + name: "length" + description: "The length of the output string." + - value: "string" + name: "character" + description: "The character to use for padding." + options: + padding: + values: [ RIGHT, LEFT ] + return: "string" + - + name: left + description: Extract `count` characters starting from the left of the string. 
+ impls: + - args: + - value: "varchar" + name: "input" + - value: i32 + name: "count" + return: "varchar" + - args: + - value: "string" + name: "input" + - value: i32 + name: "count" + return: "string" + - + name: right + description: Extract `count` characters starting from the right of the string. + impls: + - args: + - value: "varchar" + name: "input" + - value: i32 + name: "count" + return: "varchar" + - args: + - value: "string" + name: "input" + - value: i32 + name: "count" + return: "string" + - + name: string_split + description: >- + Split a string into a list of strings, based on a specified `separator` character. + impls: + - args: + - value: "varchar" + name: "input" + description: The input string. + - value: "varchar" + name: "separator" + description: A character used for splitting the string. + return: "List>" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "separator" + description: A character used for splitting the string. + return: "List" + - + name: regexp_string_split + description: >- + Split a string into a list of strings, based on a regular expression pattern. The + substrings matched by the pattern will be used as the separators to split the input + string and will not be included in the resulting list. The regular expression + pattern should follow the International Components for Unicode implementation + (https://unicode-org.github.io/icu/userguide/strings/regexp.html). + + The `case_sensitivity` option specifies case-sensitive or case-insensitive matching. + Enabling the `multiline` option will treat the input string as multiple lines. This makes + the `^` and `$` characters match at the beginning and end of any line, instead of just the + beginning and end of the input string. Enabling the `dotall` option makes the `.` character + match line terminator characters in a string. + impls: + - args: + - value: "varchar" + name: "input" + description: The input string. 
+ - value: "varchar" + name: "pattern" + description: The regular expression to search for within the input string. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: "List>" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "pattern" + description: The regular expression to search for within the input string. + options: + case_sensitivity: + values: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + multiline: + values: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + dotall: + values: [ DOTALL_DISABLED, DOTALL_ENABLED ] + return: "List" + +aggregate_functions: + + - + name: string_agg + description: Concatenates a column of string values with a separator. + impls: + - args: + - value: "string" + name: "input" + description: "Column of string values." + - value: "string" + name: "separator" + constant: true + description: "Separator for concatenated strings" + ordered: true + return: "string" diff --git a/src/substrait/extensions/type_variations.yaml b/src/substrait/extensions/type_variations.yaml new file mode 100644 index 0000000..f6f96d5 --- /dev/null +++ b/src/substrait/extensions/type_variations.yaml @@ -0,0 +1,25 @@ +%YAML 1.2 +--- +type_variations: + - parent: string + name: dict4 + description: a four-byte dictionary encoded string + functions: INHERITS + - parent: string + name: bigoffset + description: >- + The arrow large string representation of strings, still restricted to the default string size defined in + Substrait. 
+ functions: SEPARATE + - parent: struct + name: avro + description: an avro encoded struct + functions: SEPARATE + - parent: struct + name: cstruct + description: a cstruct representation of the struct + functions: SEPARATE + - parent: struct + name: dict2 + description: a 2-byte dictionary encoded string. + functions: INHERITS diff --git a/src/substrait/extensions/unknown.yaml b/src/substrait/extensions/unknown.yaml new file mode 100644 index 0000000..3b0e6c1 --- /dev/null +++ b/src/substrait/extensions/unknown.yaml @@ -0,0 +1,66 @@ +%YAML 1.2 +--- +types: + - name: unknown +scalar_functions: + - name: "add" + impls: + - args: + - value: unknown + - value: unknown + return: unknown + - name: "subtract" + impls: + - args: + - value: unknown + - value: unknown + return: unknown + - name: "multiply" + impls: + - args: + - value: unknown + - value: unknown + return: unknown + - name: "divide" + impls: + - args: + - value: unknown + - value: unknown + return: unknown + - name: "modulus" + impls: + - args: + - value: unknown + - value: unknown + return: unknown +aggregate_functions: + - name: "sum" + impls: + - args: + - value: unknown + intermediate: unknown + return: unknown + - name: "avg" + impls: + - args: + - value: unknown + intermediate: unknown + return: unknown + - name: "min" + impls: + - args: + - value: unknown + intermediate: unknown + return: unknown + - name: "max" + impls: + - args: + - value: unknown + intermediate: unknown + return: unknown + - name: "count" + impls: + - args: + - value: unknown + intermediate: unknown + return: unknown