From f44f7685638e4afb97f8d6d536d8538b5292b47d Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Sat, 3 Jun 2023 17:57:26 -0400 Subject: [PATCH] GH-35676: [MATLAB] Add an `InferNulls` name-value pair for controlling null value inference during construction of `arrow.array.Array` (#35827) ### Rationale for this change This change lets users control toggle the automatic null-value detection behavior. By default, values MATLAB considers to be missing (e.g. `NaN` for `double`, `` for `string`, and `NaT` for `datetime`) will be treated as `null` values. Users can toggle this behavior on and off using the `InferNulls` name-value pair. **Example** ```matlab >> matlabArray = [1 NaN 3]' matlabArray = 1 NaN 3 % Treat NaN as a null value >> arrowArray1 = arrow.array.Float64Array(maltabArray, InferNulls=true) arrowArray1 = [ 1, null, 3 ] % Don't treat NaN as a null value >> arrowArray2 = arrow.array.Float64Array(maltabArray, InferNulls=false) arrowArray2 = [ 1, nan, 3 ] ``` We've only added this nv-pair to `arrow.array.Float64Array` for now. We'll add this nv-pair to the other types in a followup changelist. ### What changes are included in this PR? 1. Added `InferNulls` name-value pair to `arrow.array.Float64Array`. 2. Added common validation function `arrow.args.validateTypeAndShape` to remove duplicate validation code among the numeric classes. 3. Added a function called `arrow.args.parseValidElements` that the `arrow.array.Array` classes will be able to share for generating the logical mask of valid elements. ### Are these changes tested? Yes, we added a test pointed called `InferNulls` to the test class`tFloat64Array.m`. ### Are there any user-facing changes? Yes, users can now control how `NaN` values are treated when creating an `arrow.array.Float64Array`. ### Future Directions 1. Add a name-value pair to allow users to specify the valid elements themselves. 2. Extend null support to other numeric types. 3. We've been working on adding error-handling support to `mathworks/libmexclass`. We have a prototype to do this using status-like and result-like objects already pushed to a [branch](https://github.com/mathworks/libmexclass/tree/33). Once this branch is merged with the `main` branch of `mathworks/libmexclass`, we'll port it over. ### Notes Thank you @ kevingurney for all the help with this PR! * Closes: #35676 Lead-authored-by: Sarah Gilmore Co-authored-by: Kevin Gurney Signed-off-by: Sutou Kouhei --- .../matlab/+arrow/+args/parseValidElements.m | 28 +++++++++++++++ .../+arrow/+args/validateTypeAndShape.m | 36 +++++++++++++++++++ .../src/matlab/+arrow/+array/Float32Array.m | 3 +- .../src/matlab/+arrow/+array/Float64Array.m | 14 +++----- matlab/src/matlab/+arrow/+array/Int16Array.m | 4 +-- matlab/src/matlab/+arrow/+array/Int32Array.m | 3 +- matlab/src/matlab/+arrow/+array/Int64Array.m | 3 +- matlab/src/matlab/+arrow/+array/Int8Array.m | 4 +-- matlab/src/matlab/+arrow/+array/UInt16Array.m | 3 +- matlab/src/matlab/+arrow/+array/UInt32Array.m | 3 +- matlab/src/matlab/+arrow/+array/UInt64Array.m | 3 +- matlab/src/matlab/+arrow/+array/UInt8Array.m | 3 +- matlab/test/arrow/array/hNumericArray.m | 10 +++--- matlab/test/arrow/array/tFloat64Array.m | 18 +++++++++- 14 files changed, 102 insertions(+), 33 deletions(-) create mode 100644 matlab/src/matlab/+arrow/+args/parseValidElements.m create mode 100644 matlab/src/matlab/+arrow/+args/validateTypeAndShape.m diff --git a/matlab/src/matlab/+arrow/+args/parseValidElements.m b/matlab/src/matlab/+arrow/+args/parseValidElements.m new file mode 100644 index 0000000000000..90d26b5315344 --- /dev/null +++ b/matlab/src/matlab/+arrow/+args/parseValidElements.m @@ -0,0 +1,28 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function validElements = parseValidElements(data, inferNulls) + % Returns a logical vector of the validElements in data. If inferNulls + % is true, calls ismissing on data to determine which elements are + % null. + + if inferNulls + % TODO: consider making validElements empty if everything is valid. + validElements = ~ismissing(data); + else + % TODO: consider making this an empty array if everything is valid + validElements = true([numel(data) 1]); + end +end diff --git a/matlab/src/matlab/+arrow/+args/validateTypeAndShape.m b/matlab/src/matlab/+arrow/+args/validateTypeAndShape.m new file mode 100644 index 0000000000000..08c3312bc1373 --- /dev/null +++ b/matlab/src/matlab/+arrow/+args/validateTypeAndShape.m @@ -0,0 +1,36 @@ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function validateTypeAndShape(data, type) + % Validates data has the expected type and is a vector or empty 2D + % matrix. If data is numeric, validates is real and nonsparse. + + arguments + data + type(1, 1) string + end + + % If data is empty, only require it's shape to be 2D to support 0x0 + % arrays. Otherwise, require data to be a vector. + % + % TODO: Consider supporting nonvector 2D arrays. We chould reshape them + % to column vectors if needed. + + expectedShape = "vector"; + if isempty(data) + expectedShape = "2d"; + end + validateattributes(data, type, [expectedShape, "nonsparse", "real"]); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+array/Float32Array.m b/matlab/src/matlab/+arrow/+array/Float32Array.m index 3b4635a815152..f641463aa7f13 100644 --- a/matlab/src/matlab/+arrow/+array/Float32Array.m +++ b/matlab/src/matlab/+arrow/+array/Float32Array.m @@ -26,8 +26,7 @@ data opts.DeepCopy = false end - validateattributes(data, "single", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "single", "vector"); end + arrow.args.validateTypeAndShape(data, "single"); obj@arrow.array.Array("Name", "arrow.array.proxy.Float32Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/src/matlab/+arrow/+array/Float64Array.m b/matlab/src/matlab/+arrow/+array/Float64Array.m index 841bbcc6e7950..f8d8a671dbee3 100644 --- a/matlab/src/matlab/+arrow/+array/Float64Array.m +++ b/matlab/src/matlab/+arrow/+array/Float64Array.m @@ -25,16 +25,11 @@ function obj = Float64Array(data, opts) arguments data - opts.DeepCopy = false + opts.DeepCopy(1, 1) logical = false + opts.InferNulls(1, 1) logical = true end - - validateattributes(data, "double", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "double", "vector"); end - % Extract missing (i.e. null) values. - % TODO: Determine a more robust approach to handling "detection" of null values. - % For example - add a name-value pair to allow clients to choose which values - % should be considered null (if any). - validElements = ~isnan(data); + arrow.args.validateTypeAndShape(data, "double"); + validElements = arrow.args.parseValidElements(data, opts.InferNulls); obj@arrow.array.Array("Name", "arrow.array.proxy.Float64Array", "ConstructorArguments", {data, opts.DeepCopy, validElements}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end @@ -50,3 +45,4 @@ end end end + diff --git a/matlab/src/matlab/+arrow/+array/Int16Array.m b/matlab/src/matlab/+arrow/+array/Int16Array.m index f2b760412958a..e4c8589594d2e 100644 --- a/matlab/src/matlab/+arrow/+array/Int16Array.m +++ b/matlab/src/matlab/+arrow/+array/Int16Array.m @@ -26,8 +26,8 @@ data opts.DeepCopy = false end - validateattributes(data, "int16", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "int16", "vector"); end + + arrow.args.validateTypeAndShape(data, "int16"); obj@arrow.array.Array("Name", "arrow.array.proxy.Int16Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/src/matlab/+arrow/+array/Int32Array.m b/matlab/src/matlab/+arrow/+array/Int32Array.m index d493ee8ceb466..840321f65ef80 100644 --- a/matlab/src/matlab/+arrow/+array/Int32Array.m +++ b/matlab/src/matlab/+arrow/+array/Int32Array.m @@ -26,8 +26,7 @@ data opts.DeepCopy = false end - validateattributes(data, "int32", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "int32", "vector"); end + arrow.args.validateTypeAndShape(data, "int32"); obj@arrow.array.Array("Name", "arrow.array.proxy.Int32Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/src/matlab/+arrow/+array/Int64Array.m b/matlab/src/matlab/+arrow/+array/Int64Array.m index 85e9f2e62f64d..90d551242274e 100644 --- a/matlab/src/matlab/+arrow/+array/Int64Array.m +++ b/matlab/src/matlab/+arrow/+array/Int64Array.m @@ -26,8 +26,7 @@ data opts.DeepCopy = false end - validateattributes(data, "int64", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "int64", "vector"); end + arrow.args.validateTypeAndShape(data, "int64"); obj@arrow.array.Array("Name", "arrow.array.proxy.Int64Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/src/matlab/+arrow/+array/Int8Array.m b/matlab/src/matlab/+arrow/+array/Int8Array.m index 3452dd2d0f193..632e72e7eaddc 100644 --- a/matlab/src/matlab/+arrow/+array/Int8Array.m +++ b/matlab/src/matlab/+arrow/+array/Int8Array.m @@ -26,8 +26,8 @@ data opts.DeepCopy = false end - validateattributes(data, "int8", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "int8", "vector"); end + + arrow.args.validateTypeAndShape(data, "int8"); obj@arrow.array.Array("Name", "arrow.array.proxy.Int8Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/src/matlab/+arrow/+array/UInt16Array.m b/matlab/src/matlab/+arrow/+array/UInt16Array.m index 4abcf33241368..14318de1160c7 100644 --- a/matlab/src/matlab/+arrow/+array/UInt16Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt16Array.m @@ -26,8 +26,7 @@ data opts.DeepCopy = false end - validateattributes(data, "uint16", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "uint16", "vector"); end + arrow.args.validateTypeAndShape(data, "uint16"); obj@arrow.array.Array("Name", "arrow.array.proxy.UInt16Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/src/matlab/+arrow/+array/UInt32Array.m b/matlab/src/matlab/+arrow/+array/UInt32Array.m index 6113c816d8b62..9a5f63b2437df 100644 --- a/matlab/src/matlab/+arrow/+array/UInt32Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt32Array.m @@ -26,8 +26,7 @@ data opts.DeepCopy = false end - validateattributes(data, "uint32", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "uint32", "vector"); end + arrow.args.validateTypeAndShape(data, "uint32"); obj@arrow.array.Array("Name", "arrow.array.proxy.UInt32Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/src/matlab/+arrow/+array/UInt64Array.m b/matlab/src/matlab/+arrow/+array/UInt64Array.m index dd5f2f1274488..c5f819b33d720 100644 --- a/matlab/src/matlab/+arrow/+array/UInt64Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt64Array.m @@ -26,8 +26,7 @@ data opts.DeepCopy = false end - validateattributes(data, "uint64", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "uint64", "vector"); end + arrow.args.validateTypeAndShape(data, "uint64"); obj@arrow.array.Array("Name", "arrow.array.proxy.UInt64Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/src/matlab/+arrow/+array/UInt8Array.m b/matlab/src/matlab/+arrow/+array/UInt8Array.m index 400baee24476a..04c634be41353 100644 --- a/matlab/src/matlab/+arrow/+array/UInt8Array.m +++ b/matlab/src/matlab/+arrow/+array/UInt8Array.m @@ -26,8 +26,7 @@ data opts.DeepCopy = false end - validateattributes(data, "uint8", ["2d", "nonsparse", "real"]); - if ~isempty(data), validateattributes(data, "uint8", "vector"); end + arrow.args.validateTypeAndShape(data, "uint8"); obj@arrow.array.Array("Name", "arrow.array.proxy.UInt8Array", "ConstructorArguments", {data, opts.DeepCopy}); % Store a reference to the array if not doing a deep copy if (~opts.DeepCopy), obj.MatlabArray = data; end diff --git a/matlab/test/arrow/array/hNumericArray.m b/matlab/test/arrow/array/hNumericArray.m index f76daa0469465..3c6742ef2f7c5 100644 --- a/matlab/test/arrow/array/hNumericArray.m +++ b/matlab/test/arrow/array/hNumericArray.m @@ -117,17 +117,17 @@ function ErrorIfComplex(tc, MakeDeepCopy) tc.verifyError(fcn, "MATLAB:expectedReal"); end - function ErrorIfNotTwoDimensional(tc, MakeDeepCopy) + function ErrorIfNonVector(tc, MakeDeepCopy) data = tc.MatlabArrayFcn([1 2 3 4 5 6 7 8 9]); data = reshape(data, 3, 1, 3); fcn = @() tc.ArrowArrayConstructor(tc.MatlabArrayFcn(data), DeepCopy=MakeDeepCopy); - tc.verifyError(fcn, "MATLAB:expected2D"); + tc.verifyError(fcn, "MATLAB:expectedVector"); end - function ErrorIfNonVector(tc, MakeDeepCopy) - data = tc.MatlabArrayFcn([1 2; 3 4]); + function ErrorIfEmptyArrayIsNotTwoDimensional(tc, MakeDeepCopy) + data = tc.MatlabArrayFcn(reshape([], [1 0 0])); fcn = @() tc.ArrowArrayConstructor(data, DeepCopy=MakeDeepCopy); - tc.verifyError(fcn, "MATLAB:expectedVector"); + tc.verifyError(fcn, "MATLAB:expected2D"); end end end \ No newline at end of file diff --git a/matlab/test/arrow/array/tFloat64Array.m b/matlab/test/arrow/array/tFloat64Array.m index b166fd3195ec7..5358ffc8887fd 100755 --- a/matlab/test/arrow/array/tFloat64Array.m +++ b/matlab/test/arrow/array/tFloat64Array.m @@ -40,12 +40,29 @@ function ErrorIfSparse(testCase, MakeDeepCopy) function ValidBasic(testCase, MakeDeepCopy) % Create a MATLAB array with one null value (i.e. one NaN). + % Verify NaN is considered a null value by default. matlabArray = [1, NaN, 3]'; arrowArray = arrow.array.Float64Array(matlabArray, DeepCopy=MakeDeepCopy); expectedValid = [true, false, true]'; testCase.verifyEqual(arrowArray.Valid, expectedValid); end + function InferNulls(testCase, MakeDeepCopy) + matlabArray = [1, NaN, 3]; + + % Verify NaN is treated as a null value when InferNulls=true. + arrowArray1 = arrow.array.Float64Array(matlabArray, InferNulls=true, DeepCopy=MakeDeepCopy); + expectedValid1 = [true false true]'; + testCase.verifyEqual(arrowArray1.Valid, expectedValid1); + testCase.verifyEqual(toMATLAB(arrowArray1), matlabArray'); + + % Verify NaN is not treated as a null value when InferNulls=false. + arrowArray2 = arrow.array.Float64Array(matlabArray, InferNulls=false, DeepCopy=MakeDeepCopy); + expectedValid2 = [true true true]'; + testCase.verifyEqual(arrowArray2.Valid, expectedValid2); + testCase.verifyEqual(toMATLAB(arrowArray2), matlabArray'); + end + function ValidNoNulls(testCase, MakeDeepCopy) % Create a MATLAB array with no null values (i.e. no NaNs). matlabArray = [1, 2, 3]'; @@ -79,6 +96,5 @@ function ValidEmpty(testCase, MakeDeepCopy) arrowArray = arrow.array.Float64Array(matlabArray, DeepCopy=MakeDeepCopy); testCase.verifyEqual(arrowArray.Valid, expectedValid); end - end end