From 86adb8c035864975072ee2b37fb1806ad08455ce Mon Sep 17 00:00:00 2001 From: andrewaikens87 <16356240+andrewaikens87@users.noreply.github.com> Date: Tue, 12 Jul 2022 10:47:52 -0700 Subject: [PATCH] Improve icolumn.py cc 4/4 test_string_column (#414) Summary: Pull Request resolved: https://github.com/pytorch/torcharrow/pull/414 Improves test coverage for icolumn.py through test_string_column. icolumn.py has ~55% code coverage currently and this diff brings it to 78.3. Reviewed By: wenleix Differential Revision: D37289878 fbshipit-source-id: de03567d6beaf67267c78f6b2427bdcd018a9a90 --- torcharrow/icolumn.py | 16 +- torcharrow/test/test_numerical_column.py | 3 +- torcharrow/test/test_string_column.py | 189 ++++++++++++++++++++++ torcharrow/test/test_string_column_cpu.py | 48 ++++++ torcharrow/velox_rt/string_column_cpu.py | 17 -- 5 files changed, 247 insertions(+), 26 deletions(-) diff --git a/torcharrow/icolumn.py b/torcharrow/icolumn.py index 03ef09123..5442fabd3 100644 --- a/torcharrow/icolumn.py +++ b/torcharrow/icolumn.py @@ -270,7 +270,9 @@ def __len__(self): def __str__(self): item_padding = "'" if dt.is_string(self.dtype) else "" - return f"Column([{', '.join(f'{item_padding}{i}{item_padding}' for i in self)}], id = {self.id})" + return ( + f"Column([{', '.join(f'{item_padding}{i}{item_padding}' for i in self)}])" + ) def __repr__(self): item_padding = "'" if dt.is_string(self.dtype) else "" @@ -705,7 +707,7 @@ def filter( dtype: boolean, length: 2, null_count: 0 """ if columns is not None: - raise TypeError(f"columns parameter for flat columns not supported") + raise TypeError("columns parameter for flat columns not supported") if not isinstance(predicate, ty.Iterable) and not callable(predicate): raise TypeError( @@ -1006,8 +1008,6 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]): """ self._prototype_support_warning("fill_null") - if not isinstance(fill_value, Column._scalar_types): - raise TypeError(f"fill_null with {type(fill_value)} is not supported") if isinstance(fill_value, Column._scalar_types): res = Scope._EmptyColumn(self.dtype.constructor(nullable=False)) for m, i in self._items(): @@ -1017,7 +1017,9 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]): res._append_value(fill_value) return res._finalize() else: - raise TypeError(f"fill_null with {type(fill_value)} is not supported") + raise TypeError( + f"fill_null with {type(fill_value).__name__} is not supported" + ) @trace @expression @@ -1050,7 +1052,7 @@ def drop_null(self, how: ty.Literal["any", "all", None] = None): if how is not None: # "any or "all" is only used for DataFrame - raise TypeError(f"how parameter for flat columns not supported") + raise TypeError("how parameter for flat columns not supported") if dt.is_primitive(self.dtype): res = Scope._EmptyColumn(self.dtype.constructor(nullable=False)) @@ -1076,7 +1078,7 @@ def drop_duplicates( # TODO Add functionality for first and last assert keep == "first" if subset is not None: - raise TypeError(f"subset parameter for flat columns not supported") + raise TypeError("subset parameter for flat columns not supported") res = Scope._EmptyColumn(self._dtype) res._extend(list(OrderedDict.fromkeys(self))) return res._finalize() diff --git a/torcharrow/test/test_numerical_column.py b/torcharrow/test/test_numerical_column.py index 6b86798ff..14ae6a611 100644 --- a/torcharrow/test/test_numerical_column.py +++ b/torcharrow/test/test_numerical_column.py @@ -839,9 +839,8 @@ def base_test_batch_collate(self): def base_test_str(self): c = ta.column(list(range(5)), device=self.device) - c.id = 123 - expected = "Column([0, 1, 2, 3, 4], id = 123)" + expected = "Column([0, 1, 2, 3, 4])" self.assertEqual(expected, str(c)) def base_test_repr(self): diff --git a/torcharrow/test/test_string_column.py b/torcharrow/test/test_string_column.py index d254685cc..712260f36 100644 --- a/torcharrow/test/test_string_column.py +++ b/torcharrow/test/test_string_column.py @@ -345,6 +345,195 @@ def base_test_regular_expressions(self): ], ) + def base_test_is_unique(self): + unique_column = ta.column( + [f"test{x}" for x in range(3)], + device=self.device, + ) + + self.assertTrue(unique_column.is_unique) + + non_unique_column = ta.column( + [ + "test", + "test", + ], + device=self.device, + ) + + self.assertFalse(non_unique_column.is_unique) + + def base_test_is_monotonic_increasing(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + self.assertTrue(c.is_monotonic_increasing) + self.assertFalse(c.is_monotonic_decreasing) + + def base_test_is_monotonic_decreasing(self): + c = ta.column([f"test{x}" for x in range(5, 0, -1)], device=self.device) + self.assertFalse(c.is_monotonic_increasing) + self.assertTrue(c.is_monotonic_decreasing) + + def base_test_if_else(self): + left_repr = ["a1", "a2", "a3", "a4"] + right_repr = ["b1", "b2", "b3", "b4"] + float_type = ta.column( + [1.22, 2.22, 3.22, 4.22], dtype=dt.float32, device=self.device + ) + cond_repr = [True, False, True, False] + cond = ta.column(cond_repr, device=self.device) + left = ta.column(left_repr, device=self.device) + right = ta.column(right_repr, device=self.device) + + # Ensure py-iterables work as intended + expected = [left_repr[0], right_repr[1], left_repr[2], right_repr[3]] + result = ta.if_else(cond, left_repr, right_repr) + self.assertEqual(expected, list(result)) + + # Non common dtype + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="then and else branches must have compatible types, got.*and.*, respectively", + ): + ta.if_else(cond, left, float_type) + + # Invalid condition input + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="condition must be a boolean vector", + ): + ta.if_else( + cond=left, + left=left, + right=right, + ) + + def base_test_str(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + + expected = "Column(['test0', 'test1', 'test2', 'test3', 'test4'])" + self.assertEqual(expected, str(c)) + + def base_test_repr(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + + expected = ( + "0 'test0'\n" + "1 'test1'\n" + "2 'test2'\n" + "3 'test3'\n" + "4 'test4'\n" + f"dtype: string, length: 5, null_count: 0, device: {self.device}" + ) + self.assertEqual(expected, repr(c)) + + def base_test_is_valid_at(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + + self.assertTrue(all(c.is_valid_at(x) for x in range(5))) + + def base_test_cast(self): + c_repr = ["0", "1", "2", "3", "4", None] + c_repr_after_cast = [0, 1, 2, 3, 4, None] + c = ta.column(c_repr, device=self.device) + + result = c.cast(dt.int64) + self.assertEqual(c_repr_after_cast, list(result)) + + def base_test_drop_null(self): + c_repr = ["0", "1", "2", "3", "4", None] + c = ta.column(c_repr, device=self.device) + + result = c.drop_null() + + self.assertEqual(c_repr[:-1], list(result)) + + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="how parameter for flat columns not supported", + ): + c.drop_null(how="any") + + def base_test_drop_duplicates(self): + c_repr = ["test", "test2", "test3", "test"] + c = ta.column(c_repr, device=self.device) + + result = c.drop_duplicates() + + self.assertEqual(c_repr[:-1], list(result)) + + # TODO: Add functionality for last + with self.assertRaises(expected_exception=AssertionError): + c.drop_duplicates(keep="last") + + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="subset parameter for flat columns not supported", + ): + c.drop_duplicates(subset=c_repr[:2]) + + def base_test_fill_null(self): + c_repr = ["0", "1", None, "3", "4", None] + expected_fill = "TEST" + expected_repr = ["0", "1", expected_fill, "3", "4", expected_fill] + c = ta.column(c_repr, device=self.device) + + result = c.fill_null(expected_fill) + + self.assertEqual(expected_repr, list(result)) + + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="fill_null with bytes is not supported", + ): + c.fill_null(expected_fill.encode()) + + def base_test_isin(self): + c_repr = [f"test{x}" for x in range(5)] + c = ta.column(c_repr, device=self.device) + self.assertTrue(all(c.isin(values=c_repr + ["test_123"]))) + self.assertFalse(any(c.isin(values=["test5", "test6", "test7"]))) + + def base_test_bool(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + with self.assertRaisesRegex( + expected_exception=ValueError, + expected_regex=r"The truth value of a.*is ambiguous. Use a.any\(\) or a.all\(\).", + ): + bool(c) + + def base_test_flatmap(self): + c = ta.column(["test1", "test2", None, None, "test3"], device=self.device) + expected_result = [ + "test1", + "test1", + "test2", + "test2", + None, + None, + None, + None, + "test3", + "test3", + ] + result = c.flatmap(lambda xs: [xs, xs]) + self.assertEqual(expected_result, list(result)) + + def base_test_any(self): + c_some = ta.column(["test1", "test2", None, None, "test3"], device=self.device) + c_none = ta.column([], dtype=dt.string, device=self.device) + c_none = c_none.append([None]) + self.assertTrue(c_some.any()) + self.assertFalse(c_none.any()) + + def base_test_all(self): + c_all = ta.column(["test", "test2", "test3"], device=self.device) + c_partial = ta.column(["test", "test2", None, None], device=self.device) + c_none = ta.column([], dtype=dt.string, device=self.device) + c_none = c_none.append([None]) + self.assertTrue(c_all.all()) + self.assertTrue(c_partial.all()) + self.assertTrue(c_none.all()) + if __name__ == "__main__": unittest.main() diff --git a/torcharrow/test/test_string_column_cpu.py b/torcharrow/test/test_string_column_cpu.py index e58f38e55..b32d7a6e0 100644 --- a/torcharrow/test/test_string_column_cpu.py +++ b/torcharrow/test/test_string_column_cpu.py @@ -57,6 +57,54 @@ def test_string_pattern_matching_methods(self): def test_regular_expressions(self): self.base_test_regular_expressions() + def test_is_unique(self): + self.base_test_is_unique() + + def test_is_monotonic_increasing(self): + self.base_test_is_monotonic_increasing() + + def test_is_monotonic_decreasing(self): + self.base_test_is_monotonic_decreasing() + + def test_if_else(self): + self.base_test_if_else() + + def test_repr(self): + self.base_test_repr() + + def test_str(self): + self.base_test_str() + + def test_is_valid_at(self): + self.base_test_is_valid_at() + + def test_cast(self): + self.base_test_cast() + + def test_drop_null(self): + self.base_test_drop_null() + + def test_drop_duplicates(self): + self.base_test_drop_duplicates() + + def test_fill_null(self): + self.base_test_fill_null() + + def test_isin(self): + self.base_test_isin() + + def test_bool(self): + self.base_test_bool() + + def test_flatmap(self): + self.base_test_flatmap() + + def test_any(self): + self.base_test_any() + + def test_all(self): + self.base_test_all() + if __name__ == "__main__": unittest.main() diff --git a/torcharrow/velox_rt/string_column_cpu.py b/torcharrow/velox_rt/string_column_cpu.py index 8a32512b9..23576e8a4 100644 --- a/torcharrow/velox_rt/string_column_cpu.py +++ b/torcharrow/velox_rt/string_column_cpu.py @@ -187,23 +187,6 @@ def __gt__(self, other): def __ge__(self, other): return self._checked_binary_op_call(other, "gte") - # printing ---------------------------------------------------------------- - - def __str__(self): - def quote(x): - return f"'{x}'" - - return f"Column([{', '.join('None' if i is None else quote(i) for i in self)}])" - - def __repr__(self): - tab = tabulate( - [["None" if i is None else f"'{i}'"] for i in self], - tablefmt="plain", - showindex=True, - ) - typ = f"dtype: {self.dtype}, length: {self.length}, null_count: {self.null_count}, device: cpu" - return tab + dt.NL + typ - # interop def _to_tensor_default(self): # there are no string tensors, so we're using regular python list conversion