From f8e1521f114c473cfa22f7b0bde822e38f61ef0e Mon Sep 17 00:00:00 2001 From: Andrew Aikens Date: Wed, 29 Jun 2022 11:04:17 -0700 Subject: [PATCH 1/4] Improve icolumn.py cc 2/n test_map_column Differential Revision: D37492649 fbshipit-source-id: 2fe5232c6e50e12e08c72e977dfad3a7169b1747 --- torcharrow/test/test_map_column.py | 52 ++++++++++++++++++++++++++ torcharrow/test/test_map_column_cpu.py | 9 +++++ 2 files changed, 61 insertions(+) diff --git a/torcharrow/test/test_map_column.py b/torcharrow/test/test_map_column.py index 13531fdaf..be5754625 100644 --- a/torcharrow/test/test_map_column.py +++ b/torcharrow/test/test_map_column.py @@ -63,6 +63,58 @@ def base_test_keys_values_get(self): self.assertEqual(list(c.maps.values()), [[123], [45, 67], None]) self.assertEqual(list(c.maps.get("de", 0)), [0, 45, None]) + def base_test_get_operator(self): + col_rep = [ + {"helsinki": [-1.0, 21.0], "moscow": [-4.0, 24.0]}, + {}, + {"nowhere": [], "algiers": [11.0, 25, 2], "kinshasa": [22.0, 26.0]}, + ] + c = ta.column( + col_rep, + device=self.device, + ) + indicies = [0, 2] + expected = [col_rep[i] for i in indicies] + result = [c[i] for i in indicies] + self.assertEqual(expected, result) + + def base_test_slice_operation(self): + col_rep = [ + {"helsinki": [-1.0, 21.0], "moscow": [-4.0, 24.0]}, + {}, + {"nowhere": [], "algiers": [11.0, 25, 2], "kinshasa": [22.0, 26.0]}, + {"london": [], "new york": [500]}, + ] + c = ta.column( + col_rep, + device=self.device, + ) + expected_slice_every_other = col_rep[0:4:2] + result_every_other = c[0:4:2] + self.assertEqual(expected_slice_every_other, list(result_every_other)) + + expected_slice_most = col_rep[1:] + result_most = c[1:4:1] + self.assertEqual(expected_slice_most, list(result_most)) + + def base_test_equality_operators(self): + col_rep = [ + {"helsinki": [-1.0, 21.0], "moscow": [-4.0, 24.0]}, + {"boston": [-4.0]}, + {"nowhere": [], "algiers": [11.0, 25, 2], "kinshasa": [22.0, 26.0]}, + {"london": [], "new york": [500]}, + ] + c = ta.column( + col_rep, + device=self.device, + ) + c2 = ta.column( + col_rep, + device=self.device, + ) + self.assertTrue(all(c == c2)) + self.assertFalse(any(c != c2)) + if __name__ == "__main__": unittest.main() diff --git a/torcharrow/test/test_map_column_cpu.py b/torcharrow/test/test_map_column_cpu.py index 59b521447..5e5d7f3a0 100644 --- a/torcharrow/test/test_map_column_cpu.py +++ b/torcharrow/test/test_map_column_cpu.py @@ -22,6 +22,15 @@ def test_infer(self): def test_keys_values_get(self): self.base_test_keys_values_get() + def test_get_operator(self): + self.base_test_get_operator() + + def test_slice_operation(self): + self.base_test_slice_operation() + + def test_equality_operators(self): + self.base_test_equality_operators() + if __name__ == "__main__": unittest.main() From a46443e57b6b2203fcf3d7f0ad951ee7b4100409 Mon Sep 17 00:00:00 2001 From: Andrew Aikens Date: Wed, 29 Jun 2022 11:04:17 -0700 Subject: [PATCH 2/4] Improve icolumn.py cc 3/n test_list_column Differential Revision: D37492647 fbshipit-source-id: 962bce3f41616f30793f509317f62889f0787ef0 --- torcharrow/icolumn.py | 4 ++-- torcharrow/test/test_list_column.py | 17 +++++++++++++++++ torcharrow/test/test_list_column_cpu.py | 3 +++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/torcharrow/icolumn.py b/torcharrow/icolumn.py index dc6c1ff34..03ef09123 100644 --- a/torcharrow/icolumn.py +++ b/torcharrow/icolumn.py @@ -254,8 +254,8 @@ def cast(self, dtype): res._append_value(fun(i)) return res._finalize() else: - raise TypeError('f"{astype}({dtype}) is not supported")') - raise TypeError('f"{astype} for {type(self).__name__} is not supported")') + raise TypeError(f"{dtype} for {type(self).__name__} is not supported") + raise TypeError(f"{self.dtype} for {type(self).__name__} is not supported") # public simple observers ------------------------------------------------- diff --git a/torcharrow/test/test_list_column.py b/torcharrow/test/test_list_column.py index c81c1efa7..b18db0b52 100644 --- a/torcharrow/test/test_list_column.py +++ b/torcharrow/test/test_list_column.py @@ -208,6 +208,23 @@ def base_test_fixed_size_list(self): f"Unexpected failure reason: {str(ex.exception)}", ) + def base_test_cast(self): + list_dtype = dt.List(item_dtype=dt.int64, fixed_size=2) + c_list = ta.column( + [[1, 2], [3, 4]], + dtype=list_dtype, + device=self.device, + ) + + int_dtype = dt.int64 + # TODO: Nested cast should be supported in the future + for arg in (int_dtype, list_dtype): + with self.assertRaisesRegexp( + expected_exception=TypeError, + expected_regex=r"List\(int64, fixed_size=2\) for.*is not supported", + ): + c_list.cast(arg) + if __name__ == "__main__": unittest.main() diff --git a/torcharrow/test/test_list_column_cpu.py b/torcharrow/test/test_list_column_cpu.py index bd774bc7f..72db1bae7 100644 --- a/torcharrow/test/test_list_column_cpu.py +++ b/torcharrow/test/test_list_column_cpu.py @@ -46,6 +46,9 @@ def test_map_reduce_etc(self): def test_fixed_size_list(self): self.base_test_fixed_size_list() + def test_cast(self): + self.base_test_cast() + if __name__ == "__main__": unittest.main() From a52ec8c08f1432ee6b4c9d34d00ae426c4541be7 Mon Sep 17 00:00:00 2001 From: Andrew Aikens Date: Wed, 29 Jun 2022 11:04:17 -0700 Subject: [PATCH 3/4] Enforce 90% cc for icolumn.py Differential Revision: D37289878 fbshipit-source-id: de712533ae8efea297088ea3224bbb306f393d70 --- torcharrow/icolumn.py | 12 +- torcharrow/test/test_string_column.py | 221 ++++++++++++++++++++++ torcharrow/test/test_string_column_cpu.py | 48 +++++ torcharrow/velox_rt/string_column_cpu.py | 17 -- 4 files changed, 275 insertions(+), 23 deletions(-) diff --git a/torcharrow/icolumn.py b/torcharrow/icolumn.py index 03ef09123..158fe6907 100644 --- a/torcharrow/icolumn.py +++ b/torcharrow/icolumn.py @@ -705,7 +705,7 @@ def filter( dtype: boolean, length: 2, null_count: 0 """ if columns is not None: - raise TypeError(f"columns parameter for flat columns not supported") + raise TypeError("columns parameter for flat columns not supported") if not isinstance(predicate, ty.Iterable) and not callable(predicate): raise TypeError( @@ -1006,8 +1006,6 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]): """ self._prototype_support_warning("fill_null") - if not isinstance(fill_value, Column._scalar_types): - raise TypeError(f"fill_null with {type(fill_value)} is not supported") if isinstance(fill_value, Column._scalar_types): res = Scope._EmptyColumn(self.dtype.constructor(nullable=False)) for m, i in self._items(): @@ -1017,7 +1015,9 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]): res._append_value(fill_value) return res._finalize() else: - raise TypeError(f"fill_null with {type(fill_value)} is not supported") + raise TypeError( + f"fill_null with {type(fill_value).__name__} is not supported" + ) @trace @expression @@ -1050,7 +1050,7 @@ def drop_null(self, how: ty.Literal["any", "all", None] = None): if how is not None: # "any or "all" is only used for DataFrame - raise TypeError(f"how parameter for flat columns not supported") + raise TypeError("how parameter for flat columns not supported") if dt.is_primitive(self.dtype): res = Scope._EmptyColumn(self.dtype.constructor(nullable=False)) @@ -1076,7 +1076,7 @@ def drop_duplicates( # TODO Add functionality for first and last assert keep == "first" if subset is not None: - raise TypeError(f"subset parameter for flat columns not supported") + raise TypeError("subset parameter for flat columns not supported") res = Scope._EmptyColumn(self._dtype) res._extend(list(OrderedDict.fromkeys(self))) return res._finalize() diff --git a/torcharrow/test/test_string_column.py b/torcharrow/test/test_string_column.py index d254685cc..171966d38 100644 --- a/torcharrow/test/test_string_column.py +++ b/torcharrow/test/test_string_column.py @@ -6,6 +6,7 @@ import typing as ty import unittest +from unittest import mock import torcharrow as ta import torcharrow.dtypes as dt @@ -345,6 +346,226 @@ def base_test_regular_expressions(self): ], ) + def base_test_is_unique(self): + unique_column = ta.column( + [f"test{x}" for x in range(3)], + device=self.device, + ) + + self.assertTrue(unique_column.is_unique) + + non_unique_column = ta.column( + [ + "test", + "test", + ], + device=self.device, + ) + + self.assertFalse(non_unique_column.is_unique) + + def base_test_is_monotonic_increasing(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + self.assertTrue(c.is_monotonic_increasing) + self.assertFalse(c.is_monotonic_decreasing) + + def base_test_is_monotonic_decreasing(self): + c = ta.column([f"test{x}" for x in range(5, 0, -1)], device=self.device) + self.assertFalse(c.is_monotonic_increasing) + self.assertTrue(c.is_monotonic_decreasing) + + def base_test_if_else(self): + left_repr = ["a1", "a2", "a3", "a4"] + right_repr = ["b1", "b2", "b3", "b4"] + cond_repr = [True, False, True, False] + cond = ta.column(cond_repr, device=self.device) + left = ta.column(left_repr, device=self.device) + right = ta.column(right_repr, device=self.device) + + # Ensure py-iterables work as intended + expected = [left_repr[0], right_repr[1], left_repr[2], right_repr[3]] + result = ta.if_else(cond, left_repr, right_repr) + self.assertEqual(expected, list(result)) + + # Non common d type + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="then and else branches must have compatible types, got.*and.*, respectively", + ), mock.patch("torcharrow.icolumn.dt.common_dtype") as mock_common_dtype: + mock_common_dtype.return_value = None + + ta.if_else(cond, left, right) + + mock_common_dtype.assert_called_once_with( + left.dtype, + right.dtype, + ) + + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="then and else branches must have compatible types, got.*and.*, respectively", + ), mock.patch( + "torcharrow.icolumn.dt.common_dtype" + ) as mock_common_dtype, mock.patch( + "torcharrow.icolumn.dt.is_void" + ) as mock_is_void: + mock_is_void.return_value = True + mock_common_dtype.return_value = dt.int64 + + ta.if_else(cond, left, right) + mock_common_dtype.assert_called_once_with( + left.dtype, + right.dtype, + ) + mock_is_void.assert_called_once_with(mock_common_dtype.return_value) + + # Invalid condition input + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="condition must be a boolean vector", + ): + ta.if_else( + cond=left, + left=left, + right=right, + ) + + def base_test_str(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + c.id = 321 + + expected = "Column(['test0', 'test1', 'test2', 'test3', 'test4'], id = 321)" + self.assertEqual(expected, str(c)) + + def base_test_repr(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + + expected = ( + "0 'test0'\n" + "1 'test1'\n" + "2 'test2'\n" + "3 'test3'\n" + "4 'test4'\n" + f"dtype: string, length: 5, null_count: 0, device: {self.device}" + ) + self.assertEqual(expected, repr(c)) + + def base_test_is_valid_at(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + + # Normal access + self.assertTrue(all(c.is_valid_at(x) for x in range(5))) + + # Negative access + self.assertTrue(c.is_valid_at(-1)) + + # Out of bounds access + with self.assertRaises(expected_exception=RuntimeError): + self.assertFalse(c.is_valid_at(10)) + + def base_test_cast(self): + c_repr = ["0", "1", "2", "3", "4", None] + c_repr_after_cast = [0, 1, 2, 3, 4, None] + c = ta.column(c_repr, device=self.device) + + result = c.cast(dt.int64) + self.assertEqual(c_repr_after_cast, list(result)) + + def base_test_drop_null(self): + c_repr = ["0", "1", "2", "3", "4", None] + c = ta.column(c_repr, device=self.device) + + result = c.drop_null() + + self.assertEqual(c_repr[:-1], list(result)) + + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="how parameter for flat columns not supported", + ): + c.drop_null(how="any") + + def base_test_drop_duplicates(self): + c_repr = ["test", "test2", "test3", "test"] + c = ta.column(c_repr, device=self.device) + + result = c.drop_duplicates() + + self.assertEqual(c_repr[:-1], list(result)) + + # TODO: Add functionality for last + with self.assertRaises(expected_exception=AssertionError): + c.drop_duplicates(keep="last") + + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="subset parameter for flat columns not supported", + ): + c.drop_duplicates(subset=c_repr[:2]) + + def base_test_fill_null(self): + c_repr = ["0", "1", None, "3", "4", None] + expected_fill = "TEST" + expected_repr = ["0", "1", expected_fill, "3", "4", expected_fill] + c = ta.column(c_repr, device=self.device) + + result = c.fill_null(expected_fill) + + self.assertEqual(expected_repr, list(result)) + + with self.assertRaisesRegex( + expected_exception=TypeError, + expected_regex="fill_null with bytes is not supported", + ): + c.fill_null(expected_fill.encode()) + + def base_test_isin(self): + c_repr = [f"test{x}" for x in range(5)] + c = ta.column(c_repr, device=self.device) + self.assertTrue(all(c.isin(values=c_repr + ["test_123"]))) + self.assertFalse(any(c.isin(values=["test5", "test6", "test7"]))) + + def base_test_bool(self): + c = ta.column([f"test{x}" for x in range(5)], device=self.device) + with self.assertRaisesRegex( + expected_exception=ValueError, + expected_regex=r"The truth value of a.*is ambiguous. Use a.any\(\) or a.all\(\).", + ): + bool(c) + + def base_test_flatmap(self): + c = ta.column(["test1", "test2", None, None, "test3"], device=self.device) + expected_result = [ + "test1", + "test1", + "test2", + "test2", + None, + None, + None, + None, + "test3", + "test3", + ] + result = c.flatmap(lambda xs: [xs, xs]) + self.assertEqual(expected_result, list(result)) + + def base_test_any(self): + c_some = ta.column(["test1", "test2", None, None, "test3"], device=self.device) + c_none = ta.column([], dtype=dt.string, device=self.device) + c_none = c_none.append([None]) + self.assertTrue(any(c_some)) + self.assertFalse(any(c_none)) + + def base_test_all(self): + c_all = ta.column(["test", "test2", "test3"], device=self.device) + c_partial = ta.column(["test", "test2", None, None], device=self.device) + c_none = ta.column([], dtype=dt.string, device=self.device) + c_none = c_none.append([None]) + self.assertTrue(all(c_all)) + self.assertFalse(all(c_partial)) + self.assertFalse(all(c_none)) + if __name__ == "__main__": unittest.main() diff --git a/torcharrow/test/test_string_column_cpu.py b/torcharrow/test/test_string_column_cpu.py index e58f38e55..b32d7a6e0 100644 --- a/torcharrow/test/test_string_column_cpu.py +++ b/torcharrow/test/test_string_column_cpu.py @@ -57,6 +57,54 @@ def test_string_pattern_matching_methods(self): def test_regular_expressions(self): self.base_test_regular_expressions() + def test_is_unique(self): + self.base_test_is_unique() + + def test_is_monotonic_increasing(self): + self.base_test_is_monotonic_increasing() + + def test_is_monotonic_decreasing(self): + self.base_test_is_monotonic_decreasing() + + def test_if_else(self): + self.base_test_if_else() + + def test_repr(self): + self.base_test_repr() + + def test_str(self): + self.base_test_str() + + def test_is_valid_at(self): + self.base_test_is_valid_at() + + def test_cast(self): + self.base_test_cast() + + def test_drop_null(self): + self.base_test_drop_null() + + def test_drop_duplicates(self): + self.base_test_drop_duplicates() + + def test_fill_null(self): + self.base_test_fill_null() + + def test_isin(self): + self.base_test_isin() + + def test_bool(self): + self.base_test_bool() + + def test_flatmap(self): + self.base_test_flatmap() + + def test_any(self): + self.base_test_any() + + def test_all(self): + self.base_test_all() + if __name__ == "__main__": unittest.main() diff --git a/torcharrow/velox_rt/string_column_cpu.py b/torcharrow/velox_rt/string_column_cpu.py index 8a32512b9..23576e8a4 100644 --- a/torcharrow/velox_rt/string_column_cpu.py +++ b/torcharrow/velox_rt/string_column_cpu.py @@ -187,23 +187,6 @@ def __gt__(self, other): def __ge__(self, other): return self._checked_binary_op_call(other, "gte") - # printing ---------------------------------------------------------------- - - def __str__(self): - def quote(x): - return f"'{x}'" - - return f"Column([{', '.join('None' if i is None else quote(i) for i in self)}])" - - def __repr__(self): - tab = tabulate( - [["None" if i is None else f"'{i}'"] for i in self], - tablefmt="plain", - showindex=True, - ) - typ = f"dtype: {self.dtype}, length: {self.length}, null_count: {self.null_count}, device: cpu" - return tab + dt.NL + typ - # interop def _to_tensor_default(self): # there are no string tensors, so we're using regular python list conversion From 224a4dfc580ce421c1571101fb62ba4ae7dfc49f Mon Sep 17 00:00:00 2001 From: andrewaikens87 <16356240+andrewaikens87@users.noreply.github.com> Date: Wed, 29 Jun 2022 11:07:26 -0700 Subject: [PATCH 4/4] Remove unused icolumn.py private methods Summary: Removes unused private methods in icolumn.py bringing code coverage to 79.2%. Differential Revision: D37503197 fbshipit-source-id: 2508872b42700a91f83c309f0b6160e2dd0c534e --- torcharrow/icolumn.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/torcharrow/icolumn.py b/torcharrow/icolumn.py index 158fe6907..34f76c7ca 100644 --- a/torcharrow/icolumn.py +++ b/torcharrow/icolumn.py @@ -1417,40 +1417,3 @@ def _to_tensor_default(self): def _count(self): """Return number of non-NA/null observations pgf the column/frame""" return len(self) - self.null_count - - @trace - @expression - def _nlargest( - self, - n=5, - columns: ty.Optional[ty.List[str]] = None, - keep: ty.Literal["last", "first"] = "first", - ): - """Returns a new data of the *n* largest element.""" - # keep="all" not supported - if columns is not None: - raise TypeError( - "computing n-largest on non-structured column can't have 'columns' parameter" - ) - return self.sort(ascending=False).head(n) - - @trace - @expression - def _nsmallest(self, n=5, columns: ty.Optional[ty.List[str]] = None, keep="first"): - """Returns a new data of the *n* smallest element.""" - # keep="all" not supported - if columns is not None: - raise TypeError( - "computing n-smallest on non-structured column can't have 'columns' parameter" - ) - - return self.sort(ascending=True).head(n) - - @trace - @expression - def _nunique(self, drop_null=True): - """Returns the number of unique values of the column""" - if not drop_null: - return len(set(self)) - else: - return len(set(i for i in self if i is not None))