From 86adb8c035864975072ee2b37fb1806ad08455ce Mon Sep 17 00:00:00 2001
From: andrewaikens87 <16356240+andrewaikens87@users.noreply.github.com>
Date: Tue, 12 Jul 2022 10:47:52 -0700
Subject: [PATCH] Improve icolumn.py cc 4/4 test_string_column (#414)

Summary:
Pull Request resolved: https://github.com/pytorch/torcharrow/pull/414

Improves test coverage for icolumn.py through test_string_column.

icolumn.py has ~55% code coverage currently and this diff brings it to 78.3.

Reviewed By: wenleix

Differential Revision: D37289878

fbshipit-source-id: de03567d6beaf67267c78f6b2427bdcd018a9a90
---
 torcharrow/icolumn.py                     |  16 +-
 torcharrow/test/test_numerical_column.py  |   3 +-
 torcharrow/test/test_string_column.py     | 189 ++++++++++++++++++++++
 torcharrow/test/test_string_column_cpu.py |  48 ++++++
 torcharrow/velox_rt/string_column_cpu.py  |  17 --
 5 files changed, 247 insertions(+), 26 deletions(-)

diff --git a/torcharrow/icolumn.py b/torcharrow/icolumn.py
index 03ef09123..5442fabd3 100644
--- a/torcharrow/icolumn.py
+++ b/torcharrow/icolumn.py
@@ -270,7 +270,9 @@ def __len__(self):
 
     def __str__(self):
         item_padding = "'" if dt.is_string(self.dtype) else ""
-        return f"Column([{', '.join(f'{item_padding}{i}{item_padding}' for i in self)}], id = {self.id})"
+        return (
+            f"Column([{', '.join(f'{item_padding}{i}{item_padding}' for i in self)}])"
+        )
 
     def __repr__(self):
         item_padding = "'" if dt.is_string(self.dtype) else ""
@@ -705,7 +707,7 @@ def filter(
         dtype: boolean, length: 2, null_count: 0
         """
         if columns is not None:
-            raise TypeError(f"columns parameter for flat columns not supported")
+            raise TypeError("columns parameter for flat columns not supported")
 
         if not isinstance(predicate, ty.Iterable) and not callable(predicate):
             raise TypeError(
@@ -1006,8 +1008,6 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
         """
         self._prototype_support_warning("fill_null")
 
-        if not isinstance(fill_value, Column._scalar_types):
-            raise TypeError(f"fill_null with {type(fill_value)} is not supported")
         if isinstance(fill_value, Column._scalar_types):
             res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
             for m, i in self._items():
@@ -1017,7 +1017,9 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
                     res._append_value(fill_value)
             return res._finalize()
         else:
-            raise TypeError(f"fill_null with {type(fill_value)} is not supported")
+            raise TypeError(
+                f"fill_null with {type(fill_value).__name__} is not supported"
+            )
 
     @trace
     @expression
@@ -1050,7 +1052,7 @@ def drop_null(self, how: ty.Literal["any", "all", None] = None):
 
         if how is not None:
             # "any or "all" is only used for DataFrame
-            raise TypeError(f"how parameter for flat columns not supported")
+            raise TypeError("how parameter for flat columns not supported")
 
         if dt.is_primitive(self.dtype):
             res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
@@ -1076,7 +1078,7 @@ def drop_duplicates(
         # TODO Add functionality for first and last
         assert keep == "first"
         if subset is not None:
-            raise TypeError(f"subset parameter for flat columns not supported")
+            raise TypeError("subset parameter for flat columns not supported")
         res = Scope._EmptyColumn(self._dtype)
         res._extend(list(OrderedDict.fromkeys(self)))
         return res._finalize()
diff --git a/torcharrow/test/test_numerical_column.py b/torcharrow/test/test_numerical_column.py
index 6b86798ff..14ae6a611 100644
--- a/torcharrow/test/test_numerical_column.py
+++ b/torcharrow/test/test_numerical_column.py
@@ -839,9 +839,8 @@ def base_test_batch_collate(self):
 
     def base_test_str(self):
         c = ta.column(list(range(5)), device=self.device)
-        c.id = 123
 
-        expected = "Column([0, 1, 2, 3, 4], id = 123)"
+        expected = "Column([0, 1, 2, 3, 4])"
         self.assertEqual(expected, str(c))
 
     def base_test_repr(self):
diff --git a/torcharrow/test/test_string_column.py b/torcharrow/test/test_string_column.py
index d254685cc..712260f36 100644
--- a/torcharrow/test/test_string_column.py
+++ b/torcharrow/test/test_string_column.py
@@ -345,6 +345,195 @@ def base_test_regular_expressions(self):
             ],
         )
 
+    def base_test_is_unique(self):
+        unique_column = ta.column(
+            [f"test{x}" for x in range(3)],
+            device=self.device,
+        )
+
+        self.assertTrue(unique_column.is_unique)
+
+        non_unique_column = ta.column(
+            [
+                "test",
+                "test",
+            ],
+            device=self.device,
+        )
+
+        self.assertFalse(non_unique_column.is_unique)
+
+    def base_test_is_monotonic_increasing(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+        self.assertTrue(c.is_monotonic_increasing)
+        self.assertFalse(c.is_monotonic_decreasing)
+
+    def base_test_is_monotonic_decreasing(self):
+        c = ta.column([f"test{x}" for x in range(5, 0, -1)], device=self.device)
+        self.assertFalse(c.is_monotonic_increasing)
+        self.assertTrue(c.is_monotonic_decreasing)
+
+    def base_test_if_else(self):
+        left_repr = ["a1", "a2", "a3", "a4"]
+        right_repr = ["b1", "b2", "b3", "b4"]
+        float_type = ta.column(
+            [1.22, 2.22, 3.22, 4.22], dtype=dt.float32, device=self.device
+        )
+        cond_repr = [True, False, True, False]
+        cond = ta.column(cond_repr, device=self.device)
+        left = ta.column(left_repr, device=self.device)
+        right = ta.column(right_repr, device=self.device)
+
+        # Ensure py-iterables work as intended
+        expected = [left_repr[0], right_repr[1], left_repr[2], right_repr[3]]
+        result = ta.if_else(cond, left_repr, right_repr)
+        self.assertEqual(expected, list(result))
+
+        # Non common dtype
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="then and else branches must have compatible types, got.*and.*, respectively",
+        ):
+            ta.if_else(cond, left, float_type)
+
+        # Invalid condition input
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="condition must be a boolean vector",
+        ):
+            ta.if_else(
+                cond=left,
+                left=left,
+                right=right,
+            )
+
+    def base_test_str(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+
+        expected = "Column(['test0', 'test1', 'test2', 'test3', 'test4'])"
+        self.assertEqual(expected, str(c))
+
+    def base_test_repr(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+
+        expected = (
+            "0  'test0'\n"
+            "1  'test1'\n"
+            "2  'test2'\n"
+            "3  'test3'\n"
+            "4  'test4'\n"
+            f"dtype: string, length: 5, null_count: 0, device: {self.device}"
+        )
+        self.assertEqual(expected, repr(c))
+
+    def base_test_is_valid_at(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+
+        self.assertTrue(all(c.is_valid_at(x) for x in range(5)))
+
+    def base_test_cast(self):
+        c_repr = ["0", "1", "2", "3", "4", None]
+        c_repr_after_cast = [0, 1, 2, 3, 4, None]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.cast(dt.int64)
+        self.assertEqual(c_repr_after_cast, list(result))
+
+    def base_test_drop_null(self):
+        c_repr = ["0", "1", "2", "3", "4", None]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.drop_null()
+
+        self.assertEqual(c_repr[:-1], list(result))
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="how parameter for flat columns not supported",
+        ):
+            c.drop_null(how="any")
+
+    def base_test_drop_duplicates(self):
+        c_repr = ["test", "test2", "test3", "test"]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.drop_duplicates()
+
+        self.assertEqual(c_repr[:-1], list(result))
+
+        # TODO: Add functionality for last
+        with self.assertRaises(expected_exception=AssertionError):
+            c.drop_duplicates(keep="last")
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="subset parameter for flat columns not supported",
+        ):
+            c.drop_duplicates(subset=c_repr[:2])
+
+    def base_test_fill_null(self):
+        c_repr = ["0", "1", None, "3", "4", None]
+        expected_fill = "TEST"
+        expected_repr = ["0", "1", expected_fill, "3", "4", expected_fill]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.fill_null(expected_fill)
+
+        self.assertEqual(expected_repr, list(result))
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="fill_null with bytes is not supported",
+        ):
+            c.fill_null(expected_fill.encode())
+
+    def base_test_isin(self):
+        c_repr = [f"test{x}" for x in range(5)]
+        c = ta.column(c_repr, device=self.device)
+        self.assertTrue(all(c.isin(values=c_repr + ["test_123"])))
+        self.assertFalse(any(c.isin(values=["test5", "test6", "test7"])))
+
+    def base_test_bool(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+        with self.assertRaisesRegex(
+            expected_exception=ValueError,
+            expected_regex=r"The truth value of a.*is ambiguous. Use a.any\(\) or a.all\(\).",
+        ):
+            bool(c)
+
+    def base_test_flatmap(self):
+        c = ta.column(["test1", "test2", None, None, "test3"], device=self.device)
+        expected_result = [
+            "test1",
+            "test1",
+            "test2",
+            "test2",
+            None,
+            None,
+            None,
+            None,
+            "test3",
+            "test3",
+        ]
+        result = c.flatmap(lambda xs: [xs, xs])
+        self.assertEqual(expected_result, list(result))
+
+    def base_test_any(self):
+        c_some = ta.column(["test1", "test2", None, None, "test3"], device=self.device)
+        c_none = ta.column([], dtype=dt.string, device=self.device)
+        c_none = c_none.append([None])
+        self.assertTrue(c_some.any())
+        self.assertFalse(c_none.any())
+
+    def base_test_all(self):
+        c_all = ta.column(["test", "test2", "test3"], device=self.device)
+        c_partial = ta.column(["test", "test2", None, None], device=self.device)
+        c_none = ta.column([], dtype=dt.string, device=self.device)
+        c_none = c_none.append([None])
+        self.assertTrue(c_all.all())
+        self.assertTrue(c_partial.all())
+        self.assertTrue(c_none.all())
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torcharrow/test/test_string_column_cpu.py b/torcharrow/test/test_string_column_cpu.py
index e58f38e55..b32d7a6e0 100644
--- a/torcharrow/test/test_string_column_cpu.py
+++ b/torcharrow/test/test_string_column_cpu.py
@@ -57,6 +57,54 @@ def test_string_pattern_matching_methods(self):
     def test_regular_expressions(self):
         self.base_test_regular_expressions()
 
+    def test_is_unique(self):
+        self.base_test_is_unique()
+
+    def test_is_monotonic_increasing(self):
+        self.base_test_is_monotonic_increasing()
+
+    def test_is_monotonic_decreasing(self):
+        self.base_test_is_monotonic_decreasing()
+
+    def test_if_else(self):
+        self.base_test_if_else()
+
+    def test_repr(self):
+        self.base_test_repr()
+
+    def test_str(self):
+        self.base_test_str()
+
+    def test_is_valid_at(self):
+        self.base_test_is_valid_at()
+
+    def test_cast(self):
+        self.base_test_cast()
+
+    def test_drop_null(self):
+        self.base_test_drop_null()
+
+    def test_drop_duplicates(self):
+        self.base_test_drop_duplicates()
+
+    def test_fill_null(self):
+        self.base_test_fill_null()
+
+    def test_isin(self):
+        self.base_test_isin()
+
+    def test_bool(self):
+        self.base_test_bool()
+
+    def test_flatmap(self):
+        self.base_test_flatmap()
+
+    def test_any(self):
+        self.base_test_any()
+
+    def test_all(self):
+        self.base_test_all()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torcharrow/velox_rt/string_column_cpu.py b/torcharrow/velox_rt/string_column_cpu.py
index 8a32512b9..23576e8a4 100644
--- a/torcharrow/velox_rt/string_column_cpu.py
+++ b/torcharrow/velox_rt/string_column_cpu.py
@@ -187,23 +187,6 @@ def __gt__(self, other):
     def __ge__(self, other):
         return self._checked_binary_op_call(other, "gte")
 
-    # printing ----------------------------------------------------------------
-
-    def __str__(self):
-        def quote(x):
-            return f"'{x}'"
-
-        return f"Column([{', '.join('None' if i is None else quote(i) for i in self)}])"
-
-    def __repr__(self):
-        tab = tabulate(
-            [["None" if i is None else f"'{i}'"] for i in self],
-            tablefmt="plain",
-            showindex=True,
-        )
-        typ = f"dtype: {self.dtype}, length: {self.length}, null_count: {self.null_count}, device: cpu"
-        return tab + dt.NL + typ
-
     # interop
     def _to_tensor_default(self):
         # there are no string tensors, so we're using regular python list conversion