From f8e1521f114c473cfa22f7b0bde822e38f61ef0e Mon Sep 17 00:00:00 2001
From: Andrew Aikens <andrewaikens@fb.com>
Date: Wed, 29 Jun 2022 11:04:17 -0700
Subject: [PATCH 1/4] Improve icolumn.py cc 2/n test_map_column

Differential Revision: D37492649

fbshipit-source-id: 2fe5232c6e50e12e08c72e977dfad3a7169b1747
---
 torcharrow/test/test_map_column.py     | 52 ++++++++++++++++++++++++++
 torcharrow/test/test_map_column_cpu.py |  9 +++++
 2 files changed, 61 insertions(+)

diff --git a/torcharrow/test/test_map_column.py b/torcharrow/test/test_map_column.py
index 13531fdaf..be5754625 100644
--- a/torcharrow/test/test_map_column.py
+++ b/torcharrow/test/test_map_column.py
@@ -63,6 +63,58 @@ def base_test_keys_values_get(self):
         self.assertEqual(list(c.maps.values()), [[123], [45, 67], None])
         self.assertEqual(list(c.maps.get("de", 0)), [0, 45, None])
 
+    def base_test_get_operator(self):
+        col_rep = [
+            {"helsinki": [-1.0, 21.0], "moscow": [-4.0, 24.0]},
+            {},
+            {"nowhere": [], "algiers": [11.0, 25, 2], "kinshasa": [22.0, 26.0]},
+        ]
+        c = ta.column(
+            col_rep,
+            device=self.device,
+        )
+        indicies = [0, 2]
+        expected = [col_rep[i] for i in indicies]
+        result = [c[i] for i in indicies]
+        self.assertEqual(expected, result)
+
+    def base_test_slice_operation(self):
+        col_rep = [
+            {"helsinki": [-1.0, 21.0], "moscow": [-4.0, 24.0]},
+            {},
+            {"nowhere": [], "algiers": [11.0, 25, 2], "kinshasa": [22.0, 26.0]},
+            {"london": [], "new york": [500]},
+        ]
+        c = ta.column(
+            col_rep,
+            device=self.device,
+        )
+        expected_slice_every_other = col_rep[0:4:2]
+        result_every_other = c[0:4:2]
+        self.assertEqual(expected_slice_every_other, list(result_every_other))
+
+        expected_slice_most = col_rep[1:]
+        result_most = c[1:4:1]
+        self.assertEqual(expected_slice_most, list(result_most))
+
+    def base_test_equality_operators(self):
+        col_rep = [
+            {"helsinki": [-1.0, 21.0], "moscow": [-4.0, 24.0]},
+            {"boston": [-4.0]},
+            {"nowhere": [], "algiers": [11.0, 25, 2], "kinshasa": [22.0, 26.0]},
+            {"london": [], "new york": [500]},
+        ]
+        c = ta.column(
+            col_rep,
+            device=self.device,
+        )
+        c2 = ta.column(
+            col_rep,
+            device=self.device,
+        )
+        self.assertTrue(all(c == c2))
+        self.assertFalse(any(c != c2))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torcharrow/test/test_map_column_cpu.py b/torcharrow/test/test_map_column_cpu.py
index 59b521447..5e5d7f3a0 100644
--- a/torcharrow/test/test_map_column_cpu.py
+++ b/torcharrow/test/test_map_column_cpu.py
@@ -22,6 +22,15 @@ def test_infer(self):
     def test_keys_values_get(self):
         self.base_test_keys_values_get()
 
+    def test_get_operator(self):
+        self.base_test_get_operator()
+
+    def test_slice_operation(self):
+        self.base_test_slice_operation()
+
+    def test_equality_operators(self):
+        self.base_test_equality_operators()
+
 
 if __name__ == "__main__":
     unittest.main()

From a46443e57b6b2203fcf3d7f0ad951ee7b4100409 Mon Sep 17 00:00:00 2001
From: Andrew Aikens <andrewaikens@fb.com>
Date: Wed, 29 Jun 2022 11:04:17 -0700
Subject: [PATCH 2/4] Improve icolumn.py cc 3/n test_list_column

Differential Revision: D37492647

fbshipit-source-id: 962bce3f41616f30793f509317f62889f0787ef0
---
 torcharrow/icolumn.py                   |  4 ++--
 torcharrow/test/test_list_column.py     | 17 +++++++++++++++++
 torcharrow/test/test_list_column_cpu.py |  3 +++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/torcharrow/icolumn.py b/torcharrow/icolumn.py
index dc6c1ff34..03ef09123 100644
--- a/torcharrow/icolumn.py
+++ b/torcharrow/icolumn.py
@@ -254,8 +254,8 @@ def cast(self, dtype):
                         res._append_value(fun(i))
                 return res._finalize()
             else:
-                raise TypeError('f"{astype}({dtype}) is not supported")')
-        raise TypeError('f"{astype} for {type(self).__name__} is not supported")')
+                raise TypeError(f"{dtype} for {type(self).__name__} is not supported")
+        raise TypeError(f"{self.dtype} for {type(self).__name__} is not supported")
 
     # public simple observers -------------------------------------------------
 
diff --git a/torcharrow/test/test_list_column.py b/torcharrow/test/test_list_column.py
index c81c1efa7..b18db0b52 100644
--- a/torcharrow/test/test_list_column.py
+++ b/torcharrow/test/test_list_column.py
@@ -208,6 +208,23 @@ def base_test_fixed_size_list(self):
             f"Unexpected failure reason: {str(ex.exception)}",
         )
 
+    def base_test_cast(self):
+        list_dtype = dt.List(item_dtype=dt.int64, fixed_size=2)
+        c_list = ta.column(
+            [[1, 2], [3, 4]],
+            dtype=list_dtype,
+            device=self.device,
+        )
+
+        int_dtype = dt.int64
+        # TODO: Nested cast should be supported in the future
+        for arg in (int_dtype, list_dtype):
+            with self.assertRaisesRegexp(
+                expected_exception=TypeError,
+                expected_regex=r"List\(int64, fixed_size=2\) for.*is not supported",
+            ):
+                c_list.cast(arg)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torcharrow/test/test_list_column_cpu.py b/torcharrow/test/test_list_column_cpu.py
index bd774bc7f..72db1bae7 100644
--- a/torcharrow/test/test_list_column_cpu.py
+++ b/torcharrow/test/test_list_column_cpu.py
@@ -46,6 +46,9 @@ def test_map_reduce_etc(self):
     def test_fixed_size_list(self):
         self.base_test_fixed_size_list()
 
+    def test_cast(self):
+        self.base_test_cast()
+
 
 if __name__ == "__main__":
     unittest.main()

From a52ec8c08f1432ee6b4c9d34d00ae426c4541be7 Mon Sep 17 00:00:00 2001
From: Andrew Aikens <andrewaikens@fb.com>
Date: Wed, 29 Jun 2022 11:04:17 -0700
Subject: [PATCH 3/4] Enforce 90% cc for icolumn.py

Differential Revision: D37289878

fbshipit-source-id: de712533ae8efea297088ea3224bbb306f393d70
---
 torcharrow/icolumn.py                     |  12 +-
 torcharrow/test/test_string_column.py     | 221 ++++++++++++++++++++++
 torcharrow/test/test_string_column_cpu.py |  48 +++++
 torcharrow/velox_rt/string_column_cpu.py  |  17 --
 4 files changed, 275 insertions(+), 23 deletions(-)

diff --git a/torcharrow/icolumn.py b/torcharrow/icolumn.py
index 03ef09123..158fe6907 100644
--- a/torcharrow/icolumn.py
+++ b/torcharrow/icolumn.py
@@ -705,7 +705,7 @@ def filter(
         dtype: boolean, length: 2, null_count: 0
         """
         if columns is not None:
-            raise TypeError(f"columns parameter for flat columns not supported")
+            raise TypeError("columns parameter for flat columns not supported")
 
         if not isinstance(predicate, ty.Iterable) and not callable(predicate):
             raise TypeError(
@@ -1006,8 +1006,6 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
         """
         self._prototype_support_warning("fill_null")
 
-        if not isinstance(fill_value, Column._scalar_types):
-            raise TypeError(f"fill_null with {type(fill_value)} is not supported")
         if isinstance(fill_value, Column._scalar_types):
             res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
             for m, i in self._items():
@@ -1017,7 +1015,9 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
                     res._append_value(fill_value)
             return res._finalize()
         else:
-            raise TypeError(f"fill_null with {type(fill_value)} is not supported")
+            raise TypeError(
+                f"fill_null with {type(fill_value).__name__} is not supported"
+            )
 
     @trace
     @expression
@@ -1050,7 +1050,7 @@ def drop_null(self, how: ty.Literal["any", "all", None] = None):
 
         if how is not None:
             # "any or "all" is only used for DataFrame
-            raise TypeError(f"how parameter for flat columns not supported")
+            raise TypeError("how parameter for flat columns not supported")
 
         if dt.is_primitive(self.dtype):
             res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
@@ -1076,7 +1076,7 @@ def drop_duplicates(
         # TODO Add functionality for first and last
         assert keep == "first"
         if subset is not None:
-            raise TypeError(f"subset parameter for flat columns not supported")
+            raise TypeError("subset parameter for flat columns not supported")
         res = Scope._EmptyColumn(self._dtype)
         res._extend(list(OrderedDict.fromkeys(self)))
         return res._finalize()
diff --git a/torcharrow/test/test_string_column.py b/torcharrow/test/test_string_column.py
index d254685cc..171966d38 100644
--- a/torcharrow/test/test_string_column.py
+++ b/torcharrow/test/test_string_column.py
@@ -6,6 +6,7 @@
 
 import typing as ty
 import unittest
+from unittest import mock
 
 import torcharrow as ta
 import torcharrow.dtypes as dt
@@ -345,6 +346,226 @@ def base_test_regular_expressions(self):
             ],
         )
 
+    def base_test_is_unique(self):
+        unique_column = ta.column(
+            [f"test{x}" for x in range(3)],
+            device=self.device,
+        )
+
+        self.assertTrue(unique_column.is_unique)
+
+        non_unique_column = ta.column(
+            [
+                "test",
+                "test",
+            ],
+            device=self.device,
+        )
+
+        self.assertFalse(non_unique_column.is_unique)
+
+    def base_test_is_monotonic_increasing(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+        self.assertTrue(c.is_monotonic_increasing)
+        self.assertFalse(c.is_monotonic_decreasing)
+
+    def base_test_is_monotonic_decreasing(self):
+        c = ta.column([f"test{x}" for x in range(5, 0, -1)], device=self.device)
+        self.assertFalse(c.is_monotonic_increasing)
+        self.assertTrue(c.is_monotonic_decreasing)
+
+    def base_test_if_else(self):
+        left_repr = ["a1", "a2", "a3", "a4"]
+        right_repr = ["b1", "b2", "b3", "b4"]
+        cond_repr = [True, False, True, False]
+        cond = ta.column(cond_repr, device=self.device)
+        left = ta.column(left_repr, device=self.device)
+        right = ta.column(right_repr, device=self.device)
+
+        # Ensure py-iterables work as intended
+        expected = [left_repr[0], right_repr[1], left_repr[2], right_repr[3]]
+        result = ta.if_else(cond, left_repr, right_repr)
+        self.assertEqual(expected, list(result))
+
+        # Non common d type
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="then and else branches must have compatible types, got.*and.*, respectively",
+        ), mock.patch("torcharrow.icolumn.dt.common_dtype") as mock_common_dtype:
+            mock_common_dtype.return_value = None
+
+            ta.if_else(cond, left, right)
+
+            mock_common_dtype.assert_called_once_with(
+                left.dtype,
+                right.dtype,
+            )
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="then and else branches must have compatible types, got.*and.*, respectively",
+        ), mock.patch(
+            "torcharrow.icolumn.dt.common_dtype"
+        ) as mock_common_dtype, mock.patch(
+            "torcharrow.icolumn.dt.is_void"
+        ) as mock_is_void:
+            mock_is_void.return_value = True
+            mock_common_dtype.return_value = dt.int64
+
+            ta.if_else(cond, left, right)
+            mock_common_dtype.assert_called_once_with(
+                left.dtype,
+                right.dtype,
+            )
+            mock_is_void.assert_called_once_with(mock_common_dtype.return_value)
+
+        # Invalid condition input
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="condition must be a boolean vector",
+        ):
+            ta.if_else(
+                cond=left,
+                left=left,
+                right=right,
+            )
+
+    def base_test_str(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+        c.id = 321
+
+        expected = "Column(['test0', 'test1', 'test2', 'test3', 'test4'], id = 321)"
+        self.assertEqual(expected, str(c))
+
+    def base_test_repr(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+
+        expected = (
+            "0  'test0'\n"
+            "1  'test1'\n"
+            "2  'test2'\n"
+            "3  'test3'\n"
+            "4  'test4'\n"
+            f"dtype: string, length: 5, null_count: 0, device: {self.device}"
+        )
+        self.assertEqual(expected, repr(c))
+
+    def base_test_is_valid_at(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+
+        # Normal access
+        self.assertTrue(all(c.is_valid_at(x) for x in range(5)))
+
+        # Negative access
+        self.assertTrue(c.is_valid_at(-1))
+
+        # Out of bounds access
+        with self.assertRaises(expected_exception=RuntimeError):
+            self.assertFalse(c.is_valid_at(10))
+
+    def base_test_cast(self):
+        c_repr = ["0", "1", "2", "3", "4", None]
+        c_repr_after_cast = [0, 1, 2, 3, 4, None]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.cast(dt.int64)
+        self.assertEqual(c_repr_after_cast, list(result))
+
+    def base_test_drop_null(self):
+        c_repr = ["0", "1", "2", "3", "4", None]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.drop_null()
+
+        self.assertEqual(c_repr[:-1], list(result))
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="how parameter for flat columns not supported",
+        ):
+            c.drop_null(how="any")
+
+    def base_test_drop_duplicates(self):
+        c_repr = ["test", "test2", "test3", "test"]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.drop_duplicates()
+
+        self.assertEqual(c_repr[:-1], list(result))
+
+        # TODO: Add functionality for last
+        with self.assertRaises(expected_exception=AssertionError):
+            c.drop_duplicates(keep="last")
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="subset parameter for flat columns not supported",
+        ):
+            c.drop_duplicates(subset=c_repr[:2])
+
+    def base_test_fill_null(self):
+        c_repr = ["0", "1", None, "3", "4", None]
+        expected_fill = "TEST"
+        expected_repr = ["0", "1", expected_fill, "3", "4", expected_fill]
+        c = ta.column(c_repr, device=self.device)
+
+        result = c.fill_null(expected_fill)
+
+        self.assertEqual(expected_repr, list(result))
+
+        with self.assertRaisesRegex(
+            expected_exception=TypeError,
+            expected_regex="fill_null with bytes is not supported",
+        ):
+            c.fill_null(expected_fill.encode())
+
+    def base_test_isin(self):
+        c_repr = [f"test{x}" for x in range(5)]
+        c = ta.column(c_repr, device=self.device)
+        self.assertTrue(all(c.isin(values=c_repr + ["test_123"])))
+        self.assertFalse(any(c.isin(values=["test5", "test6", "test7"])))
+
+    def base_test_bool(self):
+        c = ta.column([f"test{x}" for x in range(5)], device=self.device)
+        with self.assertRaisesRegex(
+            expected_exception=ValueError,
+            expected_regex=r"The truth value of a.*is ambiguous. Use a.any\(\) or a.all\(\).",
+        ):
+            bool(c)
+
+    def base_test_flatmap(self):
+        c = ta.column(["test1", "test2", None, None, "test3"], device=self.device)
+        expected_result = [
+            "test1",
+            "test1",
+            "test2",
+            "test2",
+            None,
+            None,
+            None,
+            None,
+            "test3",
+            "test3",
+        ]
+        result = c.flatmap(lambda xs: [xs, xs])
+        self.assertEqual(expected_result, list(result))
+
+    def base_test_any(self):
+        c_some = ta.column(["test1", "test2", None, None, "test3"], device=self.device)
+        c_none = ta.column([], dtype=dt.string, device=self.device)
+        c_none = c_none.append([None])
+        self.assertTrue(any(c_some))
+        self.assertFalse(any(c_none))
+
+    def base_test_all(self):
+        c_all = ta.column(["test", "test2", "test3"], device=self.device)
+        c_partial = ta.column(["test", "test2", None, None], device=self.device)
+        c_none = ta.column([], dtype=dt.string, device=self.device)
+        c_none = c_none.append([None])
+        self.assertTrue(all(c_all))
+        self.assertFalse(all(c_partial))
+        self.assertFalse(all(c_none))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torcharrow/test/test_string_column_cpu.py b/torcharrow/test/test_string_column_cpu.py
index e58f38e55..b32d7a6e0 100644
--- a/torcharrow/test/test_string_column_cpu.py
+++ b/torcharrow/test/test_string_column_cpu.py
@@ -57,6 +57,54 @@ def test_string_pattern_matching_methods(self):
     def test_regular_expressions(self):
         self.base_test_regular_expressions()
 
+    def test_is_unique(self):
+        self.base_test_is_unique()
+
+    def test_is_monotonic_increasing(self):
+        self.base_test_is_monotonic_increasing()
+
+    def test_is_monotonic_decreasing(self):
+        self.base_test_is_monotonic_decreasing()
+
+    def test_if_else(self):
+        self.base_test_if_else()
+
+    def test_repr(self):
+        self.base_test_repr()
+
+    def test_str(self):
+        self.base_test_str()
+
+    def test_is_valid_at(self):
+        self.base_test_is_valid_at()
+
+    def test_cast(self):
+        self.base_test_cast()
+
+    def test_drop_null(self):
+        self.base_test_drop_null()
+
+    def test_drop_duplicates(self):
+        self.base_test_drop_duplicates()
+
+    def test_fill_null(self):
+        self.base_test_fill_null()
+
+    def test_isin(self):
+        self.base_test_isin()
+
+    def test_bool(self):
+        self.base_test_bool()
+
+    def test_flatmap(self):
+        self.base_test_flatmap()
+
+    def test_any(self):
+        self.base_test_any()
+
+    def test_all(self):
+        self.base_test_all()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torcharrow/velox_rt/string_column_cpu.py b/torcharrow/velox_rt/string_column_cpu.py
index 8a32512b9..23576e8a4 100644
--- a/torcharrow/velox_rt/string_column_cpu.py
+++ b/torcharrow/velox_rt/string_column_cpu.py
@@ -187,23 +187,6 @@ def __gt__(self, other):
     def __ge__(self, other):
         return self._checked_binary_op_call(other, "gte")
 
-    # printing ----------------------------------------------------------------
-
-    def __str__(self):
-        def quote(x):
-            return f"'{x}'"
-
-        return f"Column([{', '.join('None' if i is None else quote(i) for i in self)}])"
-
-    def __repr__(self):
-        tab = tabulate(
-            [["None" if i is None else f"'{i}'"] for i in self],
-            tablefmt="plain",
-            showindex=True,
-        )
-        typ = f"dtype: {self.dtype}, length: {self.length}, null_count: {self.null_count}, device: cpu"
-        return tab + dt.NL + typ
-
     # interop
     def _to_tensor_default(self):
         # there are no string tensors, so we're using regular python list conversion

From 224a4dfc580ce421c1571101fb62ba4ae7dfc49f Mon Sep 17 00:00:00 2001
From: andrewaikens87 <16356240+andrewaikens87@users.noreply.github.com>
Date: Wed, 29 Jun 2022 11:07:26 -0700
Subject: [PATCH 4/4] Remove unused icolumn.py private methods

Summary: Removes unused private methods in icolumn.py bringing code coverage to 79.2%.

Differential Revision: D37503197

fbshipit-source-id: 2508872b42700a91f83c309f0b6160e2dd0c534e
---
 torcharrow/icolumn.py | 37 -------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/torcharrow/icolumn.py b/torcharrow/icolumn.py
index 158fe6907..34f76c7ca 100644
--- a/torcharrow/icolumn.py
+++ b/torcharrow/icolumn.py
@@ -1417,40 +1417,3 @@ def _to_tensor_default(self):
     def _count(self):
         """Return number of non-NA/null observations pgf the column/frame"""
         return len(self) - self.null_count
-
-    @trace
-    @expression
-    def _nlargest(
-        self,
-        n=5,
-        columns: ty.Optional[ty.List[str]] = None,
-        keep: ty.Literal["last", "first"] = "first",
-    ):
-        """Returns a new data of the *n* largest element."""
-        # keep="all" not supported
-        if columns is not None:
-            raise TypeError(
-                "computing n-largest on non-structured column can't have 'columns' parameter"
-            )
-        return self.sort(ascending=False).head(n)
-
-    @trace
-    @expression
-    def _nsmallest(self, n=5, columns: ty.Optional[ty.List[str]] = None, keep="first"):
-        """Returns a new data of the *n* smallest element."""
-        # keep="all" not supported
-        if columns is not None:
-            raise TypeError(
-                "computing n-smallest on non-structured column can't have 'columns' parameter"
-            )
-
-        return self.sort(ascending=True).head(n)
-
-    @trace
-    @expression
-    def _nunique(self, drop_null=True):
-        """Returns the number of unique values of the column"""
-        if not drop_null:
-            return len(set(self))
-        else:
-            return len(set(i for i in self if i is not None))