Skip to content
This repository was archived by the owner on Nov 1, 2024. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions torcharrow/icolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,9 @@ def __len__(self):

def __str__(self):
    """Return a one-line ``Column([...])`` rendering of this column's items.

    Items of a string column are wrapped in single quotes. Null items are
    rendered as a bare ``None`` so they cannot be confused with the
    string ``'None'``. The column id is not part of the output.
    """
    quote = "'" if dt.is_string(self.dtype) else ""
    rendered = ", ".join(
        "None" if item is None else f"{quote}{item}{quote}" for item in self
    )
    return f"Column([{rendered}])"

def __repr__(self):
item_padding = "'" if dt.is_string(self.dtype) else ""
Expand Down Expand Up @@ -705,7 +707,7 @@ def filter(
dtype: boolean, length: 2, null_count: 0
"""
if columns is not None:
raise TypeError(f"columns parameter for flat columns not supported")
raise TypeError("columns parameter for flat columns not supported")

if not isinstance(predicate, ty.Iterable) and not callable(predicate):
raise TypeError(
Expand Down Expand Up @@ -1006,8 +1008,6 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
"""
self._prototype_support_warning("fill_null")

if not isinstance(fill_value, Column._scalar_types):
raise TypeError(f"fill_null with {type(fill_value)} is not supported")
if isinstance(fill_value, Column._scalar_types):
res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
for m, i in self._items():
Expand All @@ -1017,7 +1017,9 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
res._append_value(fill_value)
return res._finalize()
else:
raise TypeError(f"fill_null with {type(fill_value)} is not supported")
raise TypeError(
f"fill_null with {type(fill_value).__name__} is not supported"
)

@trace
@expression
Expand Down Expand Up @@ -1050,7 +1052,7 @@ def drop_null(self, how: ty.Literal["any", "all", None] = None):

if how is not None:
# "any or "all" is only used for DataFrame
raise TypeError(f"how parameter for flat columns not supported")
raise TypeError("how parameter for flat columns not supported")

if dt.is_primitive(self.dtype):
res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
Expand All @@ -1076,7 +1078,7 @@ def drop_duplicates(
# TODO Add functionality for first and last
assert keep == "first"
if subset is not None:
raise TypeError(f"subset parameter for flat columns not supported")
raise TypeError("subset parameter for flat columns not supported")
res = Scope._EmptyColumn(self._dtype)
res._extend(list(OrderedDict.fromkeys(self)))
return res._finalize()
Expand Down
3 changes: 1 addition & 2 deletions torcharrow/test/test_numerical_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,9 +839,8 @@ def base_test_batch_collate(self):

def base_test_str(self):
    """Check that ``str()`` of a numeric column lists its items without an id."""
    c = ta.column(list(range(5)), device=self.device)

    # Only one expectation is kept: the rendering without the id suffix.
    expected = "Column([0, 1, 2, 3, 4])"
    self.assertEqual(expected, str(c))

def base_test_repr(self):
Expand Down
189 changes: 189 additions & 0 deletions torcharrow/test/test_string_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,195 @@ def base_test_regular_expressions(self):
],
)

def base_test_is_unique(self):
    """Check ``is_unique`` on distinct strings and on a column with a repeat."""
    distinct_values = [f"test{idx}" for idx in range(3)]
    distinct = ta.column(distinct_values, device=self.device)
    self.assertTrue(distinct.is_unique)

    repeated = ta.column(["test", "test"], device=self.device)
    self.assertFalse(repeated.is_unique)

def base_test_is_monotonic_increasing(self):
    """Check both monotonic flags on the strings ``test0`` .. ``test4``."""
    ascending_values = [f"test{idx}" for idx in range(5)]
    ascending = ta.column(ascending_values, device=self.device)

    self.assertTrue(ascending.is_monotonic_increasing)
    self.assertFalse(ascending.is_monotonic_decreasing)

def base_test_is_monotonic_decreasing(self):
    """Check both monotonic flags on the strings ``test5`` down to ``test1``."""
    descending_values = [f"test{idx}" for idx in range(5, 0, -1)]
    descending = ta.column(descending_values, device=self.device)

    self.assertFalse(descending.is_monotonic_increasing)
    self.assertTrue(descending.is_monotonic_decreasing)

def base_test_if_else(self):
    """Exercise ``ta.if_else`` with string branches.

    Covers three cases: plain Python lists as branches, branches whose
    dtypes are not compatible (string vs. float32), and a condition that
    is not a boolean column.
    """
    then_values = ["a1", "a2", "a3", "a4"]
    else_values = ["b1", "b2", "b3", "b4"]
    floats = ta.column(
        [1.22, 2.22, 3.22, 4.22], dtype=dt.float32, device=self.device
    )
    flags = [True, False, True, False]
    cond = ta.column(flags, device=self.device)
    left = ta.column(then_values, device=self.device)
    right = ta.column(else_values, device=self.device)

    # Ensure py-iterables work as intended
    picked = [
        then_item if flag else else_item
        for flag, then_item, else_item in zip(flags, then_values, else_values)
    ]
    self.assertEqual(picked, list(ta.if_else(cond, then_values, else_values)))

    # Non common dtype
    with self.assertRaisesRegex(
        TypeError,
        "then and else branches must have compatible types, got.*and.*, respectively",
    ):
        ta.if_else(cond, left, floats)

    # Invalid condition input
    with self.assertRaisesRegex(TypeError, "condition must be a boolean vector"):
        ta.if_else(cond=left, left=left, right=right)

def base_test_str(self):
    """Check that ``str()`` of a string column quotes each item."""
    column = ta.column([f"test{idx}" for idx in range(5)], device=self.device)

    self.assertEqual(
        "Column(['test0', 'test1', 'test2', 'test3', 'test4'])",
        str(column),
    )

def base_test_repr(self):
    """Check ``repr()`` of a string column: indexed rows plus a summary line."""
    column = ta.column([f"test{idx}" for idx in range(5)], device=self.device)

    # The summary line carries the device this test suite runs on.
    summary = f"dtype: string, length: 5, null_count: 0, device: {self.device}"
    rows = (
        "0 'test0'\n"
        "1 'test1'\n"
        "2 'test2'\n"
        "3 'test3'\n"
        "4 'test4'\n"
    )
    self.assertEqual(rows + summary, repr(column))

def base_test_is_valid_at(self):
    """Check that every position of a null-free string column is valid."""
    column = ta.column([f"test{idx}" for idx in range(5)], device=self.device)

    for position in range(5):
        self.assertTrue(column.is_valid_at(position))

def base_test_cast(self):
    """Check casting digit strings (with a trailing null) to ``int64``."""
    digits = ["0", "1", "2", "3", "4", None]
    as_ints = [0, 1, 2, 3, 4, None]

    casted = ta.column(digits, device=self.device).cast(dt.int64)

    self.assertEqual(as_ints, list(casted))

def base_test_drop_null(self):
    """Check that ``drop_null`` removes nulls and rejects the ``how`` argument."""
    values = ["0", "1", "2", "3", "4", None]
    column = ta.column(values, device=self.device)

    # Only the last entry is null, so everything before it must survive.
    self.assertEqual(values[:-1], list(column.drop_null()))

    with self.assertRaisesRegex(
        TypeError, "how parameter for flat columns not supported"
    ):
        column.drop_null(how="any")

def base_test_drop_duplicates(self):
    """Check ``drop_duplicates`` and its unsupported ``keep``/``subset`` options."""
    values = ["test", "test2", "test3", "test"]
    column = ta.column(values, device=self.device)

    # The only duplicate is the final "test", so the first three remain.
    self.assertEqual(values[:-1], list(column.drop_duplicates()))

    # TODO: Add functionality for last
    with self.assertRaises(AssertionError):
        column.drop_duplicates(keep="last")

    with self.assertRaisesRegex(
        TypeError, "subset parameter for flat columns not supported"
    ):
        column.drop_duplicates(subset=values[:2])

def base_test_fill_null(self):
    """Check ``fill_null`` with a string filler and with an unsupported bytes one."""
    filler = "TEST"
    values = ["0", "1", None, "3", "4", None]
    filled = [filler if item is None else item for item in values]
    column = ta.column(values, device=self.device)

    self.assertEqual(filled, list(column.fill_null(filler)))

    # A bytes fill value is expected to be rejected by type name.
    with self.assertRaisesRegex(TypeError, "fill_null with bytes is not supported"):
        column.fill_null(filler.encode())

def base_test_isin(self):
    """Check ``isin`` against a superset and against a disjoint value list."""
    values = [f"test{idx}" for idx in range(5)]
    column = ta.column(values, device=self.device)

    superset = values + ["test_123"]
    self.assertTrue(all(column.isin(values=superset)))

    disjoint = ["test5", "test6", "test7"]
    self.assertFalse(any(column.isin(values=disjoint)))

def base_test_bool(self):
    """Check that ``bool()`` of a column raises the ambiguity ``ValueError``."""
    column = ta.column([f"test{idx}" for idx in range(5)], device=self.device)

    ambiguity_message = (
        r"The truth value of a.*is ambiguous. Use a.any\(\) or a.all\(\)."
    )
    with self.assertRaisesRegex(ValueError, ambiguity_message):
        bool(column)

def base_test_flatmap(self):
    """Check that ``flatmap`` with a duplicating function doubles every item."""
    values = ["test1", "test2", None, None, "test3"]
    column = ta.column(values, device=self.device)

    # Each input item, nulls included, is expected twice in a row.
    doubled = [item for item in values for _ in range(2)]

    flattened = column.flatmap(lambda item: [item, item])
    self.assertEqual(doubled, list(flattened))

def base_test_any(self):
    """Check ``any()`` on a column with some values and on a single-null column."""
    with_values = ta.column(
        ["test1", "test2", None, None, "test3"], device=self.device
    )
    # Built by appending one null to an empty string column.
    only_null = ta.column([], dtype=dt.string, device=self.device).append([None])

    self.assertTrue(with_values.any())
    self.assertFalse(only_null.any())

def base_test_all(self):
    """Check ``all()`` on full, partially null, and single-null string columns."""
    full = ta.column(["test", "test2", "test3"], device=self.device)
    partly_null = ta.column(["test", "test2", None, None], device=self.device)
    # Built by appending one null to an empty string column.
    only_null = ta.column([], dtype=dt.string, device=self.device).append([None])

    # All three columns are expected to report True.
    self.assertTrue(full.all())
    self.assertTrue(partly_null.all())
    self.assertTrue(only_null.all())


if __name__ == "__main__":
unittest.main()
48 changes: 48 additions & 0 deletions torcharrow/test/test_string_column_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,54 @@ def test_string_pattern_matching_methods(self):
def test_regular_expressions(self):
    """Run the shared regular-expression test."""
    self.base_test_regular_expressions()

def test_is_unique(self):
    """Run the shared ``is_unique`` test."""
    self.base_test_is_unique()

def test_is_monotonic_increasing(self):
    """Run the shared ``is_monotonic_increasing`` test."""
    self.base_test_is_monotonic_increasing()

def test_is_monotonic_decreasing(self):
    """Run the shared ``is_monotonic_decreasing`` test."""
    self.base_test_is_monotonic_decreasing()

def test_if_else(self):
    """Run the shared ``if_else`` test."""
    self.base_test_if_else()

def test_repr(self):
    """Run the shared ``repr()`` test."""
    self.base_test_repr()

def test_str(self):
    """Run the shared ``str()`` test."""
    self.base_test_str()

def test_is_valid_at(self):
    """Run the shared ``is_valid_at`` test."""
    self.base_test_is_valid_at()

def test_cast(self):
    """Run the shared ``cast`` test."""
    self.base_test_cast()

def test_drop_null(self):
    """Run the shared ``drop_null`` test."""
    self.base_test_drop_null()

def test_drop_duplicates(self):
    """Run the shared ``drop_duplicates`` test."""
    self.base_test_drop_duplicates()

def test_fill_null(self):
    """Run the shared ``fill_null`` test."""
    self.base_test_fill_null()

def test_isin(self):
    """Run the shared ``isin`` test."""
    self.base_test_isin()

def test_bool(self):
    """Run the shared ``bool()`` test."""
    self.base_test_bool()

def test_flatmap(self):
    """Run the shared ``flatmap`` test."""
    self.base_test_flatmap()

def test_any(self):
    """Run the shared ``any()`` test."""
    self.base_test_any()

def test_all(self):
    """Run the shared ``all()`` test."""
    self.base_test_all()


if __name__ == "__main__":
unittest.main()
17 changes: 0 additions & 17 deletions torcharrow/velox_rt/string_column_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,23 +187,6 @@ def __gt__(self, other):
def __ge__(self, other):
    """Implement ``self >= other`` via the checked binary op named ``"gte"``."""
    return self._checked_binary_op_call(other, "gte")

# printing ----------------------------------------------------------------

def __str__(self):
    """Return a one-line ``Column([...])`` rendering of the items.

    Non-null items are wrapped in single quotes; null items are rendered
    as a bare ``None``.
    """
    rendered = ["None" if item is None else f"'{item}'" for item in self]
    return "Column([" + ", ".join(rendered) + "])"

def __repr__(self):
    """Return an indexed, one-item-per-row table followed by a summary line.

    Non-null items are single-quoted and nulls are shown as ``None``. The
    summary reports dtype, length and null count, with the device fixed
    to ``cpu``.
    """
    rows = [["None" if item is None else f"'{item}'"] for item in self]
    table = tabulate(rows, tablefmt="plain", showindex=True)
    summary = f"dtype: {self.dtype}, length: {self.length}, null_count: {self.null_count}, device: cpu"
    return table + dt.NL + summary

# interop
def _to_tensor_default(self):
# there are no string tensors, so we're using regular python list conversion
Expand Down