Skip to content
This repository was archived by the owner on Nov 1, 2024. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 9 additions & 35 deletions torcharrow/icolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,9 @@ def __len__(self):

def __str__(self):
item_padding = "'" if dt.is_string(self.dtype) else ""
return f"Column([{', '.join(f'{item_padding}{i}{item_padding}' for i in self)}], id = {self.id})"
return (
f"Column([{', '.join(f'{item_padding}{i}{item_padding}' for i in self)}])"
)

def __repr__(self):
item_padding = "'" if dt.is_string(self.dtype) else ""
Expand Down Expand Up @@ -705,7 +707,7 @@ def filter(
dtype: boolean, length: 2, null_count: 0
"""
if columns is not None:
raise TypeError(f"columns parameter for flat columns not supported")
raise TypeError("columns parameter for flat columns not supported")

if not isinstance(predicate, ty.Iterable) and not callable(predicate):
raise TypeError(
Expand Down Expand Up @@ -1006,8 +1008,6 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
"""
self._prototype_support_warning("fill_null")

if not isinstance(fill_value, Column._scalar_types):
raise TypeError(f"fill_null with {type(fill_value)} is not supported")
if isinstance(fill_value, Column._scalar_types):
res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
for m, i in self._items():
Expand All @@ -1017,7 +1017,9 @@ def fill_null(self, fill_value: ty.Union[dt.ScalarTypes, ty.Dict]):
res._append_value(fill_value)
return res._finalize()
else:
raise TypeError(f"fill_null with {type(fill_value)} is not supported")
raise TypeError(
f"fill_null with {type(fill_value).__name__} is not supported"
)

@trace
@expression
Expand Down Expand Up @@ -1050,7 +1052,7 @@ def drop_null(self, how: ty.Literal["any", "all", None] = None):

if how is not None:
# "any or "all" is only used for DataFrame
raise TypeError(f"how parameter for flat columns not supported")
raise TypeError("how parameter for flat columns not supported")

if dt.is_primitive(self.dtype):
res = Scope._EmptyColumn(self.dtype.constructor(nullable=False))
Expand All @@ -1076,7 +1078,7 @@ def drop_duplicates(
# TODO Add functionality for first and last
assert keep == "first"
if subset is not None:
raise TypeError(f"subset parameter for flat columns not supported")
raise TypeError("subset parameter for flat columns not supported")
res = Scope._EmptyColumn(self._dtype)
res._extend(list(OrderedDict.fromkeys(self)))
return res._finalize()
Expand Down Expand Up @@ -1418,34 +1420,6 @@ def _count(self):
"""Return number of non-NA/null observations pgf the column/frame"""
return len(self) - self.null_count

@trace
@expression
def _nlargest(
    self,
    n=5,
    columns: ty.Optional[ty.List[str]] = None,
    keep: ty.Literal["last", "first"] = "first",
):
    """Return a new column holding the *n* largest elements.

    Note: keep="all" is not supported.
    """
    if columns is not None:
        # 'columns' only applies to structured (DataFrame-like) columns.
        raise TypeError(
            "computing n-largest on non-structured column can't have 'columns' parameter"
        )
    # Sort descending, then take the leading n entries.
    ordered = self.sort(ascending=False)
    return ordered.head(n)

@trace
@expression
def _nsmallest(self, n=5, columns: ty.Optional[ty.List[str]] = None, keep="first"):
    """Return a new column holding the *n* smallest elements.

    Note: keep="all" is not supported.
    """
    if columns is not None:
        # 'columns' only applies to structured (DataFrame-like) columns.
        raise TypeError(
            "computing n-smallest on non-structured column can't have 'columns' parameter"
        )
    # Sort ascending, then take the leading n entries.
    ordered = self.sort(ascending=True)
    return ordered.head(n)

@trace
@expression
def _nunique(self, drop_null=True):
Expand Down
3 changes: 1 addition & 2 deletions torcharrow/test/test_numerical_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,9 +839,8 @@ def base_test_batch_collate(self):

def base_test_str(self):
c = ta.column(list(range(5)), device=self.device)
c.id = 123

expected = "Column([0, 1, 2, 3, 4], id = 123)"
expected = "Column([0, 1, 2, 3, 4])"
self.assertEqual(expected, str(c))

def base_test_repr(self):
Expand Down
189 changes: 189 additions & 0 deletions torcharrow/test/test_string_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,195 @@ def base_test_regular_expressions(self):
],
)

def base_test_is_unique(self):
    """is_unique is True iff the column contains no duplicate values."""
    # Three distinct strings -> unique.
    distinct = ta.column(
        ["test0", "test1", "test2"],
        device=self.device,
    )

    self.assertTrue(distinct.is_unique)

    # A repeated value -> not unique.
    repeated = ta.column(
        [
            "test",
            "test",
        ],
        device=self.device,
    )

    self.assertFalse(repeated.is_unique)

def base_test_is_monotonic_increasing(self):
    """An ascending run of strings is increasing and not decreasing."""
    ascending = ta.column(
        ["test0", "test1", "test2", "test3", "test4"], device=self.device
    )
    self.assertTrue(ascending.is_monotonic_increasing)
    self.assertFalse(ascending.is_monotonic_decreasing)

def base_test_is_monotonic_decreasing(self):
    """A descending run of strings is decreasing and not increasing."""
    descending = ta.column(
        ["test5", "test4", "test3", "test2", "test1"], device=self.device
    )
    self.assertFalse(descending.is_monotonic_increasing)
    self.assertTrue(descending.is_monotonic_decreasing)

def base_test_if_else(self):
    """if_else selects elementwise from left/right by a boolean condition,
    and rejects incompatible branch dtypes and non-boolean conditions."""
    left_repr = ["a1", "a2", "a3", "a4"]
    right_repr = ["b1", "b2", "b3", "b4"]
    # Deliberately incompatible dtype for the type-mismatch case below.
    float_type = ta.column(
        [1.22, 2.22, 3.22, 4.22], dtype=dt.float32, device=self.device
    )
    cond_repr = [True, False, True, False]
    cond = ta.column(cond_repr, device=self.device)
    left = ta.column(left_repr, device=self.device)
    right = ta.column(right_repr, device=self.device)

    # Ensure py-iterables work as intended
    expected = [left_repr[0], right_repr[1], left_repr[2], right_repr[3]]
    result = ta.if_else(cond, left_repr, right_repr)
    self.assertEqual(expected, list(result))

    # Non common dtype
    with self.assertRaisesRegex(
        expected_exception=TypeError,
        expected_regex="then and else branches must have compatible types, got.*and.*, respectively",
    ):
        ta.if_else(cond, left, float_type)

    # Invalid condition input
    with self.assertRaisesRegex(
        expected_exception=TypeError,
        expected_regex="condition must be a boolean vector",
    ):
        ta.if_else(
            cond=left,
            left=left,
            right=right,
        )

def base_test_str(self):
    """str() renders the values as a quoted, comma-separated Column(...) literal."""
    col = ta.column([f"test{i}" for i in range(5)], device=self.device)

    self.assertEqual(
        "Column(['test0', 'test1', 'test2', 'test3', 'test4'])", str(col)
    )

def base_test_repr(self):
    """repr() renders one indexed row per value plus a dtype/length/null_count/device footer."""
    c = ta.column([f"test{x}" for x in range(5)], device=self.device)

    expected = (
        "0 'test0'\n"
        "1 'test1'\n"
        "2 'test2'\n"
        "3 'test3'\n"
        "4 'test4'\n"
        f"dtype: string, length: 5, null_count: 0, device: {self.device}"
    )
    self.assertEqual(expected, repr(c))

def base_test_is_valid_at(self):
    """Every position of a fully populated (no-null) column is valid."""
    col = ta.column([f"test{i}" for i in range(5)], device=self.device)

    self.assertTrue(all(col.is_valid_at(idx) for idx in range(5)))

def base_test_cast(self):
    """Casting a digit-string column to int64 preserves values and nulls."""
    source = ["0", "1", "2", "3", "4", None]
    expected = [0, 1, 2, 3, 4, None]
    col = ta.column(source, device=self.device)

    self.assertEqual(expected, list(col.cast(dt.int64)))

def base_test_drop_null(self):
    """drop_null removes the trailing None; 'how' is rejected for flat columns."""
    values = ["0", "1", "2", "3", "4", None]
    col = ta.column(values, device=self.device)

    self.assertEqual(values[:-1], list(col.drop_null()))

    # 'how' only applies to DataFrames, not flat columns.
    with self.assertRaisesRegex(
        expected_exception=TypeError,
        expected_regex="how parameter for flat columns not supported",
    ):
        col.drop_null(how="any")

def base_test_drop_duplicates(self):
    """drop_duplicates keeps first occurrences; keep='last' and subset are unsupported."""
    values = ["test", "test2", "test3", "test"]
    col = ta.column(values, device=self.device)

    self.assertEqual(values[:-1], list(col.drop_duplicates()))

    # TODO: Add functionality for last
    with self.assertRaises(expected_exception=AssertionError):
        col.drop_duplicates(keep="last")

    # 'subset' only applies to structured columns.
    with self.assertRaisesRegex(
        expected_exception=TypeError,
        expected_regex="subset parameter for flat columns not supported",
    ):
        col.drop_duplicates(subset=values[:2])

def base_test_fill_null(self):
    """fill_null replaces Nones with the scalar; a bytes fill raises TypeError."""
    fill = "TEST"
    col = ta.column(["0", "1", None, "3", "4", None], device=self.device)

    filled = col.fill_null(fill)

    self.assertEqual(["0", "1", fill, "3", "4", fill], list(filled))

    # Non-scalar-compatible fill values are rejected by type name.
    with self.assertRaisesRegex(
        expected_exception=TypeError,
        expected_regex="fill_null with bytes is not supported",
    ):
        col.fill_null(fill.encode())

def base_test_isin(self):
    """isin is elementwise membership against the supplied values."""
    values = [f"test{i}" for i in range(5)]
    col = ta.column(values, device=self.device)
    # Superset of the column -> every element matches.
    self.assertTrue(all(col.isin(values=values + ["test_123"])))
    # Disjoint set -> no element matches.
    self.assertFalse(any(col.isin(values=["test5", "test6", "test7"])))

def base_test_bool(self):
    """Truth-testing a whole column is ambiguous and must raise ValueError."""
    c = ta.column([f"test{x}" for x in range(5)], device=self.device)
    with self.assertRaisesRegex(
        expected_exception=ValueError,
        expected_regex=r"The truth value of a.*is ambiguous. Use a.any\(\) or a.all\(\).",
    ):
        bool(c)

def base_test_flatmap(self):
    """flatmap maps every entry (nulls included) to two copies of itself."""
    values = ["test1", "test2", None, None, "test3"]
    col = ta.column(values, device=self.device)
    # Each input value appears twice, in order.
    doubled = [v for v in values for _ in range(2)]
    result = col.flatmap(lambda xs: [xs, xs])
    self.assertEqual(doubled, list(result))

def base_test_any(self):
    """any() is True when at least one non-null entry exists."""
    mixed = ta.column(["test1", "test2", None, None, "test3"], device=self.device)
    only_null = ta.column([], dtype=dt.string, device=self.device).append([None])
    self.assertTrue(mixed.any())
    self.assertFalse(only_null.any())

def base_test_all(self):
    """all() ignores nulls; even an all-null column is vacuously True."""
    full = ta.column(["test", "test2", "test3"], device=self.device)
    with_nulls = ta.column(["test", "test2", None, None], device=self.device)
    only_null = ta.column([], dtype=dt.string, device=self.device).append([None])
    self.assertTrue(full.all())
    self.assertTrue(with_nulls.all())
    self.assertTrue(only_null.all())


if __name__ == "__main__":
unittest.main()
48 changes: 48 additions & 0 deletions torcharrow/test/test_string_column_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,54 @@ def test_string_pattern_matching_methods(self):
def test_regular_expressions(self):
self.base_test_regular_expressions()

def test_is_unique(self):
    """Run the shared is_unique test on this device."""
    self.base_test_is_unique()

def test_is_monotonic_increasing(self):
    """Run the shared is_monotonic_increasing test on this device."""
    self.base_test_is_monotonic_increasing()

def test_is_monotonic_decreasing(self):
    """Run the shared is_monotonic_decreasing test on this device."""
    self.base_test_is_monotonic_decreasing()

def test_if_else(self):
    """Run the shared if_else test on this device."""
    self.base_test_if_else()

def test_repr(self):
    """Run the shared repr test on this device."""
    self.base_test_repr()

def test_str(self):
    """Run the shared str test on this device."""
    self.base_test_str()

def test_is_valid_at(self):
    """Run the shared is_valid_at test on this device."""
    self.base_test_is_valid_at()

def test_cast(self):
    """Run the shared cast test on this device."""
    self.base_test_cast()

def test_drop_null(self):
    """Run the shared drop_null test on this device."""
    self.base_test_drop_null()

def test_drop_duplicates(self):
    """Run the shared drop_duplicates test on this device."""
    self.base_test_drop_duplicates()

def test_fill_null(self):
    """Run the shared fill_null test on this device."""
    self.base_test_fill_null()

def test_isin(self):
    """Run the shared isin test on this device."""
    self.base_test_isin()

def test_bool(self):
    """Run the shared bool test on this device."""
    self.base_test_bool()

def test_flatmap(self):
    """Run the shared flatmap test on this device."""
    self.base_test_flatmap()

def test_any(self):
    """Run the shared any test on this device."""
    self.base_test_any()

def test_all(self):
    """Run the shared all test on this device."""
    self.base_test_all()


if __name__ == "__main__":
unittest.main()
23 changes: 0 additions & 23 deletions torcharrow/velox_rt/dataframe_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,29 +617,6 @@ def func(tup):
res._extend([None] * self.null_count)
return res._finalize()

@trace
@expression
def _nlargest(
    self,
    n=5,
    columns: Optional[List[str]] = None,
    keep="first",
):
    """Return a new dataframe of the *n* largest rows, ordered by *columns*."""
    # Todo add keep arg
    descending = self.sort(by=columns, ascending=False)
    return descending.head(n)

@trace
@expression
def _nsmallest(
    self,
    n=5,
    columns: Optional[List[str]] = None,
    keep="first",
):
    """Return a new dataframe of the *n* smallest rows, ordered by *columns*."""
    # NOTE(review): 'keep' is currently unused here — confirm intended.
    ascending_rows = self.sort(by=columns, ascending=True)
    return ascending_rows.head(n)

# operators --------------------------------------------------------------

@expression
Expand Down
Loading