Add DataFrame tests (#2138)
zundertj committed Dec 23, 2021
1 parent 30c0f15 commit aaa5805
Showing 2 changed files with 235 additions and 19 deletions.
39 changes: 26 additions & 13 deletions py-polars/polars/internals/frame.py
@@ -2061,7 +2061,7 @@ def tail(self, length: int = 5) -> "DataFrame":
"""
return wrap_df(self._df.tail(length))

def drop_nulls(self, subset: Optional[List[str]] = None) -> "DataFrame":
def drop_nulls(self, subset: Optional[Union[str, List[str]]] = None) -> "DataFrame":
"""
Return a new DataFrame where the null values are dropped.
@@ -2155,7 +2155,7 @@ def drop_nulls(self, subset: Optional[List[str]] = None) -> "DataFrame":
└──────┴──────┘
"""
if subset is not None and isinstance(subset, str):
if isinstance(subset, str):
subset = [subset]
return wrap_df(self._df.drop_nulls(subset))
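
The widened `subset` annotation means a single column name can now be passed directly instead of wrapping it in a list. A minimal sketch of the intended usage, with illustrative data mirroring the new `test_drop_nulls` further down:

>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8]})
>>> df.drop_nulls("bar").shape
(2, 2)
>>> df.drop_nulls(["foo", "bar"]).shape
(2, 2)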

@@ -2730,6 +2730,22 @@ def with_column_renamed(self, existing_name: str, new_name: str) -> "DataFrame":
----------
existing_name
new_name

Examples
--------
>>> df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
>>> df.with_column_renamed("b", "c")
shape: (2, 2)
┌─────┬─────┐
│ a ┆ c │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 4 │
└─────┴─────┘
"""
return (
self.lazy()
@@ -2975,7 +2991,7 @@ def get_column(self, name: str) -> "pli.Series":
"""
return self[name]

def fill_null(self, strategy: Union[str, "pli.Expr"]) -> "DataFrame":
def fill_null(self, strategy: Union[str, "pli.Expr", Any]) -> "DataFrame":
"""
Fill None/missing values by a filling strategy or an Expression evaluation.
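
Adding `Any` to the accepted types lets a literal fill value be passed in addition to the existing strategy strings and expressions. A minimal sketch, mirroring the new `test_fill_null` in the test file below:

>>> df = pl.DataFrame({"a": [1, 2], "b": [3, None]})
>>> df.fill_null(4).frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
True
>>> df.fill_null("max").frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 3]}))
True
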
@@ -3339,7 +3355,7 @@ def max(self, axis: int = 0) -> Union["DataFrame", "pli.Series"]:
return wrap_df(self._df.max())
if axis == 1:
return pli.wrap_s(self._df.hmax())
raise ValueError("Axis should be 0 or 1.")
raise ValueError("Axis should be 0 or 1.") # pragma: no cover

def min(self, axis: int = 0) -> Union["DataFrame", "pli.Series"]:
"""
@@ -3369,7 +3385,7 @@ def min(self, axis: int = 0) -> Union["DataFrame", "pli.Series"]:
return wrap_df(self._df.min())
if axis == 1:
return pli.wrap_s(self._df.hmin())
raise ValueError("Axis should be 0 or 1.")
raise ValueError("Axis should be 0 or 1.") # pragma: no cover

def sum(
self, axis: int = 0, null_strategy: str = "ignore"
@@ -3409,7 +3425,7 @@ def sum(
return wrap_df(self._df.sum())
if axis == 1:
return pli.wrap_s(self._df.hsum(null_strategy))
raise ValueError("Axis should be 0 or 1.")
raise ValueError("Axis should be 0 or 1.") # pragma: no cover

def mean(
self, axis: int = 0, null_strategy: str = "ignore"
@@ -3449,7 +3465,7 @@ def mean(
return wrap_df(self._df.mean())
if axis == 1:
return pli.wrap_s(self._df.hmean(null_strategy))
raise ValueError("Axis should be 0 or 1.")
raise ValueError("Axis should be 0 or 1.") # pragma: no cover

def std(self) -> "DataFrame":
"""
@@ -3757,13 +3773,10 @@ def fold(
function that takes two `Series` and returns a `Series`.
"""
if self.width == 1:
return self.to_series(0)
df = self
acc = operation(df.to_series(0), df.to_series(1))
acc = self.to_series(0)

for i in range(2, df.width):
acc = operation(acc, df.to_series(i))
for i in range(1, self.width):
acc = operation(acc, self.to_series(i))
return acc

def row(self, index: int) -> Tuple[Any]:
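
The rewritten `fold` starts the accumulator at the first column and, for a single-column frame, returns that column unchanged. A minimal sketch of the behaviour (illustrative values only):

>>> df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
>>> df.fold(lambda s1, s2: s1 + s2).to_list()
[4, 6]
>>> df[["a"]].fold(lambda s1, s2: s1).series_equal(df["a"])
True
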
215 changes: 209 additions & 6 deletions py-polars/tests/test_df.py
@@ -311,6 +311,11 @@ def test_sort() -> None:
df.sort("a", in_place=True)
assert df.frame_equal(pl.DataFrame({"a": [1, 2, 3], "b": [2, 1, 3]}))

# test in-place + passing a list
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
df.sort(["a", "b"], in_place=True)
assert df.frame_equal(pl.DataFrame({"a": [1, 2, 3], "b": [2, 1, 3]}))


def test_replace() -> None:
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
@@ -339,16 +344,56 @@ def test_null_count() -> None:
assert df.null_count().shape == (1, 2)


def test_head_tail() -> None:
def test_head_tail_limit() -> None:
df = pl.DataFrame({"a": range(10), "b": range(10)})
assert df.head(5).height == 5
assert df.limit(5).height == 5
assert df.tail(5).height == 5

assert not df.head(5).frame_equal(df.tail(5))
# check that out-of-bounds lengths do not fail
assert df.head(100).height == 10
assert df.limit(100).height == 10
assert df.tail(100).height == 10

# limit is an alias of head
assert df.head(5).frame_equal(df.limit(5))


def test_drop_nulls() -> None:
df = pl.DataFrame(
{
"foo": [1, 2, 3],
"bar": [6, None, 8],
"ham": ["a", "b", "c"],
}
)

result = df.drop_nulls()
expected = pl.DataFrame(
{
"foo": [1, 3],
"bar": [6, 8],
"ham": ["a", "c"],
}
)
assert result.frame_equal(expected)

# below we only drop entries if they are null in the column 'foo'
result = df.drop_nulls("foo")
assert result.frame_equal(df)


def test_pipe() -> None:
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8]})

def _multiply(data: pl.DataFrame, mul: int) -> pl.DataFrame:
return data * mul

result = df.pipe(_multiply, mul=3)

assert result.frame_equal(df * 3)


def test_explode() -> None:
df = pl.DataFrame({"letters": ["c", "a"], "nrs": [[1, 2], [1, 3]]})
@@ -409,6 +454,39 @@ def test_groupby() -> None:
# check if this query runs and thus column names propagate
df.groupby("b").agg(pl.col("c").forward_fill()).explode("c")

# get a specific column
result = df.groupby("b")["a"].count()
assert result.shape == (2, 2)
assert result.columns == ["b", "a_count"]

# make sure all the methods below run
assert df.groupby("b").first().shape == (2, 3)
assert df.groupby("b").last().shape == (2, 3)
assert df.groupby("b").max().shape == (2, 3)
assert df.groupby("b").min().shape == (2, 3)
assert df.groupby("b").count().shape == (2, 3)
assert df.groupby("b").mean().shape == (2, 3)
assert df.groupby("b").n_unique().shape == (2, 3)
assert df.groupby("b").median().shape == (2, 3)
# assert df.groupby("b").quantile(0.5).shape == (2, 3)
assert df.groupby("b").agg_list().shape == (2, 3)


def test_pivot() -> None:
df = pl.DataFrame(
{
"a": [1, 2, 3, 4, 5],
"b": ["a", "a", "b", "b", "b"],
"c": [None, 1, None, 1, None],
}
)
gb = df.groupby("b").pivot("a", "c")
assert gb.first().shape == (2, 6)
assert gb.max().shape == (2, 6)
assert gb.mean().shape == (2, 6)
assert gb.count().shape == (2, 6)
assert gb.median().shape == (2, 6)


def test_join() -> None:
df_left = pl.DataFrame(
@@ -438,6 +516,14 @@ def test_join() -> None:
assert joined["k"].null_count() == 1
assert joined["a"].null_count() == 0

# we need to pass in a column to join on, either by supplying `on`, or both `left_on` and `right_on`
with pytest.raises(ValueError):
df_left.join(df_right)
with pytest.raises(ValueError):
df_left.join(df_right, right_on="a")
with pytest.raises(ValueError):
df_left.join(df_right, left_on="a")

df_a = pl.DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
df_b = pl.DataFrame(
{"foo": [1, 1, 1], "bar": ["a", "c", "c"], "ham": ["let", "var", "const"]}
@@ -477,11 +563,59 @@ def test_joins_dispatch() -> None:
dfa.join(dfa, on=["date"], how=how)


def test_hstack() -> None:
@pytest.mark.parametrize(
"stack,exp_shape,exp_columns",
[
([pl.Series("stacked", [-1, -1, -1])], (3, 3), ["a", "b", "stacked"]),
(
[pl.Series("stacked2", [-1, -1, -1]), pl.Series("stacked3", [-1, -1, -1])],
(3, 4),
["a", "b", "stacked2", "stacked3"],
),
],
)
@pytest.mark.parametrize("in_place", [True, False])
def test_hstack_list_of_series(
stack: list, exp_shape: tuple, exp_columns: list, in_place: bool
) -> None:
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
df_out = df.hstack(stack, in_place=in_place)
if in_place:
assert df.shape == exp_shape
assert df.columns == exp_columns
else:
assert df_out.shape == exp_shape # type: ignore
assert df_out.columns == exp_columns # type: ignore


@pytest.mark.parametrize("in_place", [True, False])
def test_hstack_dataframe(in_place: bool) -> None:
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
df.hstack([pl.Series("stacked", [-1, -1, -1])], in_place=True)
assert df.shape == (3, 3)
assert df.columns == ["a", "b", "stacked"]
df2 = pl.DataFrame({"c": [2, 1, 3], "d": ["a", "b", "c"]})
df_out = df.hstack(df2, in_place=in_place)
expected = pl.DataFrame(
{"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [2, 1, 3], "d": ["a", "b", "c"]}
)
if in_place:
assert df.frame_equal(expected)
else:
assert df_out.frame_equal(expected) # type: ignore


@pytest.mark.parametrize("in_place", [True, False])
def test_vstack(in_place: bool) -> None:
df1 = pl.DataFrame({"foo": [1, 2], "bar": [6, 7], "ham": ["a", "b"]})
df2 = pl.DataFrame({"foo": [3, 4], "bar": [8, 9], "ham": ["c", "d"]})

expected = pl.DataFrame(
{"foo": [1, 2, 3, 4], "bar": [6, 7, 8, 9], "ham": ["a", "b", "c", "d"]}
)

out = df1.vstack(df2, in_place=in_place)
if in_place:
assert df1.frame_equal(expected)
else:
assert out.frame_equal(expected) # type: ignore


def test_drop() -> None:
@@ -531,7 +665,10 @@ def test_set() -> None:
def test_melt() -> None:
df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]})
melted = df.melt(id_vars="A", value_vars=["B", "C"])
assert melted["value"] == [1, 3, 4, 2, 4, 6]
assert all(melted["value"] == [1, 3, 5, 2, 4, 6])

melted = df.melt(id_vars="A", value_vars="B")
assert all(melted["value"] == [1, 3, 5])


def test_shift() -> None:
@@ -689,6 +826,9 @@ def test_df_fold() -> None:
assert len(df.min(axis=1)) == 3
assert len(df.max(axis=1)) == 3

df_width_one = df[["a"]]
assert df_width_one.fold(lambda s1, s2: s1).series_equal(df["a"])


def test_row_tuple() -> None:
df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
@@ -1332,6 +1472,69 @@ def test_empty_projection() -> None:
assert pl.DataFrame({"a": [1, 2], "b": [3, 4]}).select([]).shape == (0, 0)


def test_with_column_renamed() -> None:
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
result = df.with_column_renamed("b", "c")
expected = pl.DataFrame({"a": [1, 2], "c": [3, 4]})
assert result.frame_equal(expected)


def test_fill_null() -> None:
df = pl.DataFrame({"a": [1, 2], "b": [3, None]})
assert df.fill_null(4).frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
assert df.fill_null("max").frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 3]}))


def test_fill_nan() -> None:
df = pl.DataFrame({"a": [1, 2], "b": [3.0, float("nan")]})
assert df.fill_nan(4).frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 4]}))


def test_shift_and_fill() -> None:
df = pl.DataFrame(
{
"foo": [1, 2, 3],
"bar": [6, 7, 8],
"ham": ["a", "b", "c"],
}
)
result = df.shift_and_fill(periods=1, fill_value=0)
expected = pl.DataFrame(
{
"foo": [0, 1, 2],
"bar": [0, 6, 7],
"ham": ["0", "a", "b"],
}
)
assert result.frame_equal(expected)


def test_is_duplicated() -> None:
df = pl.DataFrame({"foo": [1, 2, 2], "bar": [6, 7, 7]})
assert df.is_duplicated().series_equal(pl.Series("", [False, True, True]))


def test_is_unique() -> None:
df = pl.DataFrame({"foo": [1, 2, 2], "bar": [6, 7, 7]})
assert df.is_unique().series_equal(pl.Series("", [True, False, False]))


def test_sample() -> None:
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})
assert df.sample(n=2).shape == (2, 3)
assert df.sample(frac=0.4).shape == (1, 3)


@pytest.mark.parametrize("in_place", [True, False])
def test_shrink_to_fit(in_place: bool) -> None:
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})

if in_place:
assert df.shrink_to_fit(in_place) is None
else:
assert df.shrink_to_fit(in_place).frame_equal(df) # type: ignore


def test_arithmetic() -> None:
df = pl.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

