Skip to content

Commit

Permalink
feat[python]: re-implement DataFrame.__setitem__ (#4544)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Aug 23, 2022
1 parent a249806 commit 777b61d
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 6 deletions.
71 changes: 66 additions & 5 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1892,11 +1892,72 @@ def __getitem__(
f" of type: '{type(item)}'."
)

def __setitem__(self, key: Any, value: Any) -> None:
    """
    Reject every form of item assignment on a DataFrame.

    Raises
    ------
    TypeError
        Always; the message points the caller to ``DataFrame.with_columns``.
    """
    raise TypeError(
        "'DataFrame' object does not support item assignment by index. "
        "Use 'DataFrame.with_columns'"
    )
def __setitem__(
    self, key: str | list[int] | list[str] | tuple[Any, str | int], value: Any
) -> None:  # pragma: no cover
    """
    Assign ``value`` into this DataFrame in place.

    Two key forms are supported:

    - ``df[["a", "b"]] = matrix``: ``value`` is converted with ``np.array`` and
      must be 2D with one matrix column per name in ``key``; every matrix
      column is added through ``with_columns``.
    - ``df[row, col] = value``: ``col`` is a column name (``str``) or a column
      index (``int``); the selected Series is modified via its own
      ``__setitem__`` and then written back into the frame.

    Parameters
    ----------
    key
        A list of column names, or a ``(row_selection, column_selection)``
        tuple.
    value
        The data to assign.

    Raises
    ------
    TypeError
        If ``key`` is a single string (whole-Series assignment is not
        supported; use ``DataFrame.with_columns``).
    ImportError
        If ``key`` is a list and numpy is not available.
    ValueError
        If the matrix is not 2D or its column count differs from ``len(key)``,
        if a tuple key does not have exactly two elements, if the row
        selection is a boolean mask, if the column selection is neither a
        ``str`` nor a non-bool ``int``, or if ``key`` has an unsupported type.
    """
    # df["foo"] = series
    if isinstance(key, str):
        raise TypeError(
            "'DataFrame' object does not support 'Series' assignment by index. "
            "Use 'DataFrame.with_columns'"
        )

    # df[["C", "D"]] = matrix
    if isinstance(key, list):
        # TODO: Use python sequence constructors
        if not _NUMPY_AVAILABLE:
            raise ImportError("'numpy' is required for this functionality.")
        matrix = np.array(value)
        if matrix.ndim != 2:
            raise ValueError("can only set multiple columns with 2D matrix")
        if matrix.shape[1] != len(key):
            raise ValueError(
                "matrix columns should be equal to list used to determine column"
                " names"
            )

        # TODO: we can parallelize this by calling from_numpy
        columns = [pli.Series(name, matrix[:, i]) for i, name in enumerate(key)]
        self._df = self.with_columns(columns)._df
        return

    # df[a, b] = value
    if isinstance(key, tuple):
        # Validate the arity explicitly; a bare unpack would surface as an
        # opaque "too many values to unpack" ValueError.
        if len(key) != 2:
            raise ValueError(
                "tuple key must have exactly two elements (row, column), "
                f"got {len(key)}: {key!r}"
            )
        row_selection, col_selection = key

        if (
            isinstance(row_selection, pli.Series) and row_selection.dtype == Boolean
        ) or is_bool_sequence(row_selection):
            raise ValueError(
                "Not allowed to set 'DataFrame' by boolean mask in the "
                "row position. Consider using 'DataFrame.with_columns'"
            )

        # bool is a subclass of int; without this guard ``df[0, True]`` would
        # silently address column index 1.
        by_index = isinstance(col_selection, int) and not isinstance(
            col_selection, bool
        )

        # get series column selection
        if isinstance(col_selection, str):
            s = self.__getitem__(col_selection)
        elif by_index:
            s = self[:, col_selection]
        else:
            raise ValueError(f"column selection not understood: {col_selection}")

        # dispatch to __setitem__ of Series to do modification
        s[row_selection] = value

        # write the modified series back at the location it was taken from
        if by_index:
            self.replace_at_idx(col_selection, s)
        else:
            self.replace(col_selection, s)
        return

    raise ValueError(
        f"Cannot __setitem__ on DataFrame with key: '{key}' "
        f"of type: '{type(key)}' and value: '{value}' "
        f"of type: '{type(value)}'."
    )

def __len__(self) -> int:
    """Return ``self.height`` so that ``len(df)`` works on the frame."""
    return self.height
Expand Down
80 changes: 80 additions & 0 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -2078,3 +2078,83 @@ def test_filter_sequence() -> None:
df = pl.DataFrame({"a": [1, 2, 3]})
assert df.filter([True, False, True])["a"].to_list() == [1, 3]
assert df.filter(np.array([True, False, True]))["a"].to_list() == [1, 3]


def test_indexing_set() -> None:
    """Assigning a scalar through ``df[row, column]`` updates just that cell."""
    frame = pl.DataFrame(
        {"bool": [True, True], "str": ["N/A", "N/A"], "nr": [1, 2]}
    )

    # overwrite the first row of each column, one column at a time
    for column, replacement in (("bool", False), ("nr", 100), ("str", "foo")):
        frame[0, column] = replacement

    expected = {
        "bool": [False, True],
        "str": ["foo", "N/A"],
        "nr": [100, 2],
    }
    assert frame.to_dict(False) == expected


def test_set() -> None:
    """
    Exercise the key forms accepted and rejected by ``DataFrame.__setitem__``.

    Rejected forms are checked with ``pytest.raises``: a single string key
    (``TypeError``), a boolean mask in the row position, a non-2D or
    wrongly-shaped value for a list of names, and malformed or unsupported keys
    (``ValueError``). The accepted forms are a 2D value for a list of names and
    scalar assignment through a ``(row, column)`` tuple.
    """
    np.random.seed(1)
    df = pl.DataFrame(
        {"foo": np.random.rand(10), "bar": np.arange(10), "ham": ["h"] * 10}
    )
    # a plain string key (whole-column assignment) is not supported
    with pytest.raises(
        TypeError,
        match=r"'DataFrame' object does not support "
        r"'Series' assignment by index. Use "
        r"'DataFrame.with_columns'",
    ):
        df["new"] = np.random.rand(10)

    # a boolean mask in the row position is rejected, as a Series ...
    with pytest.raises(
        ValueError,
        match=r"Not allowed to set 'DataFrame' by "
        r"boolean mask in the row position. "
        r"Consider using 'DataFrame.with_columns'",
    ):
        df[df["ham"] > 0.5, "ham"] = "a"
    # ... and as a plain list of bools
    with pytest.raises(
        ValueError,
        match=r"Not allowed to set 'DataFrame' by "
        r"boolean mask in the row position. "
        r"Consider using 'DataFrame.with_columns'",
    ):
        df[[True, False], "ham"] = "a"

    # set 2D
    df = pl.DataFrame({"b": [0, 0]})
    df[["A", "B"]] = [[1, 2], [1, 2]]

    # scalar value: not a 2D matrix
    with pytest.raises(ValueError):
        df[["C", "D"]] = 1
    # 1D value: not a 2D matrix
    with pytest.raises(ValueError):
        df[["C", "D"]] = [1, 1]
    # 2D value with three columns for two names
    with pytest.raises(ValueError):
        df[["C", "D"]] = [[1, 2, 3], [1, 2, 3]]

    # set tuple
    df = pl.DataFrame({"b": [0, 0]})
    df[0, "b"] = 1
    assert df[0, "b"] == 1

    # the same cell addressed by column index instead of column name
    df[0, 0] = 2
    assert df[0, "b"] == 2

    # row and col selection have to be int or str
    with pytest.raises(ValueError):
        df[:, [1]] = 1  # type: ignore[index]
    with pytest.raises(ValueError):
        df[True, :] = 1  # type: ignore[index]

    # needs to be a 2 element tuple
    with pytest.raises(ValueError):
        df[(1, 2, 3)] = 1  # type: ignore[index]

    # we cannot index with any type, such as bool
    with pytest.raises(ValueError):
        df[True] = 1  # type: ignore[index]
6 changes: 5 additions & 1 deletion py-polars/tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ def test_getitem_errs() -> None:
df["a"][{"strange"}]

with pytest.raises(
TypeError, match="'DataFrame' object does not support item assignment"
ValueError,
match=r"Cannot __setitem__ on "
r"DataFrame with key: '{'some'}' of "
r"type: '<class 'set'>' and value: "
r"'foo' of type: '<class 'str'>'",
):
df[{"some"}] = "foo"

0 comments on commit 777b61d

Please sign in to comment.