Skip to content

Commit

Permalink
feat[python]: Complain when Pandas Dataframe contains duplicated column names (#4733) (#4744)
Browse files Browse the repository at this point in the history
  • Loading branch information
ghuls committed Sep 6, 2022
1 parent bf99f6c commit a231449
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
11 changes: 9 additions & 2 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ def _pandas_series_to_arrow(
Arrow Array
"""
dtype = values.dtype
dtype = getattr(values, "dtype", None)
if dtype == "object" and len(values) > 0:
first_non_none = _get_first_non_none(values.values) # type: ignore[arg-type]

Expand All @@ -364,8 +364,15 @@ def _pandas_series_to_arrow(
return pa.nulls(min_len, pa.large_utf8())

return pa.array(values, from_pandas=nan_to_none)
else:
elif dtype:
return pa.array(values, from_pandas=nan_to_none)
else:
# Pandas Series is actually a Pandas DataFrame when the original dataframe
# contains duplicated columns and a duplicated column is requested with df["a"].
raise ValueError(
"Duplicate column names found: "
+ f"{str(values.columns.tolist())}" # type: ignore[union-attr]
)


def pandas_to_pyseries(
Expand Down
6 changes: 6 additions & 0 deletions py-polars/tests/unit/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ def test_from_pandas_datetime() -> None:
pl.from_pandas(df)


def test_from_pandas_duplicated_columns() -> None:
    """Check that ``pl.from_pandas`` rejects duplicated column names."""
    rows = [[1, 2, 3, 4], [5, 6, 7, 8]]
    # The label "b" is deliberately used for two of the four columns.
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "b"])
    with pytest.raises(ValueError, match="Duplicate column names found: "):
        pl.from_pandas(frame)


def test_arrow_list_roundtrip() -> None:
# https://github.com/pola-rs/polars/issues/1064
tbl = pa.table({"a": [1], "b": [[1, 2]]})
Expand Down

0 comments on commit a231449

Please sign in to comment.