Skip to content

Commit

Permalink
BUG: Interchange protocol implementation allows non-string column names (pandas-dev#57174)
Browse files Browse the repository at this point in the history

* convert non-string colnames to strings in interchange protocol

* remove irrelevant statement

* informative error message if two columns end up becoming duplicates
  • Loading branch information
MarcoGorelli committed Feb 2, 2024
1 parent 1d1672d commit 8ed7dae
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 3 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Bug fixes
~~~~~~~~~
- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`)
- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`)
- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting column names to strings (:issue:`56701`)
- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`)

.. ---------------------------------------------------------------------------
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/interchange/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
Note: doesn't deal with extension arrays yet, just assume a regular
Series/ndarray for now.
"""
if isinstance(column, pd.DataFrame):
raise TypeError(
"Expected a Series, got a DataFrame. This likely happened "
"because you called __dataframe__ on a DataFrame which, "
"after converting column names to string, resulted in duplicated "
f"names: {column.columns}. Please rename these columns before "
"using the interchange protocol."
)
if not isinstance(column, pd.Series):
raise NotImplementedError(f"Columns of type {type(column)} not handled yet")

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/interchange/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None:
Constructor - an instance of this (private) class is returned from
`pd.DataFrame.__dataframe__`.
"""
self._df = df
self._df = df.rename(columns=str, copy=False)
self._allow_copy = allow_copy

def __dataframe__(
Expand Down
26 changes: 24 additions & 2 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,6 @@ def test_missing_from_masked():
}
)

df2 = df.__dataframe__()

rng = np.random.default_rng(2)
dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns}
for col, num_nulls in dict_null.items():
Expand Down Expand Up @@ -395,6 +393,30 @@ def test_large_string():
tm.assert_frame_equal(result, expected)


def test_non_str_names():
# https://github.com/pandas-dev/pandas/issues/56701
df = pd.Series([1, 2, 3], name=0).to_frame()
names = df.__dataframe__().column_names()
assert names == ["0"]


def test_non_str_names_w_duplicates():
    """Colliding stringified column names make ``from_dataframe`` raise."""
    # https://github.com/pandas-dev/pandas/issues/56701
    # The labels "0" and 0 are distinct keys but share the string form "0".
    frame = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
    interchange_obj = frame.__dataframe__()
    expected_msg = (
        "Expected a Series, got a DataFrame. This likely happened because you "
        "called __dataframe__ on a DataFrame which, after converting column "
        r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
        r"dtype='object'\). Please rename these columns before using the "
        "interchange protocol."
    )
    with pytest.raises(TypeError, match=expected_msg):
        pd.api.interchange.from_dataframe(interchange_obj, allow_copy=False)


@pytest.mark.parametrize(
"dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))]
)
Expand Down

0 comments on commit 8ed7dae

Please sign in to comment.