Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Implement DataFrame Interchange Protocol through pyarrow #6581

Merged
merged 1 commit into from
Jan 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/dataframe/export.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Export DataFrame data to other formats:
.. autosummary::
:toctree: api/

DataFrame.__dataframe__
DataFrame.to_arrow
DataFrame.to_dict
DataFrame.to_dicts
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Conversion
:toctree: api/

from_arrow
from_dataframe
from_dict
from_dicts
from_numpy
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def version() -> str:
from polars.cfg import Config
from polars.convert import (
from_arrow,
from_dataframe,
from_dict,
from_dicts,
from_numpy,
Expand Down Expand Up @@ -291,6 +292,7 @@ def version() -> str:
"duration",
"coalesce",
# polars.convert
"from_dataframe",
"from_dict",
"from_dicts",
"from_records",
Expand Down
48 changes: 48 additions & 0 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import TYPE_CHECKING, Any, Mapping, Sequence, overload

from polars.datatypes import N_INFER_DEFAULT, SchemaDefinition, SchemaDict
from polars.dependencies import _PYARROW_AVAILABLE
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
Expand Down Expand Up @@ -479,3 +480,50 @@ def from_pandas(
)
else:
raise ValueError(f"Expected pandas DataFrame or Series, got {type(df)}.")


def from_dataframe(df: Any, allow_copy: bool = True) -> DataFrame:
    """
    Build a Polars DataFrame from any dataframe supporting the interchange protocol.

    Parameters
    ----------
    df
        Object supporting the dataframe interchange protocol, i.e. it must
        implement the ``__dataframe__`` method. A Polars ``DataFrame`` is
        returned as-is, before any other check is performed.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False,
        conversions that are not zero-copy fail.

    Raises
    ------
    TypeError
        If ``df`` has no ``__dataframe__`` attribute.
    ImportError
        If pyarrow is flagged as unavailable or its major version is below 11.
    NotImplementedError
        If ``allow_copy`` is False and ``df`` is not already a Polars DataFrame.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    Zero-copy conversions currently cannot be guaranteed and will throw a
    ``NotImplementedError``.

    Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is
    a more efficient method of conversion.

    """
    # Already a Polars frame: nothing to convert, and no pyarrow needed.
    if isinstance(df, DataFrame):
        return df

    supports_protocol = hasattr(df, "__dataframe__")
    if not supports_protocol:
        raise TypeError(
            f"`df` of type {type(df)} does not support the dataframe interchange"
            " protocol."
        )

    # ``and`` short-circuits, so ``pa.__version__`` is only read when the
    # availability flag is set.
    pyarrow_ok = _PYARROW_AVAILABLE and int(pa.__version__.partition(".")[0]) >= 11
    if not pyarrow_ok:
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a dataframe interchange object"
            " to a Polars dataframe."
        )

    if not allow_copy:
        raise NotImplementedError(
            "Polars cannot guarantee zero-copy conversion from dataframe interchange"
            " objects at this time."
        )

    # Go through pyarrow's interchange consumer, then hand the table to Polars.
    arrow_table = pa.interchange.from_dataframe(df, allow_copy=allow_copy)
    converted = from_arrow(arrow_table, rechunk=allow_copy)
    return converted  # type: ignore[return-value]
39 changes: 39 additions & 0 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
INTEGER_DTYPES,
N_INFER_DEFAULT,
Boolean,
Categorical,
DataTypeClass,
Float64,
Int8,
Expand Down Expand Up @@ -105,6 +106,8 @@
from typing_extensions import Concatenate, ParamSpec, TypeAlias

if TYPE_CHECKING:
from pyarrow.interchange.dataframe import _PyArrowDataFrame

from polars.internals.type_aliases import (
AsofJoinStrategy,
AvroCompression,
Expand Down Expand Up @@ -1126,6 +1129,42 @@ def schema(self) -> SchemaDict:
"""
return dict(zip(self.columns, self.dtypes))

def __dataframe__(
    self, nan_as_null: bool = False, allow_copy: bool = True
) -> _PyArrowDataFrame:
    """
    Convert to a dataframe object implementing the dataframe interchange protocol.

    The frame is exported to Arrow via ``to_arrow`` and the resulting object's
    own ``__dataframe__`` implementation is returned.

    Parameters
    ----------
    nan_as_null
        Overwrite null values in the data with ``NaN``.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False,
        conversions that are not zero-copy fail.

    Raises
    ------
    ImportError
        If pyarrow is flagged as unavailable or its major version is below 11.
    NotImplementedError
        If ``allow_copy`` is False and the frame has a categorical column.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    `nan_as_null` currently has no effect; once support for nullable extension
    dtypes is added, this value should be propagated to columns.

    """
    # ``and`` short-circuits, so ``pa.__version__`` is only read when the
    # availability flag is set.
    pyarrow_ok = _PYARROW_AVAILABLE and int(pa.__version__.partition(".")[0]) >= 11
    if not pyarrow_ok:
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a Polars dataframe to a"
            " dataframe interchange object."
        )

    # The schema is only inspected when the caller forbids copies.
    if not allow_copy:
        if Categorical in self.schema.values():
            raise NotImplementedError(
                "Polars does not offer zero-copy conversion to Arrow for categorical"
                " columns. Set `allow_copy=True` or cast categorical columns to"
                " string first."
            )

    arrow_table = self.to_arrow()
    return arrow_table.__dataframe__(nan_as_null, allow_copy)

def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame:
"""Compare a DataFrame with another object."""
if isinstance(other, DataFrame):
Expand Down
118 changes: 118 additions & 0 deletions py-polars/tests/unit/test_interchange.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import sys

import pandas as pd
import pytest
from _pytest.monkeypatch import MonkeyPatch

import polars as pl
from polars.testing import assert_frame_equal


def test_interchange() -> None:
    """Spot-check the interchange object built from a small mixed-dtype frame."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})
    interchange = frame.__dataframe__()

    # A few sampled properties are enough to show the conversion went through.
    assert interchange.num_rows() == 2

    # Per the interchange protocol, dtype[1] is the bit width of the column.
    int_column = interchange.get_column(0)
    assert int_column.dtype[1] == 64

    # Presumably 6 bytes because "foo" + "bar" are stored back to back.
    string_buffers = interchange.get_column_by_name("c").get_buffers()
    data_buffer = string_buffers["data"][0]
    assert data_buffer.bufsize == 6


def test_interchange_pyarrow_required(monkeypatch: MonkeyPatch) -> None:
    """``__dataframe__`` raises ``ImportError`` when pyarrow is flagged unavailable."""
    frame_module = pl.internals.dataframe.frame
    monkeypatch.setattr(frame_module, "_PYARROW_AVAILABLE", False)

    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(ImportError, match="pyarrow"):
        frame.__dataframe__()


def test_interchange_pyarrow_min_version(monkeypatch: MonkeyPatch) -> None:
    """``__dataframe__`` raises ``ImportError`` when pyarrow reports version 10."""
    # Patch the version string on the pyarrow object used by the frame module.
    patched_pyarrow = pl.internals.dataframe.frame.pa  # type: ignore[attr-defined]
    monkeypatch.setattr(patched_pyarrow, "__version__", "10.0.0")

    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(ImportError, match="pyarrow"):
        frame.__dataframe__()


def test_interchange_categorical() -> None:
    """Categorical columns convert only when copying is permitted."""
    frame = pl.DataFrame({"a": ["foo", "bar"]}, schema={"a": pl.Categorical})

    # With copies allowed the conversion succeeds.
    interchange = frame.__dataframe__(allow_copy=True)
    categorical_column = interchange.get_column_by_name("a")
    # 23 signifies categorical dtype
    assert categorical_column.dtype[0] == 23

    # With copies forbidden the export is refused.
    with pytest.raises(NotImplementedError, match="categorical"):
        frame.__dataframe__(allow_copy=False)


def test_from_dataframe() -> None:
    """A Polars frame round-trips through its own interchange object."""
    expected = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})
    interchange = expected.__dataframe__()

    roundtripped = pl.from_dataframe(interchange)

    assert_frame_equal(roundtripped, expected)


@pytest.mark.xfail(
    sys.version_info < (3, 8),
    reason="Pandas does not implement the protocol on Python 3.7",
)
def test_from_dataframe_pandas() -> None:
    """A pandas frame converts to the same Polars frame as the raw dict does."""
    columns = {"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]}

    # Feed a pandas dataframe through the interchange-based constructor.
    pandas_frame = pd.DataFrame(columns)
    converted = pl.from_dataframe(pandas_frame)

    reference = pl.DataFrame(columns)
    assert_frame_equal(converted, reference)


@pytest.mark.xfail(
    sys.version_info < (3, 8),
    reason="Pandas does not implement the protocol on Python 3.7",
)
def test_from_dataframe_allow_copy() -> None:
    """Check how ``allow_copy=False`` is handled for Polars and non-Polars input."""
    # Zero copy only allowed when input is already a Polars dataframe.
    # ``allow_copy=False`` must be passed here: with ``allow_copy=True`` the
    # zero-copy path this case is meant to cover would never be exercised.
    df = pl.DataFrame({"a": [1, 2]})
    result = pl.from_dataframe(df, allow_copy=False)
    assert_frame_equal(result, df)

    # Zero copy cannot be guaranteed for other inputs at this time
    df_pandas = pd.DataFrame({"a": [1, 2]})
    with pytest.raises(NotImplementedError):
        pl.from_dataframe(df_pandas, allow_copy=False)


def test_from_dataframe_invalid_type() -> None:
    """Objects without ``__dataframe__`` (here a nested list) raise ``TypeError``."""
    not_a_dataframe = [[1, 2], [3, 4]]
    with pytest.raises(TypeError):
        pl.from_dataframe(not_a_dataframe)


def test_from_dataframe_pyarrow_required(monkeypatch: MonkeyPatch) -> None:
    """``from_dataframe`` needs pyarrow for interchange objects, not Polars frames."""
    # Only the flag seen by ``polars.convert`` is switched off.
    monkeypatch.setattr(pl.convert, "_PYARROW_AVAILABLE", False)

    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(ImportError, match="pyarrow"):
        pl.from_dataframe(frame.__dataframe__())

    # 'Converting' from a Polars dataframe does not hit this requirement
    passthrough = pl.from_dataframe(frame)
    assert_frame_equal(passthrough, frame)


def test_from_dataframe_pyarrow_min_version(monkeypatch: MonkeyPatch) -> None:
    """``from_dataframe`` raises ``ImportError`` when pyarrow reports version 10."""
    # Build the interchange object before patching the version.
    # NOTE(review): presumably the patched pyarrow object is shared with the
    # frame module, in which case exporting after the patch would fail - confirm.
    interchange = pl.DataFrame({"a": [1, 2]}).__dataframe__()

    patched_pyarrow = pl.convert.pa  # type: ignore[attr-defined]
    monkeypatch.setattr(patched_pyarrow, "__version__", "10.0.0")

    with pytest.raises(ImportError, match="pyarrow"):
        pl.from_dataframe(interchange)