Skip to content

Commit

Permalink
Add nan_to_none flag for converting NaN to None for from_pandas (#1393)
Browse files Browse the repository at this point in the history
  • Loading branch information
mahadeveaswar committed Sep 20, 2021
1 parent d7d63b1 commit 6621a17
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 12 deletions.
7 changes: 5 additions & 2 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ def from_arrow(
def from_pandas(
df: Union["pd.DataFrame", "pd.Series", "pd.DatetimeIndex"],
rechunk: bool = True,
nan_to_none: bool = True,
) -> Union["pl.Series", "pl.DataFrame"]:
"""
Construct a Polars DataFrame or Series from a pandas DataFrame or Series.
Expand All @@ -228,6 +229,8 @@ def from_pandas(
labels already present in the data. Must match data dimensions.
rechunk : bool, default True
Make sure that all data is contiguous.
nan_to_none : bool, default True
If the data contains NaN values, PyArrow will convert them to None.
Returns
-------
Expand Down Expand Up @@ -270,9 +273,9 @@ def from_pandas(
raise ImportError("from_pandas requires pandas to be installed.") from e

if isinstance(df, (pd.Series, pd.DatetimeIndex)):
return pl.Series._from_pandas("", df)
return pl.Series._from_pandas("", df, nan_to_none=nan_to_none)
elif isinstance(df, pd.DataFrame):
return pl.DataFrame._from_pandas(df, rechunk=rechunk)
return pl.DataFrame._from_pandas(df, rechunk=rechunk, nan_to_none=nan_to_none)
else:
raise ValueError(f"Expected pandas DataFrame or Series, got {type(df)}.")

Expand Down
9 changes: 8 additions & 1 deletion py-polars/polars/eager/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ def _from_pandas(
data: "pd.DataFrame",
columns: Optional[Sequence[str]] = None,
rechunk: bool = True,
nan_to_none: bool = True,
) -> "DataFrame":
"""
Construct a Polars DataFrame from a pandas DataFrame.
Expand All @@ -355,12 +356,18 @@ def _from_pandas(
labels already present in the data. Must match data dimensions.
rechunk : bool, default True
Make sure that all data is contiguous.
nan_to_none : bool, default True
If the data contains NaN values, PyArrow will convert them to None.
Returns
-------
DataFrame
"""
return cls._from_pydf(pandas_to_pydf(data, columns=columns, rechunk=rechunk))
return cls._from_pydf(
pandas_to_pydf(
data, columns=columns, rechunk=rechunk, nan_to_none=nan_to_none
)
)

@classmethod
def from_arrow(cls, table: pa.Table, rechunk: bool = True) -> "DataFrame":
Expand Down
9 changes: 7 additions & 2 deletions py-polars/polars/eager/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,17 @@ def _from_arrow(cls, name: str, values: pa.Array) -> "Series":

@classmethod
def _from_pandas(
cls, name: str, values: Union["pd.Series", "pd.DatetimeIndex"]
cls,
name: str,
values: Union["pd.Series", "pd.DatetimeIndex"],
nan_to_none: bool = True,
) -> "Series":
"""
Construct a Series from a pandas Series or DatetimeIndex.
"""
return cls._from_pyseries(pandas_to_pyseries(name, values))
return cls._from_pyseries(
pandas_to_pyseries(name, values, nan_to_none=nan_to_none)
)

def inner(self) -> "PySeries":
return self._s
Expand Down
24 changes: 17 additions & 7 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ def sequence_to_pyseries(
return constructor(name, values, strict)


def _pandas_series_to_arrow(values: Union["pd.Series", "pd.DatetimeIndex"]) -> pa.Array:
def _pandas_series_to_arrow(
values: Union["pd.Series", "pd.DatetimeIndex"], nan_to_none: bool = True
) -> pa.Array:
"""
Convert a pandas Series to an Arrow array.
"""
Expand All @@ -144,25 +146,29 @@ def _pandas_series_to_arrow(values: Union["pd.Series", "pd.DatetimeIndex"]) -> p
# We first cast to ms because that's the unit of Date64,
# Then we cast to via int64 to date64. Casting directly to Date64 lead to
# loss of time information https://github.com/pola-rs/polars/issues/476
arr = pa.array(np.array(values.values, dtype="datetime64[ms]"))
arr = pa.array(
np.array(values.values, dtype="datetime64[ms]"), from_pandas=nan_to_none
)
arr = pa.compute.cast(arr, pa.int64())
return pa.compute.cast(arr, pa.date64())
elif dtype == "object" and len(values) > 0 and isinstance(values.iloc[0], str):
return pa.array(values, pa.large_utf8())
return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
else:
return pa.array(values)
return pa.array(values, from_pandas=nan_to_none)


def pandas_to_pyseries(
name: str, values: Union["pd.Series", "pd.DatetimeIndex"]
name: str, values: Union["pd.Series", "pd.DatetimeIndex"], nan_to_none: bool = True
) -> "PySeries":
"""
Construct a PySeries from a pandas Series or DatetimeIndex.
"""
# TODO: Change `if not name` to `if name is not None` once name is Optional[str]
if not name and values.name is not None:
name = str(values.name)
return arrow_to_pyseries(name, _pandas_series_to_arrow(values))
return arrow_to_pyseries(
name, _pandas_series_to_arrow(values, nan_to_none=nan_to_none)
)


###################################
Expand Down Expand Up @@ -356,7 +362,11 @@ def pandas_to_pydf(
data: "pd.DataFrame",
columns: Optional[Sequence[str]] = None,
rechunk: bool = True,
nan_to_none: bool = True,
) -> "PyDataFrame":
arrow_dict = {str(col): _pandas_series_to_arrow(data[col]) for col in data.columns}
arrow_dict = {
str(col): _pandas_series_to_arrow(data[col], nan_to_none=nan_to_none)
for col in data.columns
}
arrow_table = pa.table(arrow_dict)
return arrow_to_pydf(arrow_table, columns=columns, rechunk=rechunk)
21 changes: 21 additions & 0 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,27 @@ def test_from_pandas():
assert out.shape == (3, 9)


def test_from_pandas_nan_to_none():
    """With nan_to_none=True (default) NaN becomes null; with False NaN is kept.

    Also verifies that pd.NA cannot be round-tripped when NaN passthrough
    is requested (pyarrow raises ArrowInvalid).
    """
    from pyarrow import ArrowInvalid

    df = pd.DataFrame(
        {
            "bools_nulls": [None, True, False],
            "int_nulls": [1, None, 3],
            "floats_nulls": [1.0, None, 3.0],
            "strings_nulls": ["foo", None, "ham"],
            "nulls": [None, np.nan, np.nan],
        }
    )
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_none=False)
    df.loc[2, "nulls"] = pd.NA
    # NOTE: `assert [expr for ...]` is always truthy for a non-empty list and
    # can never fail; use all() so the assertions actually check the values.
    assert all(val is None for val in out_true["nulls"])
    assert all(np.isnan(val) for val in out_false["nulls"][1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(df, nan_to_none=False)


def test_custom_groupby():
df = pl.DataFrame({"A": ["a", "a", "c", "c"], "B": [1, 3, 5, 2]})
assert df.groupby("A").select("B").apply(lambda x: x.sum()).shape == (2, 2)
Expand Down
14 changes: 14 additions & 0 deletions py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import date, datetime

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest

Expand Down Expand Up @@ -443,6 +444,19 @@ def test_from_pydatetime():
assert s.dt[0] == dates[0]


def test_from_pandas_nan_to_none():
    """With nan_to_none=True (default) NaN becomes null; with False NaN is kept.

    Also verifies that pd.NA cannot be round-tripped when NaN passthrough
    is requested (pyarrow raises ArrowInvalid).
    """
    from pyarrow import ArrowInvalid

    df = pd.Series([2, np.nan, None], name="pd")
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_none=False)
    df.loc[2] = pd.NA
    # NOTE: `assert [expr for ...]` is always truthy for a non-empty list and
    # can never fail; use all() so the assertions actually check the values.
    # Element 0 is the value 2 (not null), so only the NaN positions ([1:])
    # are expected to be None / NaN.
    assert all(val is None for val in out_true[1:])
    assert all(np.isnan(val) for val in out_false[1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(df, nan_to_none=False)


def test_round():
a = pl.Series("f", [1.003, 2.003])
b = a.round(2)
Expand Down

0 comments on commit 6621a17

Please sign in to comment.