Skip to content

Commit

Permalink
Add nan_to_none flag for converting NaN to None for from_pandas (#1393)
Browse files Browse the repository at this point in the history
  • Loading branch information
mahadeveaswar committed Sep 20, 2021
1 parent d7d63b1 commit 6621a17
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 12 deletions.
7 changes: 5 additions & 2 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ def from_arrow(
def from_pandas(
df: Union["pd.DataFrame", "pd.Series", "pd.DatetimeIndex"],
rechunk: bool = True,
nan_to_none: bool = True,
) -> Union["pl.Series", "pl.DataFrame"]:
"""
Construct a Polars DataFrame or Series from a pandas DataFrame or Series.
Expand All @@ -228,6 +229,8 @@ def from_pandas(
labels already present in the data. Must match data dimensions.
rechunk : bool, default True
Make sure that all data is contiguous.
nan_to_none : bool, default True
If the data contains NaN values, PyArrow will convert them to None.
Returns
-------
Expand Down Expand Up @@ -270,9 +273,9 @@ def from_pandas(
raise ImportError("from_pandas requires pandas to be installed.") from e

if isinstance(df, (pd.Series, pd.DatetimeIndex)):
return pl.Series._from_pandas("", df)
return pl.Series._from_pandas("", df, nan_to_none=nan_to_none)
elif isinstance(df, pd.DataFrame):
return pl.DataFrame._from_pandas(df, rechunk=rechunk)
return pl.DataFrame._from_pandas(df, rechunk=rechunk, nan_to_none=nan_to_none)
else:
raise ValueError(f"Expected pandas DataFrame or Series, got {type(df)}.")

Expand Down
9 changes: 8 additions & 1 deletion py-polars/polars/eager/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ def _from_pandas(
data: "pd.DataFrame",
columns: Optional[Sequence[str]] = None,
rechunk: bool = True,
nan_to_none: bool = True,
) -> "DataFrame":
"""
Construct a Polars DataFrame from a pandas DataFrame.
Expand All @@ -355,12 +356,18 @@ def _from_pandas(
labels already present in the data. Must match data dimensions.
rechunk : bool, default True
Make sure that all data is contiguous.
nan_to_none : bool, default True
If the data contains NaN values, PyArrow will convert them to None.
Returns
-------
DataFrame
"""
return cls._from_pydf(pandas_to_pydf(data, columns=columns, rechunk=rechunk))
return cls._from_pydf(
pandas_to_pydf(
data, columns=columns, rechunk=rechunk, nan_to_none=nan_to_none
)
)

@classmethod
def from_arrow(cls, table: pa.Table, rechunk: bool = True) -> "DataFrame":
Expand Down
9 changes: 7 additions & 2 deletions py-polars/polars/eager/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,17 @@ def _from_arrow(cls, name: str, values: pa.Array) -> "Series":

@classmethod
def _from_pandas(
cls, name: str, values: Union["pd.Series", "pd.DatetimeIndex"]
cls,
name: str,
values: Union["pd.Series", "pd.DatetimeIndex"],
nan_to_none: bool = True,
) -> "Series":
"""
Construct a Series from a pandas Series or DatetimeIndex.
"""
return cls._from_pyseries(pandas_to_pyseries(name, values))
return cls._from_pyseries(
pandas_to_pyseries(name, values, nan_to_none=nan_to_none)
)

def inner(self) -> "PySeries":
return self._s
Expand Down
24 changes: 17 additions & 7 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ def sequence_to_pyseries(
return constructor(name, values, strict)


def _pandas_series_to_arrow(values: Union["pd.Series", "pd.DatetimeIndex"]) -> pa.Array:
def _pandas_series_to_arrow(
values: Union["pd.Series", "pd.DatetimeIndex"], nan_to_none: bool = True
) -> pa.Array:
"""
Convert a pandas Series to an Arrow array.
"""
Expand All @@ -144,25 +146,29 @@ def _pandas_series_to_arrow(values: Union["pd.Series", "pd.DatetimeIndex"]) -> p
# We first cast to ms because that's the unit of Date64,
# Then we cast to via int64 to date64. Casting directly to Date64 lead to
# loss of time information https://github.com/pola-rs/polars/issues/476
arr = pa.array(np.array(values.values, dtype="datetime64[ms]"))
arr = pa.array(
np.array(values.values, dtype="datetime64[ms]"), from_pandas=nan_to_none
)
arr = pa.compute.cast(arr, pa.int64())
return pa.compute.cast(arr, pa.date64())
elif dtype == "object" and len(values) > 0 and isinstance(values.iloc[0], str):
return pa.array(values, pa.large_utf8())
return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
else:
return pa.array(values)
return pa.array(values, from_pandas=nan_to_none)


def pandas_to_pyseries(
name: str, values: Union["pd.Series", "pd.DatetimeIndex"]
name: str, values: Union["pd.Series", "pd.DatetimeIndex"], nan_to_none: bool = True
) -> "PySeries":
"""
Construct a PySeries from a pandas Series or DatetimeIndex.
"""
# TODO: Change `if not name` to `if name is not None` once name is Optional[str]
if not name and values.name is not None:
name = str(values.name)
return arrow_to_pyseries(name, _pandas_series_to_arrow(values))
return arrow_to_pyseries(
name, _pandas_series_to_arrow(values, nan_to_none=nan_to_none)
)


###################################
Expand Down Expand Up @@ -356,7 +362,11 @@ def pandas_to_pydf(
data: "pd.DataFrame",
columns: Optional[Sequence[str]] = None,
rechunk: bool = True,
nan_to_none: bool = True,
) -> "PyDataFrame":
arrow_dict = {str(col): _pandas_series_to_arrow(data[col]) for col in data.columns}
arrow_dict = {
str(col): _pandas_series_to_arrow(data[col], nan_to_none=nan_to_none)
for col in data.columns
}
arrow_table = pa.table(arrow_dict)
return arrow_to_pydf(arrow_table, columns=columns, rechunk=rechunk)
21 changes: 21 additions & 0 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,27 @@ def test_from_pandas():
assert out.shape == (3, 9)


def test_from_pandas_nan_to_none():
    """With nan_to_none=True (default) NaN becomes null; with False NaN is kept.

    Also verifies that pd.NA cannot be round-tripped when NaN passthrough
    is requested (pyarrow raises ArrowInvalid).
    """
    from pyarrow import ArrowInvalid

    df = pd.DataFrame(
        {
            "bools_nulls": [None, True, False],
            "int_nulls": [1, None, 3],
            "floats_nulls": [1.0, None, 3.0],
            "strings_nulls": ["foo", None, "ham"],
            "nulls": [None, np.nan, np.nan],
        }
    )
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_none=False)
    df.loc[2, "nulls"] = pd.NA
    # NOTE: `assert [expr for ...]` is always truthy for a non-empty list and
    # can never fail; use all() so the assertions actually check the values.
    assert all(val is None for val in out_true["nulls"])
    assert all(np.isnan(val) for val in out_false["nulls"][1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(df, nan_to_none=False)


def test_custom_groupby():
df = pl.DataFrame({"A": ["a", "a", "c", "c"], "B": [1, 3, 5, 2]})
assert df.groupby("A").select("B").apply(lambda x: x.sum()).shape == (2, 2)
Expand Down
14 changes: 14 additions & 0 deletions py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import date, datetime

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest

Expand Down Expand Up @@ -443,6 +444,19 @@ def test_from_pydatetime():
assert s.dt[0] == dates[0]


def test_from_pandas_nan_to_none():
    """With nan_to_none=True (default) NaN becomes null; with False NaN is kept.

    Also verifies that pd.NA cannot be round-tripped when NaN passthrough
    is requested (pyarrow raises ArrowInvalid).
    """
    from pyarrow import ArrowInvalid

    df = pd.Series([2, np.nan, None], name="pd")
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_none=False)
    df.loc[2] = pd.NA
    # NOTE: `assert [expr for ...]` is always truthy for a non-empty list and
    # can never fail; use all() so the assertions actually check the values.
    # Element 0 is the value 2 (not null), so only the NaN positions ([1:])
    # are expected to be None / NaN.
    assert all(val is None for val in out_true[1:])
    assert all(np.isnan(val) for val in out_false[1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(df, nan_to_none=False)


def test_round():
a = pl.Series("f", [1.003, 2.003])
b = a.round(2)
Expand Down

0 comments on commit 6621a17

Please sign in to comment.