Skip to content

Commit

Permalink
fix bug in pandas None arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Nov 2, 2021
1 parent 7ed58e4 commit 8d97a8d
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 7 deletions.
32 changes: 28 additions & 4 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,24 @@ def sequence_to_pyseries(


def _pandas_series_to_arrow(
values: Union["pd.Series", "pd.DatetimeIndex"], nan_to_none: bool = True
values: Union["pd.Series", "pd.DatetimeIndex"],
nan_to_none: bool = True,
min_len: Optional[int] = None,
) -> "pa.Array":
"""
Convert a pandas Series to an Arrow Array.
Parameters
----------
values
Series to convert to arrow
nan_to_none
Interpret `NaN` as missing values
min_len
Length of the all-null float64 array that is returned when the Series has object dtype and its first value is `None`; if `min_len` is `None`, the values are converted as usual instead.
Returns
-------
"""
dtype = values.dtype
if dtype == "datetime64[ns]":
Expand All @@ -167,8 +181,15 @@ def _pandas_series_to_arrow(
)
arr = pa.compute.cast(arr, pa.int64())
return pa.compute.cast(arr, pa.timestamp("ms"))
elif dtype == "object" and len(values) > 0 and isinstance(values.iloc[0], str):
return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
elif dtype == "object" and len(values) > 0:
if isinstance(values.iloc[0], str):
return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)

# first value is None: treat the column as all-null and return a float64 null array of length `min_len`
if values.iloc[0] is None and min_len is not None:
return pa.nulls(min_len, pa.float64())
else:
return pa.array(values, from_pandas=nan_to_none)
else:
return pa.array(values, from_pandas=nan_to_none)

Expand Down Expand Up @@ -385,8 +406,11 @@ def pandas_to_pydf(
raise ImportError(
"'pyarrow' is required when constructing a PyDataFrame from a pandas DataFrame."
)
len = data.shape[0]
arrow_dict = {
str(col): _pandas_series_to_arrow(data[col], nan_to_none=nan_to_none)
str(col): _pandas_series_to_arrow(
data[col], nan_to_none=nan_to_none, min_len=len
)
for col in data.columns
}
arrow_table = pa.table(arrow_dict)
Expand Down
10 changes: 7 additions & 3 deletions py-polars/tests/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,14 @@ def test_arrow_list_chunked_array():


def test_from_pandas_null():
test_df = pd.DataFrame([{0: None}, {0: None}])
out = pl.DataFrame(test_df)
df = pd.DataFrame([{"a": None}, {"a": None}])
out = pl.DataFrame(df)
assert out.dtypes == [pl.Float64]
assert out["0"][0] is None
assert out["a"][0] is None

df = pd.DataFrame([{"a": None, "b": 1}, {"a": None, "b": 2}])
out = pl.DataFrame(df)
assert out.dtypes == [pl.Float64, pl.Int64]


def test_from_pandas_nested_list():
Expand Down

0 comments on commit 8d97a8d

Please sign in to comment.