Skip to content

Commit

Permalink
Unify from_pandas construction (#1037)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored and ritchie46 committed Jul 26, 2021
1 parent d8257a8 commit f44a9cc
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 212 deletions.
86 changes: 49 additions & 37 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import numpy as np
import pyarrow as pa
import pyarrow.compute

import polars as pl

Expand Down Expand Up @@ -139,7 +138,6 @@ def from_arrow(
Examples
--------
Constructing a DataFrame from an Arrow table:
```python
Expand Down Expand Up @@ -183,60 +181,74 @@ def from_arrow(
raise ValueError(f"Expected Arrow table or array, got {type(a)}.")


def _from_pandas_helper(a: Union["pd.Series", "pd.DatetimeIndex"]) -> pa.Array:
    """
    Convert a pandas Series or DatetimeIndex into a pyarrow Array.

    Parameters
    ----------
    a : pandas Series or DatetimeIndex
        Values to convert.

    Returns
    -------
    pyarrow.Array
        * ``datetime64[ns]`` input is returned as a Date64 array.
        * ``object`` input whose first element is a ``str`` is returned as a
          LargeUtf8 array. Only the first element is inspected.
        * Anything else is passed to ``pa.array`` and its type is inferred.
    """
    dtype = a.dtype
    if dtype == "datetime64[ns]":
        # First truncate to milliseconds, because that is the unit of Date64,
        # then cast via int64 to Date64. Casting directly to Date64 loses the
        # time information: https://github.com/ritchie46/polars/issues/476
        arr = pa.array(np.array(a.values, dtype="datetime64[ms]"))
        arr = pa.compute.cast(arr, pa.int64())
        return pa.compute.cast(arr, pa.date64())

    # Check the length before peeking at the first element: an empty
    # object-dtype Series would otherwise raise IndexError on ``iloc[0]``.
    if dtype == "object" and len(a) > 0 and isinstance(a.iloc[0], str):
        return pa.array(a, pa.large_utf8())

    return pa.array(a)


def from_pandas(
    df: Union["pd.DataFrame", "pd.Series", "pd.DatetimeIndex"],
    rechunk: bool = True,
) -> Union["pl.Series", "pl.DataFrame"]:
    """
    Construct a Polars DataFrame or Series from a pandas DataFrame or Series.

    Requires the pandas package to be installed.

    Parameters
    ----------
    df : pandas DataFrame, Series, or DatetimeIndex
        Data represented as a pandas DataFrame, Series, or DatetimeIndex.
    rechunk : bool, default True
        Make sure that all data is contiguous. Only forwarded when `df` is a
        pandas DataFrame.

    Returns
    -------
    DataFrame or Series
        A Polars DataFrame when `df` is a pandas DataFrame; a Polars Series when
        `df` is a pandas Series or DatetimeIndex.

    Raises
    ------
    ImportError
        If pandas is not installed.
    ValueError
        If `df` is not a pandas DataFrame, Series, or DatetimeIndex.

    Examples
    --------
    Constructing a DataFrame from a pandas DataFrame:

    ```python
    >>> pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])
    >>> df = pl.from_pandas(pd_df)
    >>> df
    shape: (2, 3)
    ╭─────┬─────┬─────╮
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 2   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
    │ 4   ┆ 5   ┆ 6   │
    ╰─────┴─────┴─────╯
    ```

    Constructing a Series from a pandas Series:

    ```python
    >>> pd_series = pd.Series([1,2,3], name='pd')
    >>> s = pl.from_pandas(pd_series)
    >>> s
    shape: (3,)
    Series: 'pd' [i64]
    [
        1
        2
        3
    ]
    ```
    """
    # pandas is an optional dependency, so it is imported lazily here.
    try:
        import pandas as pd
    except ImportError as e:
        raise ImportError("from_pandas requires pandas to be installed.") from e

    if isinstance(df, (pd.Series, pd.DatetimeIndex)):
        return pl.Series._from_pandas("", df)
    elif isinstance(df, pd.DataFrame):
        return pl.DataFrame._from_pandas(df, rechunk=rechunk)
    else:
        raise ValueError(f"Expected pandas DataFrame or Series, got {type(df)}.")


def from_rows(
Expand Down
74 changes: 28 additions & 46 deletions py-polars/polars/eager/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import numpy as np
import pyarrow as pa
import pyarrow.compute
import pyarrow.parquet

import polars as pl
Expand Down Expand Up @@ -222,7 +223,7 @@ def __init__(
self._df = series_to_pydf(data, columns=columns)

elif _PANDAS_AVAILABLE and isinstance(data, pd.DataFrame):
self._df = pandas_to_pydf(data, columns=columns, nullable=nullable)
self._df = pandas_to_pydf(data, columns=columns)

else:
raise ValueError("DataFrame constructor not called properly.")
Expand Down Expand Up @@ -334,6 +335,32 @@ def _from_arrow(
"""
return cls._from_pydf(arrow_to_pydf(data, columns=columns, rechunk=rechunk))

@classmethod
def _from_pandas(
    cls,
    data: "pd.DataFrame",
    columns: Optional[Sequence[str]] = None,
    rechunk: bool = True,
) -> "DataFrame":
    """
    Build a Polars DataFrame out of a pandas DataFrame.

    Parameters
    ----------
    data : pandas DataFrame
        Two-dimensional data represented as a pandas DataFrame.
    columns : Sequence of str, default None
        Column labels for the resulting DataFrame. When given, they override
        any labels already present in the data. Must match data dimensions.
    rechunk : bool, default True
        Make sure that all data is contiguous.

    Returns
    -------
    DataFrame
    """
    # Convert to the underlying PyDataFrame first, then wrap it in `cls`.
    pydf = pandas_to_pydf(data, columns=columns, rechunk=rechunk)
    return cls._from_pydf(pydf)

@classmethod
def from_arrow(cls, table: pa.Table, rechunk: bool = True) -> "DataFrame":
"""
Expand Down Expand Up @@ -361,51 +388,6 @@ def from_arrow(cls, table: pa.Table, rechunk: bool = True) -> "DataFrame":
)
return cls._from_arrow(table, rechunk=rechunk)

@classmethod
def _from_pandas(
    cls,
    data: "pd.DataFrame",
    columns: Optional[Sequence[str]] = None,
    nullable: bool = True,
) -> "DataFrame":
    """
    Build a Polars DataFrame out of a pandas DataFrame.

    Parameters
    ----------
    data : pandas DataFrame
        Two-dimensional data represented as a pandas DataFrame.
    columns : Sequence of str, default None
        Column labels for the resulting DataFrame. When given, they override
        any labels already present in the data. Must match data dimensions.
    nullable : bool, default True
        If your data does not contain null values, set to False to speed up
        DataFrame creation.

    Returns
    -------
    DataFrame

    Examples
    --------
    ```python
    >>> pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])
    >>> df = pl.DataFrame._from_pandas(pd_df, columns=['d', 'e', 'f'])
    >>> df
    shape: (2, 3)
    ╭─────┬─────┬─────╮
    │ d   ┆ e   ┆ f   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 2   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
    │ 4   ┆ 5   ┆ 6   │
    ╰─────┴─────┴─────╯
    ```
    """
    # Convert to the underlying PyDataFrame first, then wrap it in `cls`.
    pydf = pandas_to_pydf(data, columns=columns, nullable=nullable)
    return cls._from_pydf(pydf)

@classmethod
def from_rows(
cls,
Expand Down
23 changes: 22 additions & 1 deletion py-polars/polars/eager/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from polars.internals.construction import (
arrow_to_pyseries,
numpy_to_pyseries,
pandas_to_pyseries,
sequence_to_pyseries,
series_to_pyseries,
)
Expand Down Expand Up @@ -40,6 +41,13 @@
)
from ..utils import _ptr_to_numpy

try:
import pandas as pd

_PANDAS_AVAILABLE = True
except ImportError:
_PANDAS_AVAILABLE = False

__all__ = [
"Series",
]
Expand Down Expand Up @@ -83,7 +91,9 @@ def wrap_s(s: "PySeries") -> "Series":
return Series._from_pyseries(s)


ArrayLike = Union[Sequence[Any], "Series", pa.Array, np.ndarray]
ArrayLike = Union[
Sequence[Any], "Series", pa.Array, np.ndarray, "pd.Series", "pd.DatetimeIndex"
]


class Series:
Expand Down Expand Up @@ -192,6 +202,8 @@ def __init__(
self._s = sequence_to_pyseries(name, values, dtype=dtype)
else:
self._s = numpy_to_pyseries(name, np.array(values))
elif _PANDAS_AVAILABLE and isinstance(values, (pd.Series, pd.DatetimeIndex)):
self._s = pandas_to_pyseries(name, values)
else:
raise ValueError("Series constructor not called properly.")

Expand All @@ -215,6 +227,15 @@ def _from_arrow(cls, name: str, values: pa.Array) -> "Series":
"""
return cls._from_pyseries(arrow_to_pyseries(name, values))

@classmethod
def _from_pandas(
    cls, name: str, values: Union["pd.Series", "pd.DatetimeIndex"]
) -> "Series":
    """
    Build a Series from a pandas Series or DatetimeIndex.

    Parameters
    ----------
    name : str
        Name forwarded to `pandas_to_pyseries` together with the values.
    values : pandas Series or DatetimeIndex
        One-dimensional pandas data to convert.

    Returns
    -------
    Series
    """
    # Convert to the underlying PySeries first, then wrap it in `cls`.
    pyseries = pandas_to_pyseries(name, values)
    return cls._from_pyseries(pyseries)

@classmethod
def from_arrow(cls, name: str, array: pa.Array) -> "Series":
"""
Expand Down
1 change: 0 additions & 1 deletion py-polars/polars/functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Optional, Sequence, Union

import pyarrow as pa
import pyarrow.compute

import polars as pl

Expand Down

0 comments on commit f44a9cc

Please sign in to comment.