Commit 9c319c9: Add from_numpy constructor (#3944)

stinodego committed Jul 8, 2022
1 parent f663838 commit 9c319c9
Showing 5 changed files with 217 additions and 108 deletions.
10 changes: 9 additions & 1 deletion py-polars/polars/__init__.py
@@ -16,7 +16,14 @@ def version() -> str:
Config,
toggle_string_cache,
)
from polars.convert import from_arrow, from_dict, from_dicts, from_pandas, from_records
from polars.convert import (
from_arrow,
from_dict,
from_dicts,
from_numpy,
from_pandas,
from_records,
)
from polars.datatypes import (
Boolean,
Categorical,
@@ -231,6 +238,7 @@ def version() -> str:
"from_dict",
"from_dicts",
"from_records",
"from_numpy",
"from_arrow",
"from_pandas",
# testing
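With this change, `from_numpy` is re-exported from the package root alongside the other `from_*` constructors. A minimal sketch of the new entry point (assumes a polars build containing this commit, plus numpy installed):

import numpy as np
import polars as pl

# from_numpy is now importable straight from the top-level namespace,
# just like from_dict, from_dicts, from_records, from_arrow, and from_pandas.
df = pl.from_numpy(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b"], orient="col")
print(df.shape)  # (3, 2): with orient="col", each inner array becomes a column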
130 changes: 95 additions & 35 deletions py-polars/polars/convert.py
@@ -1,22 +1,30 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Mapping, Sequence, overload
import warnings
from typing import Any, Mapping, Sequence, overload

from polars.internals import DataFrame, Series

if TYPE_CHECKING:  # pragma: no cover
    import numpy as np
    import pandas as pd
    import pyarrow as pa

    _PYARROW_AVAILABLE = True
else:
    try:
        import pyarrow as pa

        _PYARROW_AVAILABLE = True
    except ImportError:  # pragma: no cover
        _PYARROW_AVAILABLE = False

try:
    import numpy as np

    _NUMPY_AVAILABLE = True
except ImportError:  # pragma: no cover
    _NUMPY_AVAILABLE = False

try:
    import pyarrow as pa

    _PYARROW_AVAILABLE = True
except ImportError:  # pragma: no cover
    _PYARROW_AVAILABLE = False

try:
    import pandas as pd

    _PANDAS_AVAILABLE = True
except ImportError:  # pragma: no cover
    _PANDAS_AVAILABLE = False


def from_dict(
@@ -60,8 +68,49 @@ def from_dict(
return DataFrame._from_dict(data=data, columns=columns) # type: ignore


def from_dicts(
dicts: Sequence[dict[str, Any]], infer_schema_length: int | None = 50
) -> DataFrame:
"""
Construct a DataFrame from a sequence of dictionaries.
Parameters
----------
dicts
Sequence with dictionaries mapping column name to value
infer_schema_length
How many dictionaries/rows to scan to determine the data types
if set to `None` all rows are scanned. This will be slow.
Returns
-------
DataFrame
Examples
--------
>>> data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]
>>> df = pl.from_dicts(data)
>>> df
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 5 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 6 │
└─────┴─────┘
"""
return DataFrame._from_dicts(dicts, infer_schema_length)
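Since `infer_schema_length` defaults to scanning only the first 50 dictionaries, a type that first appears later can be missed during inference. A hedged sketch of the trade-off (the exact failure mode for a late-arriving type may vary by version):

import polars as pl

rows = [{"a": 1}] * 100 + [{"a": 1.5}]  # a float first appears at row 101

# With the default infer_schema_length=50, the schema is inferred as integer
# before the float row is ever seen, which may raise or mis-type that value.
# Scanning all rows avoids this, at the cost of a slower pass over the data.
df = pl.from_dicts(rows, infer_schema_length=None)
print(df.dtypes)  # expected: a floating-point column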


def from_records(
data: np.ndarray | Sequence[Sequence[Any]],
data: Sequence[Sequence[Any]],
columns: Sequence[str] | None = None,
orient: str | None = None,
) -> DataFrame:
@@ -92,7 +141,7 @@ def from_records(
>>> data = [[1, 2, 3], [4, 5, 6]]
>>> df = pl.from_records(data, columns=["a", "b"])
>>> df
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
Expand All @@ -106,23 +155,38 @@ def from_records(
└─────┴─────┘
"""
return DataFrame._from_records(data, columns=columns, orient=orient)
if _NUMPY_AVAILABLE and isinstance(data, np.ndarray):
warnings.warn(
"using `from_records` with a numpy ndarray is deprecated, "
"use `from_numpy` instead",
DeprecationWarning,
)
return DataFrame._from_numpy(data, columns=columns, orient=orient)
else:
return DataFrame._from_records(data, columns=columns, orient=orient)
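The branch above keeps ndarray input working through `from_records` while steering callers to the new constructor. A sketch of observing the deprecation (assumes the caller has not filtered warnings):

import warnings

import numpy as np
import polars as pl

arr = np.array([[1, 2, 3], [4, 5, 6]])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df_old = pl.from_records(arr, columns=["a", "b"])  # deprecated spelling

assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# The replacement produces the same frame without the warning:
df_new = pl.from_numpy(arr, columns=["a", "b"])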


def from_dicts(
dicts: Sequence[dict[str, Any]],
infer_schema_length: int | None = 50,
def from_numpy(
data: np.ndarray,
columns: Sequence[str] | None = None,
orient: str | None = None,
) -> DataFrame:
"""
Construct a DataFrame from a sequence of dictionaries.
Construct a DataFrame from a numpy ndarray.
Note that this is slower than creating from columnar memory.
Parameters
----------
dicts
Sequence with dictionaries mapping column name to value
infer_schema_length
How many dictionaries/rows to scan to determine the data types
if set to `None` all rows are scanned. This will be slow.
data : numpy ndarray
Two-dimensional data represented as a numpy ndarray.
columns : Sequence of str, default None
Column labels to use for resulting DataFrame. Must match data dimensions.
If not specified, columns will be named `column_0`, `column_1`, etc.
orient : {'col', 'row'}, default None
Whether to interpret two-dimensional data as columns or as rows. If None,
the orientation is inferred by matching the columns and data dimensions. If
this does not yield conclusive results, column orientation is used.
Returns
-------
Expand All @@ -131,8 +195,9 @@ def from_dicts(
Examples
--------
>>> data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]
>>> df = pl.from_dicts(data)
>>> import numpy as np
>>> data = np.array([[1, 2, 3], [4, 5, 6]])
>>> df = pl.from_numpy(data, columns=["a", "b"], orient="col")
>>> df
shape: (3, 2)
┌─────┬─────┐
Expand All @@ -148,7 +213,9 @@ def from_dicts(
└─────┴─────┘
"""
return DataFrame._from_dicts(dicts, infer_schema_length)
if not _NUMPY_AVAILABLE:
raise ImportError("'numpy' is required when using from_numpy().")
return DataFrame._from_numpy(data, columns=columns, orient=orient)
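Per the docstring, a None `orient` is resolved by matching `len(columns)` against the array's dimensions, falling back to column orientation when ambiguous. A sketch of how the two explicit orientations transpose the result:

import numpy as np
import polars as pl

arr = np.array([[1, 2, 3], [4, 5, 6]])  # a 2 x 3 ndarray

# orient="row": each inner array becomes a row -> shape (2, 3)
rows = pl.from_numpy(arr, columns=["a", "b", "c"], orient="row")

# orient="col": each inner array becomes a column -> shape (3, 2)
cols = pl.from_numpy(arr, columns=["a", "b"], orient="col")

print(rows.shape, cols.shape)  # (2, 3) (3, 2)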


# Note that we cannot overload because pyarrow has no stubs :(
@@ -209,9 +276,7 @@ def from_arrow(
"""
if not _PYARROW_AVAILABLE:
raise ImportError(
"'pyarrow' is required when using from_arrow()."
) # pragma: no cover
raise ImportError("'pyarrow' is required when using from_arrow().")
if isinstance(a, pa.Table):
return DataFrame._from_arrow(a, rechunk=rechunk)
elif isinstance(a, (pa.Array, pa.ChunkedArray)):
@@ -296,14 +361,9 @@ def from_pandas(
"""
if not _PYARROW_AVAILABLE:
raise ImportError( # pragma: no cover
"'pyarrow' is required when using from_pandas()."
)

try:
import pandas as pd
except ImportError as e: # pragma: no cover
raise ImportError("'pandas' is required when using from_pandas().") from e
raise ImportError("'pyarrow' is required when using from_pandas().")
if not _PANDAS_AVAILABLE:
raise ImportError("'pandas' is required when using from_pandas().")

if isinstance(df, (pd.Series, pd.DatetimeIndex)):
return Series._from_pandas("", df, nan_to_none=nan_to_none)
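The isinstance dispatch above means `from_pandas` hands back a Series for `pd.Series`/`pd.DatetimeIndex` input and a DataFrame otherwise. A sketch (assumes pandas and pyarrow are installed, as the availability guards require):

import pandas as pd
import polars as pl

out_df = pl.from_pandas(pd.DataFrame({"a": [1, 2], "b": [3.0, None]}))
out_s = pl.from_pandas(pd.Series([1, 2, 3], name="a"))

print(type(out_df).__name__, type(out_s).__name__)  # DataFrame Series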
116 changes: 58 additions & 58 deletions py-polars/polars/internals/construction.py
@@ -446,6 +446,64 @@ def dict_to_pydf(
return PyDataFrame.read_dict(data)


def sequence_to_pydf(
data: Sequence[Any],
columns: ColumnsType | None = None,
orient: str | None = None,
) -> PyDataFrame:
"""
Construct a PyDataFrame from a sequence.
"""
data_series: list[PySeries]

if len(data) == 0:
return dict_to_pydf({}, columns=columns)

elif isinstance(data[0], pli.Series):
series_names = [s.name for s in data]
columns, dtypes = _unpack_columns(columns or series_names, n_expected=len(data))
data_series = []
for i, s in enumerate(data):
if not s.name: # TODO: Replace by `if s.name is None` once allowed
s.rename(columns[i], in_place=True)

new_dtype = dtypes.get(columns[i])
if new_dtype and new_dtype != s.dtype:
s = s.cast(new_dtype)

data_series.append(s.inner())

elif isinstance(data[0], dict):
pydf = PyDataFrame.read_dicts(data)
if columns:
pydf = _post_apply_columns(pydf, columns)
return pydf

elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
# Infer orientation
if orient is None and columns is not None:
orient = "col" if len(columns) == len(data) else "row"

if orient == "row":
pydf = PyDataFrame.read_rows(data)
if columns:
pydf = _post_apply_columns(pydf, columns)
return pydf
else:
columns, dtypes = _unpack_columns(columns, n_expected=len(data))
data_series = [
pli.Series(columns[i], data[i], dtypes.get(columns[i])).inner()
for i in range(len(data))
]

else:
columns, dtypes = _unpack_columns(columns, n_expected=1)
data_series = [pli.Series(columns[0], data, dtypes.get(columns[0])).inner()]

data_series = _handle_columns_arg(data_series, columns=columns)
return PyDataFrame(data_series)
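The inference branch above (`orient = "col" if len(columns) == len(data) else "row"`) only runs when `orient` is None and column names were supplied, so the same nested list can be read either way depending on how many names are passed. A hedged illustration via the public `pl.DataFrame` constructor, which routes sequence input through this helper:

import polars as pl

data = [[1, 2, 3], [4, 5, 6]]

# Two names for two inner lists: lengths match, data is read column-wise.
as_cols = pl.DataFrame(data, columns=["a", "b"])       # shape (3, 2)

# Three names for two inner lists: lengths differ, data is read row-wise.
as_rows = pl.DataFrame(data, columns=["a", "b", "c"])  # shape (2, 3)

print(as_cols.shape, as_rows.shape)  # (3, 2) (2, 3)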


def numpy_to_pydf(
data: np.ndarray,
columns: ColumnsType | None = None,
@@ -505,64 +563,6 @@ def numpy_to_pydf(
return PyDataFrame(data_series)


def sequence_to_pydf(
data: Sequence[Any],
columns: ColumnsType | None = None,
orient: str | None = None,
) -> PyDataFrame:
"""
Construct a PyDataFrame from a sequence.
"""
data_series: list[PySeries]

if len(data) == 0:
return dict_to_pydf({}, columns=columns)

elif isinstance(data[0], pli.Series):
series_names = [s.name for s in data]
columns, dtypes = _unpack_columns(columns or series_names, n_expected=len(data))
data_series = []
for i, s in enumerate(data):
if not s.name: # TODO: Replace by `if s.name is None` once allowed
s.rename(columns[i], in_place=True)

new_dtype = dtypes.get(columns[i])
if new_dtype and new_dtype != s.dtype:
s = s.cast(new_dtype)

data_series.append(s.inner())

elif isinstance(data[0], dict):
pydf = PyDataFrame.read_dicts(data)
if columns:
pydf = _post_apply_columns(pydf, columns)
return pydf

elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
# Infer orientation
if orient is None and columns is not None:
orient = "col" if len(columns) == len(data) else "row"

if orient == "row":
pydf = PyDataFrame.read_rows(data)
if columns:
pydf = _post_apply_columns(pydf, columns)
return pydf
else:
columns, dtypes = _unpack_columns(columns, n_expected=len(data))
data_series = [
pli.Series(columns[i], data[i], dtypes.get(columns[i])).inner()
for i in range(len(data))
]

else:
columns, dtypes = _unpack_columns(columns, n_expected=1)
data_series = [pli.Series(columns[0], data, dtypes.get(columns[0])).inner()]

data_series = _handle_columns_arg(data_series, columns=columns)
return PyDataFrame(data_series)


def arrow_to_pydf(
data: pa.Table, columns: ColumnsType | None = None, rechunk: bool = True
) -> PyDataFrame: