Commit 9c319c9: Add from_numpy constructor (#3944)

stinodego committed Jul 8, 2022
1 parent f663838 commit 9c319c9
Showing 5 changed files with 217 additions and 108 deletions.
10 changes: 9 additions & 1 deletion py-polars/polars/__init__.py
@@ -16,7 +16,14 @@ def version() -> str:
Config,
toggle_string_cache,
)
from polars.convert import from_arrow, from_dict, from_dicts, from_pandas, from_records
from polars.convert import (
from_arrow,
from_dict,
from_dicts,
from_numpy,
from_pandas,
from_records,
)
from polars.datatypes import (
Boolean,
Categorical,
@@ -231,6 +238,7 @@ def version() -> str:
"from_dict",
"from_dicts",
"from_records",
"from_numpy",
"from_arrow",
"from_pandas",
# testing
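With this change, `from_numpy` is re-exported from the package root alongside the other `from_*` constructors. A minimal sketch of the new entry point (assumes a polars build containing this commit, plus numpy installed):

import numpy as np
import polars as pl

# from_numpy is now importable straight from the top-level namespace,
# just like from_dict, from_dicts, from_records, from_arrow, and from_pandas.
df = pl.from_numpy(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b"], orient="col")
print(df.shape)  # (3, 2): with orient="col", each inner array becomes a column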
130 changes: 95 additions & 35 deletions py-polars/polars/convert.py
@@ -1,22 +1,30 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Mapping, Sequence, overload
import warnings
from typing import Any, Mapping, Sequence, overload

from polars.internals import DataFrame, Series

if TYPE_CHECKING:  # pragma: no cover
    import numpy as np
    import pandas as pd
    import pyarrow as pa

    _PYARROW_AVAILABLE = True
else:
    try:
        import pyarrow as pa

        _PYARROW_AVAILABLE = True
    except ImportError:  # pragma: no cover
        _PYARROW_AVAILABLE = False

try:
    import numpy as np

    _NUMPY_AVAILABLE = True
except ImportError:  # pragma: no cover
    _NUMPY_AVAILABLE = False

try:
    import pyarrow as pa

    _PYARROW_AVAILABLE = True
except ImportError:  # pragma: no cover
    _PYARROW_AVAILABLE = False

try:
    import pandas as pd

    _PANDAS_AVAILABLE = True
except ImportError:  # pragma: no cover
    _PANDAS_AVAILABLE = False


def from_dict(
@@ -60,8 +68,49 @@ def from_dict(
return DataFrame._from_dict(data=data, columns=columns) # type: ignore


def from_dicts(
dicts: Sequence[dict[str, Any]], infer_schema_length: int | None = 50
) -> DataFrame:
"""
Construct a DataFrame from a sequence of dictionaries.
Parameters
----------
dicts
Sequence with dictionaries mapping column name to value
infer_schema_length
How many dictionaries/rows to scan to determine the data types
if set to `None` all rows are scanned. This will be slow.
Returns
-------
DataFrame
Examples
--------
>>> data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]
>>> df = pl.from_dicts(data)
>>> df
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 5 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 6 │
└─────┴─────┘
"""
return DataFrame._from_dicts(dicts, infer_schema_length)
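Since `infer_schema_length` defaults to scanning only the first 50 dictionaries, a type that first appears later can be missed during inference. A hedged sketch of the trade-off (the exact failure mode for a late-arriving type may vary by version):

import polars as pl

rows = [{"a": 1}] * 100 + [{"a": 1.5}]  # a float first appears at row 101

# With the default infer_schema_length=50, the schema is inferred as integer
# before the float row is ever seen, which may raise or mis-type that value.
# Scanning all rows avoids this, at the cost of a slower pass over the data.
df = pl.from_dicts(rows, infer_schema_length=None)
print(df.dtypes)  # expected: a floating-point column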


def from_records(
data: np.ndarray | Sequence[Sequence[Any]],
data: Sequence[Sequence[Any]],
columns: Sequence[str] | None = None,
orient: str | None = None,
) -> DataFrame:
@@ -92,7 +141,7 @@ def from_records(
>>> data = [[1, 2, 3], [4, 5, 6]]
>>> df = pl.from_records(data, columns=["a", "b"])
>>> df
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
Expand All @@ -106,23 +155,38 @@ def from_records(
└─────┴─────┘
"""
return DataFrame._from_records(data, columns=columns, orient=orient)
if _NUMPY_AVAILABLE and isinstance(data, np.ndarray):
warnings.warn(
"using `from_records` with a numpy ndarray is deprecated, "
"use `from_numpy` instead",
DeprecationWarning,
)
return DataFrame._from_numpy(data, columns=columns, orient=orient)
else:
return DataFrame._from_records(data, columns=columns, orient=orient)
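The branch above keeps ndarray input working through `from_records` while steering callers to the new constructor. A sketch of observing the deprecation (assumes the caller has not filtered warnings):

import warnings

import numpy as np
import polars as pl

arr = np.array([[1, 2, 3], [4, 5, 6]])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df_old = pl.from_records(arr, columns=["a", "b"])  # deprecated spelling

assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# The replacement produces the same frame without the warning:
df_new = pl.from_numpy(arr, columns=["a", "b"])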


def from_dicts(
dicts: Sequence[dict[str, Any]],
infer_schema_length: int | None = 50,
def from_numpy(
data: np.ndarray,
columns: Sequence[str] | None = None,
orient: str | None = None,
) -> DataFrame:
"""
Construct a DataFrame from a sequence of dictionaries.
Construct a DataFrame from a numpy ndarray.
Note that this is slower than creating from columnar memory.
Parameters
----------
dicts
Sequence with dictionaries mapping column name to value
infer_schema_length
How many dictionaries/rows to scan to determine the data types
if set to `None` all rows are scanned. This will be slow.
data : numpy ndarray
Two-dimensional data represented as a numpy ndarray.
columns : Sequence of str, default None
Column labels to use for resulting DataFrame. Must match data dimensions.
If not specified, columns will be named `column_0`, `column_1`, etc.
orient : {'col', 'row'}, default None
Whether to interpret two-dimensional data as columns or as rows. If None,
the orientation is inferred by matching the columns and data dimensions. If
this does not yield conclusive results, column orientation is used.
Returns
-------
Expand All @@ -131,8 +195,9 @@ def from_dicts(
Examples
--------
>>> data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]
>>> df = pl.from_dicts(data)
>>> import numpy as np
>>> data = np.array([[1, 2, 3], [4, 5, 6]])
>>> df = pl.from_numpy(data, columns=["a", "b"], orient="col")
>>> df
shape: (3, 2)
┌─────┬─────┐
Expand All @@ -148,7 +213,9 @@ def from_dicts(
└─────┴─────┘
"""
return DataFrame._from_dicts(dicts, infer_schema_length)
if not _NUMPY_AVAILABLE:
raise ImportError("'numpy' is required when using from_numpy().")
return DataFrame._from_numpy(data, columns=columns, orient=orient)
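Per the docstring, a None `orient` is resolved by matching `len(columns)` against the array's dimensions, falling back to column orientation when ambiguous. A sketch of how the two explicit orientations transpose the result:

import numpy as np
import polars as pl

arr = np.array([[1, 2, 3], [4, 5, 6]])  # a 2 x 3 ndarray

# orient="row": each inner array becomes a row -> shape (2, 3)
rows = pl.from_numpy(arr, columns=["a", "b", "c"], orient="row")

# orient="col": each inner array becomes a column -> shape (3, 2)
cols = pl.from_numpy(arr, columns=["a", "b"], orient="col")

print(rows.shape, cols.shape)  # (2, 3) (3, 2)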


# Note that we cannot overload because pyarrow has no stubs :(
@@ -209,9 +276,7 @@ def from_arrow(
"""
if not _PYARROW_AVAILABLE:
raise ImportError(
"'pyarrow' is required when using from_arrow()."
) # pragma: no cover
raise ImportError("'pyarrow' is required when using from_arrow().")
if isinstance(a, pa.Table):
return DataFrame._from_arrow(a, rechunk=rechunk)
elif isinstance(a, (pa.Array, pa.ChunkedArray)):
@@ -296,14 +361,9 @@ def from_pandas(
"""
if not _PYARROW_AVAILABLE:
raise ImportError( # pragma: no cover
"'pyarrow' is required when using from_pandas()."
)

try:
import pandas as pd
except ImportError as e: # pragma: no cover
raise ImportError("'pandas' is required when using from_pandas().") from e
raise ImportError("'pyarrow' is required when using from_pandas().")
if not _PANDAS_AVAILABLE:
raise ImportError("'pandas' is required when using from_pandas().")

if isinstance(df, (pd.Series, pd.DatetimeIndex)):
return Series._from_pandas("", df, nan_to_none=nan_to_none)
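The isinstance dispatch above means `from_pandas` hands back a Series for `pd.Series`/`pd.DatetimeIndex` input and a DataFrame otherwise. A sketch (assumes pandas and pyarrow are installed, as the availability guards require):

import pandas as pd
import polars as pl

out_df = pl.from_pandas(pd.DataFrame({"a": [1, 2], "b": [3.0, None]}))
out_s = pl.from_pandas(pd.Series([1, 2, 3], name="a"))

print(type(out_df).__name__, type(out_s).__name__)  # DataFrame Series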
116 changes: 58 additions & 58 deletions py-polars/polars/internals/construction.py
@@ -446,6 +446,64 @@ def dict_to_pydf(
return PyDataFrame.read_dict(data)


def sequence_to_pydf(
data: Sequence[Any],
columns: ColumnsType | None = None,
orient: str | None = None,
) -> PyDataFrame:
"""
Construct a PyDataFrame from a sequence.
"""
data_series: list[PySeries]

if len(data) == 0:
return dict_to_pydf({}, columns=columns)

elif isinstance(data[0], pli.Series):
series_names = [s.name for s in data]
columns, dtypes = _unpack_columns(columns or series_names, n_expected=len(data))
data_series = []
for i, s in enumerate(data):
if not s.name: # TODO: Replace by `if s.name is None` once allowed
s.rename(columns[i], in_place=True)

new_dtype = dtypes.get(columns[i])
if new_dtype and new_dtype != s.dtype:
s = s.cast(new_dtype)

data_series.append(s.inner())

elif isinstance(data[0], dict):
pydf = PyDataFrame.read_dicts(data)
if columns:
pydf = _post_apply_columns(pydf, columns)
return pydf

elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
# Infer orientation
if orient is None and columns is not None:
orient = "col" if len(columns) == len(data) else "row"

if orient == "row":
pydf = PyDataFrame.read_rows(data)
if columns:
pydf = _post_apply_columns(pydf, columns)
return pydf
else:
columns, dtypes = _unpack_columns(columns, n_expected=len(data))
data_series = [
pli.Series(columns[i], data[i], dtypes.get(columns[i])).inner()
for i in range(len(data))
]

else:
columns, dtypes = _unpack_columns(columns, n_expected=1)
data_series = [pli.Series(columns[0], data, dtypes.get(columns[0])).inner()]

data_series = _handle_columns_arg(data_series, columns=columns)
return PyDataFrame(data_series)
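The inference branch above (`orient = "col" if len(columns) == len(data) else "row"`) only runs when `orient` is None and column names were supplied, so the same nested list can be read either way depending on how many names are passed. A hedged illustration via the public `pl.DataFrame` constructor, which routes sequence input through this helper:

import polars as pl

data = [[1, 2, 3], [4, 5, 6]]

# Two names for two inner lists: lengths match, data is read column-wise.
as_cols = pl.DataFrame(data, columns=["a", "b"])       # shape (3, 2)

# Three names for two inner lists: lengths differ, data is read row-wise.
as_rows = pl.DataFrame(data, columns=["a", "b", "c"])  # shape (2, 3)

print(as_cols.shape, as_rows.shape)  # (3, 2) (2, 3)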


def numpy_to_pydf(
data: np.ndarray,
columns: ColumnsType | None = None,
@@ -505,64 +563,6 @@ def numpy_to_pydf(
return PyDataFrame(data_series)


def sequence_to_pydf(
data: Sequence[Any],
columns: ColumnsType | None = None,
orient: str | None = None,
) -> PyDataFrame:
"""
Construct a PyDataFrame from a sequence.
"""
data_series: list[PySeries]

if len(data) == 0:
return dict_to_pydf({}, columns=columns)

elif isinstance(data[0], pli.Series):
series_names = [s.name for s in data]
columns, dtypes = _unpack_columns(columns or series_names, n_expected=len(data))
data_series = []
for i, s in enumerate(data):
if not s.name: # TODO: Replace by `if s.name is None` once allowed
s.rename(columns[i], in_place=True)

new_dtype = dtypes.get(columns[i])
if new_dtype and new_dtype != s.dtype:
s = s.cast(new_dtype)

data_series.append(s.inner())

elif isinstance(data[0], dict):
pydf = PyDataFrame.read_dicts(data)
if columns:
pydf = _post_apply_columns(pydf, columns)
return pydf

elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
# Infer orientation
if orient is None and columns is not None:
orient = "col" if len(columns) == len(data) else "row"

if orient == "row":
pydf = PyDataFrame.read_rows(data)
if columns:
pydf = _post_apply_columns(pydf, columns)
return pydf
else:
columns, dtypes = _unpack_columns(columns, n_expected=len(data))
data_series = [
pli.Series(columns[i], data[i], dtypes.get(columns[i])).inner()
for i in range(len(data))
]

else:
columns, dtypes = _unpack_columns(columns, n_expected=1)
data_series = [pli.Series(columns[0], data, dtypes.get(columns[0])).inner()]

data_series = _handle_columns_arg(data_series, columns=columns)
return PyDataFrame(data_series)


def arrow_to_pydf(
data: pa.Table, columns: ColumnsType | None = None, rechunk: bool = True
) -> PyDataFrame: