Specialized constructor functions
stinodego authored and ritchie46 committed Jul 23, 2021
1 parent 40c310a commit 4e0aaea
Showing 8 changed files with 629 additions and 276 deletions.
6 changes: 4 additions & 2 deletions py-polars/polars/__init__.py
@@ -6,7 +6,8 @@
from polars.lazy.expr import Expr, wrap_expr
from polars.lazy.frame import LazyFrame, wrap_ldf

from . import datatypes, eager, functions, io, lazy, string_cache
from . import convert, datatypes, eager, functions, io, lazy, string_cache
from .convert import *
from .datatypes import *
from .eager import *
from .functions import *
@@ -23,7 +24,8 @@
pass

__all__ = (
datatypes.__all__
convert.__all__
+ datatypes.__all__
+ eager.__all__
+ functions.__all__
+ io.__all__
248 changes: 248 additions & 0 deletions py-polars/polars/convert.py
@@ -0,0 +1,248 @@
from typing import TYPE_CHECKING, Any, Dict, Optional, Sequence, Union

import numpy as np
import pyarrow as pa
import pyarrow.compute

import polars as pl

if TYPE_CHECKING:
import pandas as pd

__all__ = [
"from_dict",
"from_records",
"from_arrow",
"from_pandas",
"from_rows", # deprecated
"from_arrow_table", # deprecated
]


def from_dict(
data: Dict[str, Sequence[Any]],
columns: Optional[Sequence[str]] = None,
nullable: bool = True,
) -> "pl.DataFrame":
"""
Construct a DataFrame from a dictionary of sequences.

Parameters
----------
data : dict of sequences
Two-dimensional data represented as a dictionary. dict must contain
Sequences.
columns : Sequence of str, default None
Column labels to use for resulting DataFrame. If specified, overrides any
labels already present in the data. Must match data dimensions.
nullable : bool, default True
If your data does not contain null values, set to False to speed up
DataFrame creation.

Returns
-------
DataFrame

Examples
--------
```python
>>> data = {'a': [1, 2], 'b': [3, 4]}
>>> df = pl.from_dict(data)
>>> df
shape: (2, 2)
╭─────┬─────╮
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 4 │
╰─────┴─────╯
```
"""
return pl.DataFrame._from_dict(data=data, columns=columns, nullable=nullable)


def from_records(
data: Union[np.ndarray, Sequence[Sequence[Any]]],
columns: Optional[Sequence[str]] = None,
orient: Optional[str] = None,
nullable: bool = True,
) -> "pl.DataFrame":
"""
Construct a DataFrame from a numpy ndarray or sequence of sequences.

Parameters
----------
data : numpy ndarray or Sequence of sequences
Two-dimensional data represented as numpy ndarray or sequence of sequences.
columns : Sequence of str, default None
Column labels to use for resulting DataFrame. Must match data dimensions.
If not specified, columns will be named `column_0`, `column_1`, etc.
orient : {'col', 'row'}, default None
Whether to interpret two-dimensional data as columns or as rows. If None,
the orientation is inferred by matching the columns and data dimensions. If
this does not yield conclusive results, column orientation is used.
nullable : bool, default True
If your data does not contain null values, set to False to speed up
DataFrame creation.

Returns
-------
DataFrame

Examples
--------
```python
>>> data = [[1, 2, 3], [4, 5, 6]]
>>> df = pl.from_records(data, columns=['a', 'b'])
>>> df
shape: (3, 2)
╭─────┬─────╮
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 5 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 6 │
╰─────┴─────╯
```
"""
return pl.DataFrame._from_records(
data, columns=columns, orient=orient, nullable=nullable
)


def from_arrow(
a: Union[pa.Table, pa.Array], rechunk: bool = True
) -> Union["pl.DataFrame", "pl.Series"]:
"""
Create a DataFrame from an Arrow Table, or a Series from an Arrow Array.

Parameters
----------
a
Arrow Table or Array.
rechunk
Make sure that all data is contiguous.
"""
if isinstance(a, pa.Table):
return pl.DataFrame.from_arrow(a, rechunk)
elif isinstance(a, pa.Array):
return pl.Series.from_arrow("", a)
else:
raise ValueError(f"expected arrow table / array, got {a}")
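
A minimal usage sketch of `from_arrow` as added above; the table and array literals are purely illustrative:

```python
>>> import pyarrow as pa
>>> import polars as pl
>>> table = pa.table({"a": [1, 2], "b": [3.0, 4.0]})
>>> pl.from_arrow(table)                 # pa.Table -> pl.DataFrame
>>> pl.from_arrow(pa.array([1, 2, 3]))   # pa.Array -> pl.Series
```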


def _from_pandas_helper(a: Union["pd.Series", "pd.DatetimeIndex"]) -> pa.Array:
dtype = a.dtype
if dtype == "datetime64[ns]":
# We first cast to ms because that's the unit of Date64.
# Then we cast via int64 to date64; casting directly to Date64 leads to
# loss of time information: https://github.com/ritchie46/polars/issues/476
arr = pa.array(np.array(a.values, dtype="datetime64[ms]"))
arr = pa.compute.cast(arr, pa.int64())
return pa.compute.cast(arr, pa.date64())
elif dtype == "object" and isinstance(a.iloc[0], str):
return pa.array(a, pa.large_utf8())
else:
return pa.array(a)


def from_pandas(
df: Union["pd.DataFrame", "pd.Series", "pd.DatetimeIndex"],
rechunk: bool = True,
) -> Union["pl.Series", "pl.DataFrame"]:
"""
Convert a pandas DataFrame, Series, or DatetimeIndex to a polars DataFrame or Series.

Parameters
----------
df
DataFrame to convert.
rechunk
Make sure that all data is contiguous.

Returns
-------
A polars DataFrame or Series
"""
try:
import pandas as pd
except ImportError as e:
raise ImportError("from_pandas requires pandas to be installed.") from e

if isinstance(df, (pd.Series, pd.DatetimeIndex)):
return from_arrow(_from_pandas_helper(df))

# Note: we first tried to infer the schema via pyarrow and then modify the schema if
# needed. However, arrow 3.0 determines the type of a string like this:
# pa.array(array).type
# which needlessly allocates and fails when the string is too large for the string dtype.

data = {}

for name in df.columns:
s = df[name]
data[name] = _from_pandas_helper(s)

table = pa.table(data)
return from_arrow(table, rechunk)
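
A minimal usage sketch for `from_pandas`; the pandas frame below is made up and simply exercises the string and datetime64[ns] paths handled by `_from_pandas_helper`:

```python
>>> import pandas as pd
>>> import polars as pl
>>> pd_df = pd.DataFrame({
...     "id": [1, 2],
...     "name": ["foo", "bar"],
...     "ts": pd.to_datetime(["2021-01-01", "2021-01-02"]),
... })
>>> pl.from_pandas(pd_df)          # pd.DataFrame -> pl.DataFrame
>>> pl.from_pandas(pd_df["name"])  # pd.Series -> pl.Series (via from_arrow)
```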


def from_rows(
rows: Sequence[Sequence[Any]],
column_names: Optional[Sequence[str]] = None,
column_name_mapping: Optional[Dict[int, str]] = None,
) -> "pl.DataFrame":
"""
.. deprecated:: 0.8.13
`from_rows` will be removed in Polars 0.9.0; it is replaced by
`from_records` because the latter offers more versatility. To keep the same
functionality, call `from_records` with `orient='row'`.

Create a DataFrame from rows. This should only be used as a last resort, as this is
more expensive than creating from columnar data.

Parameters
----------
rows
rows.
column_names
column names to use for the DataFrame.
column_name_mapping
map column index to a new name:
Example:
```python
column_name_mapping = {0: "first_column", 3: "fourth column"}
```
"""
return pl.DataFrame.from_rows(rows, column_names, column_name_mapping)
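
A sketch of the migration suggested in the deprecation note above; the row data is illustrative:

```python
>>> rows = [(1, "foo"), (2, "bar")]
>>> pl.from_rows(rows, column_names=["id", "name"])              # deprecated
>>> pl.from_records(rows, columns=["id", "name"], orient="row")  # replacement
```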


def from_arrow_table(table: pa.Table, rechunk: bool = True) -> "pl.DataFrame":
"""
.. deprecated:: 7.3
Use `from_arrow` instead.

Create a DataFrame from an Arrow Table.

Parameters
----------
table
Arrow Table.
rechunk
Make sure that all data is contiguous.
"""
import warnings

warnings.warn(
"from_arrow_table is deprecated, use DataFrame.from_arrow instead.",
DeprecationWarning,
stacklevel=2,
)
return pl.DataFrame.from_arrow(table, rechunk)
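
Likewise, the deprecation note above points to `from_arrow` as the replacement; an illustrative call:

```python
>>> table = pa.table({"a": [1, 2]})
>>> pl.from_arrow(table)   # instead of pl.from_arrow_table(table)
```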
