Skip to content

Commit

Permalink
feat(python): improve Series/DataFrame init from existing Series/DataFrame
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Dec 31, 2023
1 parent cc72524 commit c2c827d
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 15 deletions.
7 changes: 7 additions & 0 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
_post_apply_columns,
arrow_to_pydf,
dict_to_pydf,
frame_to_pydf,
iterable_to_pydf,
numpy_to_idxs,
numpy_to_pydf,
Expand Down Expand Up @@ -381,6 +382,7 @@ def __init__(
orient=orient,
infer_schema_length=infer_schema_length,
)

elif isinstance(data, pl.Series):
self._df = series_to_pydf(
data, schema=schema, schema_overrides=schema_overrides
Expand Down Expand Up @@ -413,6 +415,11 @@ def __init__(
orient=orient,
infer_schema_length=infer_schema_length,
)

elif isinstance(data, pl.DataFrame):
self._df = frame_to_pydf(
data, schema=schema, schema_overrides=schema_overrides
)
else:
raise TypeError(
f"DataFrame constructor called with unsupported type {type(data).__name__!r}"
Expand Down
23 changes: 20 additions & 3 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,9 +264,12 @@ def __init__(
)

# Handle case where values are passed as the first argument
original_name: str | None = None
if name is None:
name = ""
elif not isinstance(name, str):
elif isinstance(name, str):
original_name = name
else:
if values is None:
values = name
name = ""
Expand All @@ -277,12 +280,14 @@ def __init__(
self._s = sequence_to_pyseries(
name, [], dtype=dtype, dtype_if_empty=dtype_if_empty
)
elif isinstance(values, Series):
self._s = series_to_pyseries(name, values)

elif isinstance(values, range):
self._s = range_to_series(name, values, dtype=dtype)._s

elif isinstance(values, Series):
name = values.name if original_name is None else name
self._s = series_to_pyseries(name, values, dtype=dtype, strict=strict)

elif isinstance(values, Sequence):
self._s = sequence_to_pyseries(
name,
Expand All @@ -292,6 +297,7 @@ def __init__(
dtype_if_empty=dtype_if_empty,
nan_to_null=nan_to_null,
)

elif _check_for_numpy(values) and isinstance(values, np.ndarray):
self._s = numpy_to_pyseries(
name, values, strict=strict, nan_to_null=nan_to_null
Expand Down Expand Up @@ -328,6 +334,17 @@ def __init__(
dtype_if_empty=dtype_if_empty,
strict=strict,
)

elif isinstance(values, pl.DataFrame):
to_struct = len(values.columns) > 1
name = (
values.columns[0] if (original_name is None and not to_struct) else name
)
s = values.to_struct(name) if to_struct else values.to_series().rename(name)
if dtype is not None and dtype != s.dtype:
s = s.cast(dtype)
self._s = s._s

else:
raise TypeError(
f"Series constructor called with unsupported type {type(values).__name__!r}"
Expand Down
36 changes: 35 additions & 1 deletion py-polars/polars/utils/_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,17 @@ def nt_unpack(obj: Any) -> Any:
################################


def series_to_pyseries(name: str, values: Series) -> PySeries:
def series_to_pyseries(
    name: str,
    values: Series,
    *,
    dtype: PolarsDataType | None = None,
    strict: bool = True,
) -> PySeries:
    """
    Build a new PySeries from an existing Polars Series.

    The underlying PySeries of `values` is cloned, so the returned object is
    not the one held by `values`. When `dtype` is given and differs from the
    clone's dtype, the clone is cast to it (forwarding `strict` to the cast).
    The result is then renamed to `name` before being returned.
    """
    result = values._s.clone()

    # Only pay for a cast when a dtype was requested and it is not already met.
    needs_cast = dtype is not None and dtype != result.dtype()
    if needs_cast:
        result = result.cast(dtype, strict=strict)

    # The return value of `rename` is not used; the renamed object is `result`.
    result.rename(name)
    return result

Expand Down Expand Up @@ -1603,6 +1611,9 @@ def series_to_pydf(
schema_overrides: SchemaDict | None = None,
) -> PyDataFrame:
"""Construct a PyDataFrame from a Polars Series."""
if schema is None and schema_overrides is None:
return PyDataFrame([data._s])

data_series = [data._s]
series_name = [s.name() for s in data_series]
column_names, schema_overrides = _unpack_schema(
Expand All @@ -1617,6 +1628,29 @@ def series_to_pydf(
return PyDataFrame(data_series)


def frame_to_pydf(
    data: DataFrame,
    schema: SchemaDefinition | None = None,
    schema_overrides: SchemaDict | None = None,
) -> PyDataFrame:
    """
    Build a PyDataFrame from an already-constructed Polars DataFrame.

    With neither `schema` nor `schema_overrides` supplied, a clone of the
    frame's underlying PyDataFrame is returned directly. Otherwise the columns
    are collected, any column whose override dtype differs from its current
    dtype is cast (strictly), and the resulting columns are passed through
    `_handle_columns_arg` together with the unpacked column names.
    """
    # Fast path: nothing to rename or cast.
    if schema is None and schema_overrides is None:
        return data._df.clone()

    py_columns = {col.name: col._s for col in data}
    target_names, dtype_overrides = _unpack_schema(
        schema or data.columns, schema_overrides=schema_overrides
    )

    if dtype_overrides:
        current_dtypes = data.schema
        # NOTE(review): overrides are looked up by key in the *existing* frame
        # schema and column mapping; confirm `_unpack_schema` never returns
        # keys that are absent from `data` (e.g. new names from `schema`).
        for col_name, target_dtype in dtype_overrides.items():
            if target_dtype != current_dtypes[col_name]:
                py_columns[col_name] = py_columns[col_name].cast(
                    target_dtype, strict=True
                )

    return PyDataFrame(
        _handle_columns_arg(list(py_columns.values()), columns=target_names)
    )


def iterable_to_pydf(
data: Iterable[Any],
schema: SchemaDefinition | None = None,
Expand Down
16 changes: 10 additions & 6 deletions py-polars/tests/unit/datatypes/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import polars as pl
import polars.selectors as cs
from polars.testing import assert_frame_equal
from polars.testing import assert_frame_equal, assert_series_equal

if TYPE_CHECKING:
from polars.datatypes import PolarsDataType
Expand Down Expand Up @@ -165,12 +165,16 @@ def test_struct_function_expansion() -> None:
{"a": [1, 2, 3, 4], "b": ["one", "two", "three", "four"], "c": [9, 8, 7, 6]}
)
struct_schema = {"a": pl.UInt32, "b": pl.String}
s = df.with_columns(pl.struct(pl.col(["a", "b"]), schema=struct_schema))["a"]
dfs = df.with_columns(pl.struct(pl.col(["a", "b"]), schema=struct_schema))
s = dfs["a"]

assert isinstance(s, pl.Series)
assert s.struct.fields == ["a", "b"]
assert pl.Struct(struct_schema) == s.to_frame().schema["a"]

assert_series_equal(s, pl.Series(dfs.select("a")))
assert_frame_equal(dfs, pl.DataFrame(dfs))


def test_nested_struct() -> None:
df = pl.DataFrame({"d": [1, 2, 3], "e": ["foo", "bar", "biz"]})
Expand All @@ -187,11 +191,11 @@ def test_nested_struct() -> None:


def test_struct_to_pandas() -> None:
df = pd.DataFrame([{"a": {"b": {"c": 2}}}])
pl_df = pl.from_pandas(df)
pdf = pd.DataFrame([{"a": {"b": {"c": 2}}}])
df = pl.from_pandas(pdf)

assert isinstance(pl_df.dtypes[0], pl.datatypes.Struct)
assert pl_df.to_pandas().equals(df)
assert isinstance(df.dtypes[0], pl.datatypes.Struct)
assert df.to_pandas().equals(pdf)


def test_struct_logical_types_to_pandas() -> None:
Expand Down
40 changes: 35 additions & 5 deletions py-polars/tests/unit/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,28 @@ def test_init_arrow() -> None:
pl.DataFrame(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}), schema=["c", "d", "e"])


def test_init_from_frame() -> None:
    """Check DataFrame/Series construction from an existing DataFrame."""
    renamed_cols = ["a", "b", "c"]

    base = pl.DataFrame({"id": [0, 1], "misc": ["a", "b"], "val": [-10, 10]})
    renamed = pl.DataFrame(base, schema=renamed_cols)
    recast = pl.DataFrame(
        base, schema=renamed_cols, schema_overrides={"val": pl.Int8}
    )

    # Re-initialising from a frame must reproduce that frame.
    for frame in (base, renamed, recast):
        assert_frame_equal(frame, pl.DataFrame(frame))

    # Column names follow `schema`; dtypes follow `schema_overrides`.
    expected_schemas = (
        (base, {"id": pl.Int64, "misc": pl.String, "val": pl.Int64}),
        (renamed, {"a": pl.Int64, "b": pl.String, "c": pl.Int64}),
        (recast, {"a": pl.Int64, "b": pl.String, "c": pl.Int8}),
    )
    for frame, expected in expected_schemas:
        assert frame.schema == expected

    # The row data is the same in all three frames.
    assert base.rows() == renamed.rows() == recast.rows()

    # Series built from a multi-column frame: explicit name kept, else empty.
    named = pl.Series("s", recast)
    unnamed = pl.Series(recast)

    assert named.name == "s"
    assert unnamed.name == ""


def test_init_series() -> None:
# List of Series
df = pl.DataFrame([pl.Series("a", [1, 2, 3]), pl.Series("b", [4, 5, 6])])
Expand All @@ -730,9 +752,9 @@ def test_init_series() -> None:

# List of unnamed Series
df = pl.DataFrame([pl.Series([1, 2, 3]), pl.Series([4, 5, 6])])
expected = pl.DataFrame(
[pl.Series("column_0", [1, 2, 3]), pl.Series("column_1", [4, 5, 6])]
)
col0 = pl.Series("column_0", [1, 2, 3])
col1 = pl.Series("column_1", [4, 5, 6])
expected = pl.DataFrame([col0, col1])
assert_frame_equal(df, expected)

df = pl.DataFrame([pl.Series([0.0]), pl.Series([1.0])])
Expand Down Expand Up @@ -763,8 +785,16 @@ def test_init_series() -> None:
s2 = pl.Series([[[2, 2]]], dtype=pl.List(pl.List(pl.UInt8)))
assert s2.dtype == pl.List(pl.List(pl.UInt8))

s3 = pl.Series(dtype=pl.List(pl.List(pl.UInt8)))
assert s3.dtype == pl.List(pl.List(pl.UInt8))
nested_dtype = pl.List(pl.List(pl.UInt8))
s3 = pl.Series("x", dtype=nested_dtype)
s4 = pl.Series(s3)
for s in (s3, s4):
assert s.dtype == nested_dtype
assert s.to_list() == []
assert s.name == "x"

s5 = pl.Series("", df, dtype=pl.Int8)
assert_series_equal(s5, pl.Series("", [1, 2, 3], dtype=pl.Int8))


def test_init_seq_of_seq() -> None:
Expand Down

0 comments on commit c2c827d

Please sign in to comment.