From c2c827d3967a488966d2a346fd84910f40cdd5a0 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Sun, 31 Dec 2023 18:08:40 +0400 Subject: [PATCH 1/2] feat(python): improve Series/DataFrame init from existing Series/DataFrame --- py-polars/polars/dataframe/frame.py | 7 ++++ py-polars/polars/series/series.py | 23 +++++++++-- py-polars/polars/utils/_construction.py | 36 ++++++++++++++++- py-polars/tests/unit/datatypes/test_struct.py | 16 +++++--- py-polars/tests/unit/test_constructors.py | 40 ++++++++++++++++--- 5 files changed, 107 insertions(+), 15 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 987e12d3eae6..a2592a3a81af 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -83,6 +83,7 @@ _post_apply_columns, arrow_to_pydf, dict_to_pydf, + frame_to_pydf, iterable_to_pydf, numpy_to_idxs, numpy_to_pydf, @@ -381,6 +382,7 @@ def __init__( orient=orient, infer_schema_length=infer_schema_length, ) + elif isinstance(data, pl.Series): self._df = series_to_pydf( data, schema=schema, schema_overrides=schema_overrides @@ -413,6 +415,11 @@ def __init__( orient=orient, infer_schema_length=infer_schema_length, ) + + elif isinstance(data, pl.DataFrame): + self._df = frame_to_pydf( + data, schema=schema, schema_overrides=schema_overrides + ) else: raise TypeError( f"DataFrame constructor called with unsupported type {type(data).__name__!r}" diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index e45df2247a2b..1431e384cfb7 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -264,9 +264,12 @@ def __init__( ) # Handle case where values are passed as the first argument + original_name: str | None = None if name is None: name = "" - elif not isinstance(name, str): + elif isinstance(name, str): + original_name = name + else: if values is None: values = name name = "" @@ -277,12 +280,14 @@ def __init__( self._s = sequence_to_pyseries( name, [], dtype=dtype, dtype_if_empty=dtype_if_empty ) - elif isinstance(values, Series): - self._s = series_to_pyseries(name, values) elif isinstance(values, range): self._s = range_to_series(name, values, dtype=dtype)._s + elif isinstance(values, Series): + name = values.name if original_name is None else name + self._s = series_to_pyseries(name, values, dtype=dtype, strict=strict) + elif isinstance(values, Sequence): self._s = sequence_to_pyseries( name, @@ -292,6 +297,7 @@ def __init__( dtype_if_empty=dtype_if_empty, nan_to_null=nan_to_null, ) + elif _check_for_numpy(values) and isinstance(values, np.ndarray): self._s = numpy_to_pyseries( name, values, strict=strict, nan_to_null=nan_to_null @@ -328,6 +334,17 @@ def __init__( dtype_if_empty=dtype_if_empty, strict=strict, ) + + elif isinstance(values, pl.DataFrame): + to_struct = len(values.columns) > 1 + name = ( + values.columns[0] if (original_name is None and not to_struct) else name + ) + s = values.to_struct(name) if to_struct else values.to_series().rename(name) + if dtype is not None and dtype != s.dtype: + s = s.cast(dtype) + self._s = s._s + else: raise TypeError( f"Series constructor called with unsupported type {type(values).__name__!r}" diff --git a/py-polars/polars/utils/_construction.py b/py-polars/polars/utils/_construction.py index 8cd2a214d6f4..3e3304bd18da 100644 --- a/py-polars/polars/utils/_construction.py +++ b/py-polars/polars/utils/_construction.py @@ -164,9 +164,17 @@ def nt_unpack(obj: Any) -> Any: ################################ -def series_to_pyseries(name: str, values: Series) -> PySeries: +def series_to_pyseries( + name: str, + values: Series, + *, + dtype: PolarsDataType | None = None, + strict: bool = True, +) -> PySeries: """Construct a new PySeries from a Polars Series.""" py_s = values._s.clone() + if dtype is not None and dtype != py_s.dtype(): + py_s = py_s.cast(dtype, strict=strict) py_s.rename(name) return py_s @@ -1603,6 +1611,9 @@ def series_to_pydf( schema_overrides: SchemaDict | None = None, ) -> PyDataFrame: """Construct a PyDataFrame from a Polars Series.""" + if schema is None and schema_overrides is None: + return PyDataFrame([data._s]) + data_series = [data._s] series_name = [s.name() for s in data_series] column_names, schema_overrides = _unpack_schema( @@ -1617,6 +1628,29 @@ def series_to_pydf( return PyDataFrame(data_series) +def frame_to_pydf( + data: DataFrame, + schema: SchemaDefinition | None = None, + schema_overrides: SchemaDict | None = None, +) -> PyDataFrame: + """Construct a PyDataFrame from an existing Polars DataFrame.""" + if schema is None and schema_overrides is None: + return data._df.clone() + + data_series = {c.name: c._s for c in data} + column_names, schema_overrides = _unpack_schema( + schema or data.columns, schema_overrides=schema_overrides + ) + if schema_overrides: + existing_schema = data.schema + for name, new_dtype in schema_overrides.items(): + if new_dtype != existing_schema[name]: + data_series[name] = data_series[name].cast(new_dtype, strict=True) + + series_cols = _handle_columns_arg(list(data_series.values()), columns=column_names) + return PyDataFrame(series_cols) + + def iterable_to_pydf( data: Iterable[Any], schema: SchemaDefinition | None = None, diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index abc8d10f31ed..c44d07e6b121 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -10,7 +10,7 @@ import polars as pl import polars.selectors as cs -from polars.testing import assert_frame_equal +from polars.testing import assert_frame_equal, assert_series_equal if TYPE_CHECKING: from polars.datatypes import PolarsDataType @@ -165,12 +165,16 @@ def test_struct_function_expansion() -> None: {"a": [1, 2, 3, 4], "b": ["one", "two", "three", "four"], "c": [9, 8, 7, 6]} ) struct_schema = {"a": pl.UInt32, "b": pl.String} - s = df.with_columns(pl.struct(pl.col(["a", "b"]), schema=struct_schema))["a"] + dfs = df.with_columns(pl.struct(pl.col(["a", "b"]), schema=struct_schema)) + s = dfs["a"] assert isinstance(s, pl.Series) assert s.struct.fields == ["a", "b"] assert pl.Struct(struct_schema) == s.to_frame().schema["a"] + assert_series_equal(s, pl.Series(dfs.select("a"))) + assert_frame_equal(dfs, pl.DataFrame(dfs)) + def test_nested_struct() -> None: df = pl.DataFrame({"d": [1, 2, 3], "e": ["foo", "bar", "biz"]}) @@ -187,11 +191,11 @@ def test_nested_struct() -> None: def test_struct_to_pandas() -> None: - df = pd.DataFrame([{"a": {"b": {"c": 2}}}]) - pl_df = pl.from_pandas(df) + pdf = pd.DataFrame([{"a": {"b": {"c": 2}}}]) + df = pl.from_pandas(pdf) - assert isinstance(pl_df.dtypes[0], pl.datatypes.Struct) - assert pl_df.to_pandas().equals(df) + assert isinstance(df.dtypes[0], pl.datatypes.Struct) + assert df.to_pandas().equals(pdf) def test_struct_logical_types_to_pandas() -> None: diff --git a/py-polars/tests/unit/test_constructors.py b/py-polars/tests/unit/test_constructors.py index 8256393e240a..01a4de503444 100644 --- a/py-polars/tests/unit/test_constructors.py +++ b/py-polars/tests/unit/test_constructors.py @@ -711,6 +711,28 @@ def test_init_arrow() -> None: pl.DataFrame(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}), schema=["c", "d", "e"]) +def test_init_from_frame() -> None: + df1 = pl.DataFrame({"id": [0, 1], "misc": ["a", "b"], "val": [-10, 10]}) + assert_frame_equal(df1, pl.DataFrame(df1)) + + df2 = pl.DataFrame(df1, schema=["a", "b", "c"]) + assert_frame_equal(df2, pl.DataFrame(df2)) + + df3 = pl.DataFrame(df1, schema=["a", "b", "c"], schema_overrides={"val": pl.Int8}) + assert_frame_equal(df3, pl.DataFrame(df3)) + + assert df1.schema == {"id": pl.Int64, "misc": pl.String, "val": pl.Int64} + assert df2.schema == {"a": pl.Int64, "b": pl.String, "c": pl.Int64} + assert df3.schema == {"a": pl.Int64, "b": pl.String, "c": pl.Int8} + assert df1.rows() == df2.rows() == df3.rows() + + s1 = pl.Series("s", df3) + s2 = pl.Series(df3) + + assert s1.name == "s" + assert s2.name == "" + + def test_init_series() -> None: # List of Series df = pl.DataFrame([pl.Series("a", [1, 2, 3]), pl.Series("b", [4, 5, 6])]) @@ -730,9 +752,9 @@ def test_init_series() -> None: # List of unnamed Series df = pl.DataFrame([pl.Series([1, 2, 3]), pl.Series([4, 5, 6])]) - expected = pl.DataFrame( - [pl.Series("column_0", [1, 2, 3]), pl.Series("column_1", [4, 5, 6])] - ) + col0 = pl.Series("column_0", [1, 2, 3]) + col1 = pl.Series("column_1", [4, 5, 6]) + expected = pl.DataFrame([col0, col1]) assert_frame_equal(df, expected) df = pl.DataFrame([pl.Series([0.0]), pl.Series([1.0])]) @@ -763,8 +785,16 @@ def test_init_series() -> None: s2 = pl.Series([[[2, 2]]], dtype=pl.List(pl.List(pl.UInt8))) assert s2.dtype == pl.List(pl.List(pl.UInt8)) - s3 = pl.Series(dtype=pl.List(pl.List(pl.UInt8))) - assert s3.dtype == pl.List(pl.List(pl.UInt8)) + nested_dtype = pl.List(pl.List(pl.UInt8)) + s3 = pl.Series("x", dtype=nested_dtype) + s4 = pl.Series(s3) + for s in (s3, s4): + assert s.dtype == nested_dtype + assert s.to_list() == [] + assert s.name == "x" + + s5 = pl.Series("", df, dtype=pl.Int8) + assert_series_equal(s5, pl.Series("", [1, 2, 3], dtype=pl.Int8)) def test_init_seq_of_seq() -> None: From 0502e6172a6247e08d61cdd8bfa848604c644f7a Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 2 Jan 2024 15:16:05 +0400 Subject: [PATCH 2/2] Update py-polars/polars/series/series.py Co-authored-by: Stijn de Gooijer --- py-polars/polars/series/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 1431e384cfb7..b93e21bd00bd 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -336,7 +336,7 @@ def __init__( ) elif isinstance(values, pl.DataFrame): - to_struct = len(values.columns) > 1 + to_struct = values.width > 1 name = ( values.columns[0] if (original_name is None and not to_struct) else name )