deprecate to_<file> and string cache in lazy (#2916)
ritchie46 committed Mar 16, 2022
1 parent 98a3d85 commit b90bcfa
Showing 10 changed files with 171 additions and 49 deletions.
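
The rename is mechanical: every `DataFrame.to_<format>` serializer becomes `DataFrame.write_<format>`, and each old name is kept as a thin wrapper that warns and delegates. A minimal sketch of the migration in Python (the output file names are illustrative only):

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# Old spelling: still works, but now emits a deprecation warning.
# df.to_csv("out.csv")

# New spellings introduced by this commit:
df.write_csv("out.csv")
df.write_parquet("out.parquet")
df.write_ipc("out.arrow")
df.write_avro("out.avro")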
8 changes: 6 additions & 2 deletions polars/polars-lazy/src/frame/mod.rs
@@ -100,7 +100,7 @@ pub struct OptState {
pub predicate_pushdown: bool,
pub type_coercion: bool,
pub simplify_expr: bool,
- /// Make sure that all needed columns are scannedn
+ /// Make sure that all needed columns are scanned
pub agg_scan_projection: bool,
pub aggregate_pushdown: bool,
pub global_string_cache: bool,
@@ -114,7 +114,7 @@ impl Default for OptState {
predicate_pushdown: true,
type_coercion: true,
simplify_expr: true,
- global_string_cache: true,
+ global_string_cache: false,
slice_pushdown: true,
// will be toggled by a scan operation such as csv scan or parquet scan
agg_scan_projection: false,
@@ -597,6 +597,10 @@ impl LazyFrame {
pub fn collect(self) -> Result<DataFrame> {
#[cfg(feature = "dtype-categorical")]
let use_string_cache = self.opt_state.global_string_cache;
+ #[cfg(feature = "dtype-categorical")]
+ if use_string_cache {
+ eprint!("global string cache in combination with LazyFrames is deprecated; please set the global string cache globally.")
+ }
let mut expr_arena = Arena::with_capacity(256);
let mut lp_arena = Arena::with_capacity(128);
let lp_top = self.optimize(&mut lp_arena, &mut expr_arena)?;
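
On the Rust side the per-query `global_string_cache` option now defaults to `false` and prints a deprecation notice when enabled; the replacement is to turn the cache on process-wide before running queries that join or compare categoricals. A sketch of the Python-side equivalent, assuming the `pl.toggle_string_cache` and `pl.StringCache` helpers available in py-polars at the time:

import polars as pl

# Enable the cache once, globally, instead of per collect() call...
pl.toggle_string_cache(True)

# ...or scope it with the context manager:
with pl.StringCache():
    left = pl.DataFrame({"key": ["a", "b"]}).with_column(pl.col("key").cast(pl.Categorical))
    right = pl.DataFrame({"key": ["b", "c"]}).with_column(pl.col("key").cast(pl.Categorical))
    joined = left.join(right, on="key")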
10 changes: 5 additions & 5 deletions py-polars/docs/source/reference/io.rst
@@ -12,7 +12,7 @@ CSV

read_csv
scan_csv
- DataFrame.to_csv
+ DataFrame.write_csv

Feather/ IPC
~~~~~~~~~~~~
@@ -22,7 +22,7 @@ Feather/ IPC
read_ipc
scan_ipc
read_ipc_schema
- DataFrame.to_ipc
+ DataFrame.write_ipc

Parquet
~~~~~~~
@@ -31,7 +31,7 @@ Parquet

read_parquet
scan_parquet
- DataFrame.to_parquet
+ DataFrame.write_parquet

SQL
~~~
@@ -46,12 +46,12 @@ JSON
:toctree: api/

read_json
- DataFrame.to_json
+ DataFrame.write_json

AVRO
~~~~
.. autosummary::
:toctree: api/

read_avro
- DataFrame.to_avro
+ DataFrame.write_avro
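
Only the writers are renamed; the `read_*` and `scan_*` entry points are unchanged, so a round trip now pairs a `write_*` call with the matching `read_*`. A small sketch:

import io
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})

buf = io.BytesIO()
df.write_parquet(buf)  # formerly df.to_parquet(buf)
buf.seek(0)

assert pl.read_parquet(buf).frame_equal(df)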
127 changes: 120 additions & 7 deletions py-polars/polars/internals/frame.py
@@ -902,6 +902,60 @@ def to_json(
json_lines: bool = False,
*,
to_string: bool = False,
+ ) -> Optional[str]:
+ """
+ .. deprecated:: 0.13.12
+ Please use `write_json`
+ """
+ warnings.warn("'to_json' is deprecated. please use 'write_json'")
+ return self.write_json(
+ file, pretty, row_oriented, json_lines, to_string=to_string
+ )
+
+ @overload
+ def write_json(
+ self,
+ file: Optional[Union[IOBase, str, Path]] = ...,
+ pretty: bool = ...,
+ row_oriented: bool = ...,
+ json_lines: bool = ...,
+ *,
+ to_string: Literal[True],
+ ) -> str:
+ ...
+
+ @overload
+ def write_json(
+ self,
+ file: Optional[Union[IOBase, str, Path]] = ...,
+ pretty: bool = ...,
+ row_oriented: bool = ...,
+ json_lines: bool = ...,
+ *,
+ to_string: Literal[False] = ...,
+ ) -> None:
+ ...
+
+ @overload
+ def write_json(
+ self,
+ file: Optional[Union[IOBase, str, Path]] = ...,
+ pretty: bool = ...,
+ row_oriented: bool = ...,
+ json_lines: bool = ...,
+ *,
+ to_string: bool = ...,
+ ) -> Optional[str]:
+ ...
+
+ def write_json(
+ self,
+ file: Optional[Union[IOBase, str, Path]] = None,
+ pretty: bool = False,
+ row_oriented: bool = False,
+ json_lines: bool = False,
+ *,
+ to_string: bool = False,
) -> Optional[str]:
"""
Serialize to JSON representation.
@@ -970,7 +1024,7 @@ def to_pandas(
tbl = pa.Table.from_batches(record_batches)
return tbl.to_pandas(*args, date_as_object=date_as_object, **kwargs)

- def to_csv(
+ def write_csv(
self,
file: Optional[Union[TextIO, BytesIO, str, Path]] = None,
has_header: bool = True,
@@ -984,7 +1038,7 @@ def to_csv(
file
File path to which the file should be written.
has_header
- Whether or not to include header in the CSV output.
+ Whether to include header in the CSV output.
sep
Separate CSV fields with this symbol.
@@ -998,7 +1052,7 @@ def to_csv(
... "ham": ["a", "b", "c", "d", "e"],
... }
... )
- >>> df.to_csv("new_file.csv", sep=",")
+ >>> df.write_csv("new_file.csv", sep=",")
"""
if file is None:
@@ -1012,7 +1066,20 @@ def to_csv(
self._df.to_csv(file, has_header, ord(sep))
return None

- def to_avro(
+ def to_csv(
+ self,
+ file: Optional[Union[TextIO, BytesIO, str, Path]] = None,
+ has_header: bool = True,
+ sep: str = ",",
+ ) -> Optional[str]:
+ """
+ .. deprecated:: 0.13.12
+ Please use `write_csv`
+ """
+ warnings.warn("'to_csv' is deprecated. please use 'write_csv'")
+ return self.write_csv(file, has_header, sep)
+
+ def write_avro(
self,
file: Union[BinaryIO, BytesIO, str, Path],
compression: Literal["uncompressed", "snappy", "deflate"] = "uncompressed",
@@ -1035,7 +1102,19 @@ def to_avro(

self._df.to_avro(file, compression)

- def to_ipc(
+ def to_avro(
+ self,
+ file: Union[BinaryIO, BytesIO, str, Path],
+ compression: Literal["uncompressed", "snappy", "deflate"] = "uncompressed",
+ ) -> None:
+ """
+ .. deprecated:: 0.13.12
+ Please use `write_avro`
+ """
+ warnings.warn("'to_avro' is deprecated. please use 'write_avro'")
+ return self.write_avro(file, compression)
+
+ def write_ipc(
self,
file: Union[BinaryIO, BytesIO, str, Path],
compression: Optional[Literal["uncompressed", "lz4", "zstd"]] = "uncompressed",
@@ -1060,6 +1139,18 @@ def to_ipc(

self._df.to_ipc(file, compression)

+ def to_ipc(
+ self,
+ file: Union[BinaryIO, BytesIO, str, Path],
+ compression: Optional[Literal["uncompressed", "lz4", "zstd"]] = "uncompressed",
+ ) -> None:
+ """
+ .. deprecated:: 0.13.12
+ Please use `write_ipc`
+ """
+ warnings.warn("'to_ipc' is deprecated. please use 'write_ipc'")
+ return self.write_ipc(file, compression)
+
def to_dicts(self) -> List[Dict[str, Any]]:
pydf = self._df
names = self.columns
@@ -1177,7 +1268,7 @@ def transpose(
df.columns = names
return df

- def to_parquet(
+ def write_parquet(
self,
file: Union[str, Path, BytesIO],
compression: Optional[
@@ -1224,7 +1315,7 @@ def to_parquet(
if use_pyarrow:
if not _PYARROW_AVAILABLE:
raise ImportError( # pragma: no cover
"'pyarrow' is required when using 'to_parquet(..., use_pyarrow=True)'."
"'pyarrow' is required when using 'write_parquet(..., use_pyarrow=True)'."
)

tbl = self.to_arrow()
@@ -1251,6 +1342,28 @@ def to_parquet(
else:
self._df.to_parquet(file, compression, statistics)

+ def to_parquet(
+ self,
+ file: Union[str, Path, BytesIO],
+ compression: Optional[
+ Union[
+ Literal[
+ "uncompressed", "snappy", "gzip", "lzo", "brotli", "lz4", "zstd"
+ ],
+ str,
+ ]
+ ] = "snappy",
+ statistics: bool = False,
+ use_pyarrow: bool = False,
+ **kwargs: Any,
+ ) -> None:
+ """
+ .. deprecated:: 0.13.12
+ Please use `write_parquet`
+ """
+ warnings.warn("'to_parquet' is deprecated. please use 'write_parquet'")
+ return self.write_parquet(file, compression, statistics, use_pyarrow, **kwargs)
+
def to_numpy(self) -> np.ndarray:
"""
Convert DataFrame to a 2d numpy array.
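
Each deprecated method above is a plain wrapper: it calls `warnings.warn` (an ordinary `UserWarning` in this commit, not a `DeprecationWarning`) and then delegates to the new name, so behaviour and return values are unchanged. A sketch of how the warning surfaces:

import io
import warnings
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df.to_csv(io.BytesIO())  # delegates to write_csv

assert any("'to_csv' is deprecated" in str(w.message) for w in caught)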
6 changes: 5 additions & 1 deletion py-polars/polars/internals/lazy_frame.py
@@ -441,6 +441,8 @@ def collect(
simplify_expression
Run simplify expressions optimization.
string_cache
+ This argument is deprecated. Please set the string cache globally.
Use a global string cache in this query.
This is needed if you want to join on categorical columns.
@@ -478,7 +480,7 @@ def fetch(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
- string_cache: bool = True,
+ string_cache: bool = False,
no_optimization: bool = False,
slice_pushdown: bool = True,
) -> DF:
@@ -503,6 +505,8 @@ def fetch(
simplify_expression
Run simplify expressions optimization.
string_cache
+ This argument is deprecated. Please set the string cache globally.
Use a global string cache in this query.
This is needed if you want to join on categorical columns.
no_optimization
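
With `string_cache` now defaulting to `False` in `fetch` and deprecated in `collect`, lazy queries should no longer toggle the cache per call. A sketch of the before/after, assuming `pl.toggle_string_cache` as above:

import polars as pl

lf = (
    pl.DataFrame({"key": ["a", "b", "a"]})
    .lazy()
    .with_column(pl.col("key").cast(pl.Categorical))
)

# Deprecated: lf.collect(string_cache=True)
# Preferred: enable the cache globally once, then collect without the flag.
pl.toggle_string_cache(True)
out = lf.collect()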
8 changes: 4 additions & 4 deletions py-polars/tests/io/test_avro.py
@@ -21,7 +21,7 @@ def compressions() -> List[str]:
def test_from_to_buffer(example_df: pl.DataFrame, compressions: List[str]) -> None:
for compression in compressions:
buf = io.BytesIO()
- example_df.to_avro(buf, compression=compression) # type: ignore
+ example_df.write_avro(buf, compression=compression) # type: ignore
buf.seek(0)
read_df = pl.read_avro(buf)
assert example_df.frame_equal(read_df)
@@ -33,7 +33,7 @@ def test_from_to_file(
f = os.path.join(io_test_dir, "small.avro")

for compression in compressions:
- example_df.to_avro(f, compression=compression) # type: ignore
+ example_df.write_avro(f, compression=compression) # type: ignore
df_read = pl.read_avro(str(f))
assert example_df.frame_equal(df_read)

@@ -43,7 +43,7 @@ def test_select_columns() -> None:
expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]})

f = io.BytesIO()
- df.to_avro(f)
+ df.write_avro(f)
f.seek(0)

read_df = pl.read_avro(f, columns=["b", "c"])
@@ -55,7 +55,7 @@ def test_select_projection() -> None:
expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]})

f = io.BytesIO()
- df.to_avro(f)
+ df.write_avro(f)
f.seek(0)

read_df = pl.read_avro(f, columns=[1, 2])
8 changes: 4 additions & 4 deletions py-polars/tests/io/test_csv.py
@@ -15,7 +15,7 @@

def test_to_from_buffer(df: pl.DataFrame) -> None:
buf = io.BytesIO()
- df.to_csv(buf)
+ df.write_csv(buf)
buf.seek(0)

read_df = pl.read_csv(buf, parse_dates=True)
@@ -30,7 +30,7 @@ def test_to_from_file(io_test_dir: str, df: pl.DataFrame) -> None:
df = df.drop("strings_nulls")

f = os.path.join(io_test_dir, "small.csv")
- df.to_csv(f)
+ df.write_csv(f)

read_df = pl.read_csv(f, parse_dates=True)

@@ -353,7 +353,7 @@ def test_csv_schema_offset(foods_csv: str) -> None:
def test_empty_string_missing_round_trip() -> None:
df = pl.DataFrame({"varA": ["A", "", None], "varB": ["B", "", None]})
f = io.BytesIO()
- df.to_csv(f)
+ df.write_csv(f)
f.seek(0)
df_read = pl.read_csv(f)
assert df.frame_equal(df_read)
@@ -362,7 +362,7 @@ def test_write_csv_delimiter() -> None:
def test_write_csv_delimiter() -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
f = io.BytesIO()
- df.to_csv(f, sep="\t")
+ df.write_csv(f, sep="\t")
f.seek(0)
assert f.read() == b"a\tb\n1\t1\n2\t2\n3\t3\n"

