From 53593d095e0816614b0503f6899410cfb471655d Mon Sep 17 00:00:00 2001 From: Krzysztof Chomski Date: Wed, 1 Nov 2017 22:39:40 +0100 Subject: [PATCH 1/2] BUG: GH17778 - DataFrame.to_pickle() fails for .zip format. GH17778: add 'zip' format to unittests. Added entry in doc/source/whatsnew/v0.22.0.txt file to Bug Fixes section. --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/io/common.py | 23 +++++++++++++---------- pandas/io/pickle.py | 12 +++++++++++- pandas/tests/io/test_pickle.py | 7 ++++--- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 8afdd1b2e22b3..4211d9913a497 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -90,6 +90,7 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +- Fixed bug where ``DataFrame.to_pickle()`` failed for the .zip format (:issue:`17778`) Conversion ^^^^^^^^^^ diff --git a/pandas/io/common.py b/pandas/io/common.py index 534c1e0671150..f799cab161cd9 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -357,17 +357,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, # ZIP Compression elif compression == 'zip': import zipfile - zip_file = zipfile.ZipFile(path_or_buf) - zip_names = zip_file.namelist() - if len(zip_names) == 1: - f = zip_file.open(zip_names.pop()) - elif len(zip_names) == 0: - raise ValueError('Zero files found in ZIP file {}' - .format(path_or_buf)) + if mode == 'wb': + f = zipfile.ZipFile(path_or_buf, 'w') else: - raise ValueError('Multiple files found in ZIP file.' - ' Only one file per ZIP: {}' - .format(zip_names)) + zip_file = zipfile.ZipFile(path_or_buf) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + elif len(zip_names) == 0: + raise ValueError('Zero files found in ZIP file {}' + .format(path_or_buf)) + else: + raise ValueError('Multiple files found in ZIP file.' 
+ ' Only one file per ZIP: {}' + .format(zip_names)) # XZ Compression elif compression == 'xz': diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 143b76575e36b..aab9ffa1cce45 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -42,7 +42,17 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): if protocol < 0: protocol = pkl.HIGHEST_PROTOCOL try: - pkl.dump(obj, f, protocol=protocol) + import zipfile + if isinstance(f, zipfile.ZipFile): + import os + import tempfile + tmp_file = tempfile.NamedTemporaryFile(delete=False) + pkl.dump(obj, tmp_file, protocol=protocol) + tmp_file.close() + f.write(tmp_file.name) + os.remove(tmp_file.name) + else: + pkl.dump(obj, f, protocol=protocol) finally: for _f in fh: _f.close() diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 91c1f19f5caab..91b59b2ff3ffb 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -382,7 +382,7 @@ def decompress_file(self, src_path, dest_path, compression): fh.write(f.read()) f.close() - @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) + @pytest.mark.parametrize('compression', [None, 'gzip', 'zip', 'bz2', 'xz']) def test_write_explicit(self, compression, get_random_path): # issue 11666 if compression == 'xz': @@ -414,7 +414,8 @@ def test_write_explicit_bad(self, compression, get_random_path): df = tm.makeDataFrame() df.to_pickle(path, compression=compression) - @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) + @pytest.mark.parametrize('ext', ['', '.gz', '.zip', '.bz2', '.xz', + '.no_compress']) def test_write_infer(self, ext, get_random_path): if ext == '.xz': tm._skip_if_no_lzma() @@ -442,7 +443,7 @@ def test_write_infer(self, ext, get_random_path): tm.assert_frame_equal(df, df2) - @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"]) + @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', 'zip']) def 
test_read_explicit(self, compression, get_random_path): # issue 11666 if compression == 'xz': From 890337868f7dc357a7c69a1f9c3b35dcc1962043 Mon Sep 17 00:00:00 2001 From: Krzysztof Chomski Date: Tue, 14 Nov 2017 18:02:45 +0100 Subject: [PATCH 2/2] Added 'zip' to possible compression types in `to_pickle` docstring. Moved imports to top. --- pandas/io/pickle.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index aab9ffa1cce45..c165fe34f4734 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,5 +1,9 @@ """ pickle compat """ +import os +import tempfile +import zipfile + import numpy as np from numpy.lib.format import read_array, write_array from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3 @@ -16,7 +20,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): obj : any object path : string File path - compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' a string representing the compression to use in the output file .. versionadded:: 0.20.0 @@ -42,10 +46,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): if protocol < 0: protocol = pkl.HIGHEST_PROTOCOL try: - import zipfile if isinstance(f, zipfile.ZipFile): - import os - import tempfile tmp_file = tempfile.NamedTemporaryFile(delete=False) pkl.dump(obj, tmp_file, protocol=protocol) tmp_file.close()