Skip to content

Commit

Permalink
ENH: Allow fixed-length strings in df.to_records()
Browse files Browse the repository at this point in the history
Adds parameter to allow string-like columns to be
cast as fixed-length string-like dtypes for more
efficient storage.

Closes gh-18146 (pandas-dev).

Originally authored by @qinghao1 but cleaned up
by @gfyoung to fix merge conflicts.
  • Loading branch information
gfyoung committed Dec 19, 2018
1 parent 6111f64 commit 0fd56e2
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 3 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ Other Enhancements
- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`)
- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained an ``axis`` parameter (:issue:`8839`)
- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)
- :meth:`DataFrame.to_records` now accepts a ``stringlike_as_fixed_length`` parameter to efficiently store string-likes as fixed-length string-like dtypes (e.g. ``S1``) instead of object dtype (``O``) (:issue:`18146`)

.. _whatsnew_0240.api_breaking:

Expand Down
73 changes: 70 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
OrderedDict, PY36, raise_with_traceback,
string_and_binary_types)
from pandas.compat.numpy import function as nv

from pandas.api.types import infer_dtype
from pandas.core.dtypes.cast import (
maybe_upcast,
cast_scalar_to_array,
Expand Down Expand Up @@ -1476,7 +1476,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,

return cls(mgr)

def to_records(self, index=True, convert_datetime64=None):
def to_records(self, index=True, convert_datetime64=None,
stringlike_as_fixed_length=False):
"""
Convert DataFrame to a NumPy record array.
Expand All @@ -1493,6 +1494,11 @@ def to_records(self, index=True, convert_datetime64=None):
Whether to convert the index to datetime.datetime if it is a
DatetimeIndex.
stringlike_as_fixed_length : bool, default False
.. versionadded:: 0.24.0
Store string-likes as fixed-length string-like dtypes
(e.g. ``S1`` dtype) instead of Python objects (``O`` dtype).
Returns
-------
Expand Down Expand Up @@ -1534,6 +1540,27 @@ def to_records(self, index=True, convert_datetime64=None):
>>> df.to_records(index=False)
rec.array([(1, 0.5 ), (2, 0.75)],
dtype=[('A', '<i8'), ('B', '<f8')])
By default, strings are recorded as dtype 'O' for object:
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
... index=['a', 'b'])
>>> df.to_records()
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
This can be inefficient (e.g. for short strings, or when storing with
`np.save()`). They can be recorded as fixed-length string-like dtypes
such as 'S1' for zero-terminated bytes instead:
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
... index=['a', 'b'])
>>> df.to_records(stringlike_as_fixed_length=True)
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
dtype=[('index', '<U1'), ('A', '<i8'), ('B', '<U4')])
Notice how the 'B' column is now stored as '<U4' for length-four
strings ('S4' for Python 2.x) instead of the 'O' object dtype.
"""

if convert_datetime64 is not None:
Expand Down Expand Up @@ -1569,7 +1596,47 @@ def to_records(self, index=True, convert_datetime64=None):
arrays = [self[c].get_values() for c in self.columns]
names = lmap(compat.text_type, self.columns)

formats = [v.dtype for v in arrays]
formats = []

for v in arrays:
if not stringlike_as_fixed_length:
formats.append(v.dtype)
else:
# gh-18146
#
# For string-like arrays, set dtype as zero-terminated bytes
# with max length equal to that of the longest string-like.
dtype = infer_dtype(v)
symbol = None

if dtype == "string":
# In Python 3.x, infer_dtype does not
# differentiate string from unicode
# like NumPy arrays do, so we
# specify unicode to be safe.
symbol = "S" if compat.PY2 else "U"
elif dtype == "unicode":
# In Python 3.x, infer_dtype does not
# differentiate string from unicode.
#
# Thus, we can only get this result
# in Python 2.x.
symbol = "U"
elif dtype == "bytes":
# In Python 2.x, infer_dtype does not
# differentiate string from bytes.
#
# Thus, we can only get this result
# in Python 3.x. However, NumPy does
# not have a fixed-length bytes dtype
# and just uses string instead.
symbol = "S"

if symbol is not None:
formats.append("{}{}".format(symbol, max(map(len, v))))
else:
formats.append(v.dtype)

return np.rec.fromarrays(
arrays,
dtype={'names': names, 'formats': formats}
Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/frame/test_convert_to.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,44 @@ def test_to_records_with_categorical(self):
dtype=[('index', '=i8'), ('0', 'O')])
tm.assert_almost_equal(result, expected)

# Parametrized check of ``DataFrame.to_records(stringlike_as_fixed_length=...)``.
# Each case supplies a two-element ``values`` column plus a ``dtype_getter``
# callable ``(fixed, isPY2) -> str`` returning the dtype expected for that
# column in the resulting record array; every case runs with the flag both
# on and off.
@pytest.mark.parametrize("fixed_length", [True, False])
@pytest.mark.parametrize("values,dtype_getter", [
# Integer --> just take the dtype.
([1, 2], lambda fixed, isPY2: "<i8"),
# Mixed --> cast to object.
([1, "1"], lambda fixed, isPY2: "O"),
# String --> cast to string if PY2 else unicode in PY3.
(["1", "2"], lambda fixed, isPY2: (
("S" if isPY2 else "U") + "1") if fixed else "O"),
# String + max-length of longest string.
(["12", "2"], lambda fixed, isPY2: (
("S" if isPY2 else "U") + "2") if fixed else "O"),
# Unicode --> cast to unicode for both PY2 and PY3.
([u"\u2120b", u"456"], lambda fixed, isPY2: "U3" if fixed else "O"),
# Bytes --> cast to string for both PY2 and PY3.
([b"2", b"5"], lambda fixed, isPY2: "S1" if fixed else "O"),
], ids=["int", "mixed", "str", "max-len", "unicode", "bytes"])
def test_to_records_with_strings_as_fixed_length(self, fixed_length,
values, dtype_getter):

# see gh-18146
df = DataFrame({"values": values}, index=["a", "b"])
result = df.to_records(stringlike_as_fixed_length=fixed_length)

# The index labels are single characters, so with the flag set the index
# field is expected as "S1" (PY2) or "U1" (PY3); with the flag unset it is
# expected to stay object dtype.
ind_dtype = ((("S" if compat.PY2 else "U") + "1")
if fixed_length else "O")
# Expected dtype of the "values" field for this parametrized case.
val_dtype = dtype_getter(fixed_length, compat.PY2)

# Build the expected record array from the same index labels and values,
# using the dtypes computed above.
expected = np.rec.array([("a", values[0]), ("b", values[1])],
dtype=[("index", ind_dtype),
("values", val_dtype)])
tm.assert_almost_equal(result, expected)

@pytest.mark.parametrize('mapping', [
dict,
collections.defaultdict(list),
Expand Down

0 comments on commit 0fd56e2

Please sign in to comment.