ENH: Add strings_as_bytes option for df.to_records() (pandas-dev#18146)
This option records the dtype for string arrays as 'Sx', where x
is the length of the longest string, instead of 'O'.
qinghao1 authored and Chu Qinghao committed Aug 7, 2018
1 parent 2156431 commit dcb9d0e
Showing 3 changed files with 48 additions and 2 deletions.
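In short (a minimal sketch, assuming a pandas build with this patch applied; the data mirrors the docstring example added below):

>>> import pandas as pd
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']}, index=['a', 'b'])
>>> df.to_records()
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
>>> df.to_records(strings_as_bytes=True)
rec.array([(b'a', 1, b'abc'), (b'b', 2, b'defg')],
dtype=[('index', 'S1'), ('A', '<i8'), ('B', 'S4')])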
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -181,6 +181,7 @@ Other Enhancements
- The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
- :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`)
- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
- :func:`DataFrame.to_records` now accepts a ``strings_as_bytes`` parameter to efficiently store strings as bytes dtype (``S``) instead of object dtype (``O``) (:issue:`18146`)

.. _whatsnew_0240.api_breaking:

37 changes: 35 additions & 2 deletions pandas/core/frame.py
@@ -1335,7 +1335,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,

return cls(mgr)

def to_records(self, index=True, convert_datetime64=None):
def to_records(self, index=True, convert_datetime64=None,
strings_as_bytes=False):
"""
Convert DataFrame to a NumPy record array.
@@ -1351,6 +1352,9 @@ def to_records(self, index=True, convert_datetime64=None):
Whether to convert the index to datetime.datetime if it is a
DatetimeIndex.
strings_as_bytes : boolean, default False
Store strings as bytes (``S`` dtype) instead of Python objects
(``O`` dtype).
Returns
-------
@@ -1401,6 +1405,24 @@ def to_records(self, index=True, convert_datetime64=None):
rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
('2018-01-01T09:01:00.000000000', 2, 0.75)],
dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
By default, strings are recorded as dtype `O` for object:
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
... index=['a', 'b'])
>>> df.to_records()
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
This can be inefficient (e.g. for short strings, or when storing with
`np.save()`). Strings can be recorded as dtype `S` for zero-terminated
bytes instead by passing ``strings_as_bytes=True``:
>>> df.to_records(strings_as_bytes=True)
rec.array([(b'a', 1, b'abc'), (b'b', 2, b'defg')],
dtype=[('index', 'S1'), ('A', '<i8'), ('B', 'S4')])
"""

if convert_datetime64 is not None:
@@ -1436,7 +1458,18 @@ def to_records(self, index=True, convert_datetime64=None):
arrays = [self[c].get_values() for c in self.columns]
names = lmap(compat.text_type, self.columns)

formats = [v.dtype for v in arrays]
if strings_as_bytes:
    is_string = np.vectorize(lambda s: type(s) is str)
    # GH18146
    # for string arrays, set the dtype to zero-terminated bytes with a
    # length equal to that of the longest string
    formats = ['S{}'.format(max(map(len, v)))
               if v.dtype == '|O' and is_string(v).all()
               else v.dtype
               for v in arrays]
else:
    formats = [v.dtype for v in arrays]
return np.rec.fromarrays(
arrays,
dtype={'names': names, 'formats': formats}
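For reference, the dtype selection in the new branch can be sketched standalone in plain NumPy (``bytes_format`` is a hypothetical helper name; the arrays mirror the docstring example):

import numpy as np

def bytes_format(v):
    # object arrays holding only str values get a fixed-width bytes
    # dtype sized to the longest string; all other arrays keep their dtype
    if v.dtype == object and all(isinstance(s, str) for s in v):
        return 'S{}'.format(max(map(len, v)))
    return v.dtype

arrays = [np.array(['a', 'b'], dtype=object),       # index
          np.array([1, 2]),                         # column A
          np.array(['abc', 'defg'], dtype=object)]  # column B
print([bytes_format(v) for v in arrays])
# ['S1', dtype('int64'), 'S4']  (int dtype is platform-dependent)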
12 changes: 12 additions & 0 deletions pandas/tests/frame/test_convert_to.py
@@ -186,6 +186,18 @@ def test_to_records_with_categorical(self):
dtype=[('index', '=i8'), ('0', 'O')])
tm.assert_almost_equal(result, expected)

def test_to_records_with_strings_as_bytes(self):

# GH18146

df = DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
index=['a', 'b'])
result = df.to_records(strings_as_bytes=True)
expected = np.rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
dtype=[('index', 'S1'), ('A', '<i8'),
('B', 'S4')])
tm.assert_almost_equal(result, expected)

@pytest.mark.parametrize('mapping', [
dict,
collections.defaultdict(list),
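Two details worth noting about the expected value in the test above (a short sketch; the file path is illustrative): NumPy coerces ``str`` literals to ASCII bytes under an ``S`` dtype, which is why the expected record array can be written with plain strings, and fixed-width bytes fields serialize with ``np.save`` without pickle, whereas object-dtype fields are pickled and, on recent NumPy, need ``allow_pickle=True`` to load:

import numpy as np

expected = np.rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
                        dtype=[('index', 'S1'), ('A', '<i8'), ('B', 'S4')])
print(expected['B'])                # [b'abc' b'defg'] -- str was encoded

np.save('rec.npy', expected)        # no pickle needed for 'S' fields
print(np.load('rec.npy')['index'])  # [b'a' b'b']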
