Skip to content

Commit

Permalink
BUG: Fix parse_dates processing with usecols and C engine
Browse files Browse the repository at this point in the history
closes #9755
closes #12678

Fixes for `read_csv` bugs. This PR fixes a bug brought up in #9755 in
processing `parse_dates` with the C engine, in which the wrong indices
(those of the filtered column names) were being used to determine which
date columns should not be dtype-parsed by the C engine. The correct
indices are those of the original column names, as they are used later
on in the actual data processing.

Author: gfyoung <gfyoung17@gmail.com>

Closes #12512 from gfyoung/parse_dates_usecols and squashes the following commits:

f0543a4 [gfyoung] BUG: Prevent mixed-typed usecols
83caa3b [gfyoung] BUG: Fix parse_dates processing with usecols and C engine
  • Loading branch information
gfyoung authored and jreback committed Apr 6, 2016
1 parent e04f343 commit c6c201e
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 27 deletions.
8 changes: 6 additions & 2 deletions doc/source/io.rst
Expand Up @@ -120,8 +120,12 @@ index_col : int or sequence or ``False``, default ``None``
each line, you might consider ``index_col=False`` to force pandas to *not* use
the first column as the index (row names).
usecols : array-like, default ``None``
Return a subset of the columns. Results in much faster parsing time and lower
memory usage
Return a subset of the columns. All elements in this array must either
be positional (i.e. integer indices into the document columns) or strings
that correspond to column names provided either by the user in `names` or
inferred from the document header row(s). For example, a valid `usecols`
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
results in much faster parsing time and lower memory usage.
squeeze : boolean, default ``False``
If the parsed data only contains one column then return a Series.
prefix : str, default ``None``
Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.18.1.txt
Expand Up @@ -101,7 +101,7 @@ API changes


- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)

- ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`)
- ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`)
- Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`)

Expand Down Expand Up @@ -211,6 +211,7 @@ Bug Fixes

- Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
- Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
- Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`).
- Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`)

Expand Down
70 changes: 49 additions & 21 deletions pandas/io/parsers.py
Expand Up @@ -75,8 +75,12 @@ class ParserWarning(Warning):
of each line, you might consider index_col=False to force pandas to _not_
use the first column as the index (row names)
usecols : array-like, default None
Return a subset of the columns.
Results in much faster parsing time and lower memory usage.
Return a subset of the columns. All elements in this array must either
be positional (i.e. integer indices into the document columns) or strings
that correspond to column names provided either by the user in `names` or
inferred from the document header row(s). For example, a valid `usecols`
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
results in much faster parsing time and lower memory usage.
squeeze : boolean, default False
If the parsed data only contains one column then return a Series
prefix : str, default None
Expand Down Expand Up @@ -801,6 +805,23 @@ def _is_index_col(col):
return col is not None and col is not False


def _validate_usecols_arg(usecols):
"""
Check whether or not the 'usecols' parameter
contains all integers (column selection by index)
or strings (column by name). Raises a ValueError
if that is not the case.
"""
if usecols is not None:
usecols_dtype = lib.infer_dtype(usecols)
if usecols_dtype not in ('integer', 'string'):
raise ValueError(("The elements of 'usecols' "
"must either be all strings "
"or all integers"))

return usecols


class ParserBase(object):

def __init__(self, kwds):
Expand Down Expand Up @@ -1132,7 +1153,7 @@ def __init__(self, src, **kwds):
self._reader = _parser.TextReader(src, **kwds)

# XXX
self.usecols = self._reader.usecols
self.usecols = _validate_usecols_arg(self._reader.usecols)

passed_names = self.names is None

Expand All @@ -1157,18 +1178,21 @@ def __init__(self, src, **kwds):
else:
self.names = lrange(self._reader.table_width)

# If the names were inferred (not passed by user) and usedcols is
# defined, then ensure names refers to the used columns, not the
# document's columns.
if self.usecols and passed_names:
col_indices = []
for u in self.usecols:
if isinstance(u, string_types):
col_indices.append(self.names.index(u))
else:
col_indices.append(u)
self.names = [n for i, n in enumerate(self.names)
if i in col_indices]
# gh-9755
#
# need to set orig_names here first
# so that proper indexing can be done
# with _set_noconvert_columns
#
# once names has been filtered, we will
# then set orig_names again to names
self.orig_names = self.names[:]

if self.usecols:
if len(self.names) > len(self.usecols):
self.names = [n for i, n in enumerate(self.names)
if (i in self.usecols or n in self.usecols)]

if len(self.names) < len(self.usecols):
raise ValueError("Usecols do not match names.")

Expand All @@ -1194,13 +1218,17 @@ def __init__(self, src, **kwds):
self._implicit_index = self._reader.leading_cols > 0

def _set_noconvert_columns(self):
names = self.names
names = self.orig_names
usecols = self.usecols

def _set(x):
if com.is_integer(x):
self._reader.set_noconvert(x)
else:
self._reader.set_noconvert(names.index(x))
if usecols and com.is_integer(x):
x = list(usecols)[x]

if not com.is_integer(x):
x = names.index(x)

self._reader.set_noconvert(x)

if isinstance(self.parse_dates, list):
for val in self.parse_dates:
Expand Down Expand Up @@ -1472,7 +1500,7 @@ def __init__(self, f, **kwds):
self.lineterminator = kwds['lineterminator']
self.quoting = kwds['quoting']
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
self.usecols = kwds['usecols']
self.usecols = _validate_usecols_arg(kwds['usecols'])
self.skip_blank_lines = kwds['skip_blank_lines']

self.names_passed = kwds['names'] or None
Expand Down
112 changes: 109 additions & 3 deletions pandas/io/tests/test_parsers.py
Expand Up @@ -2682,12 +2682,118 @@ def test_uneven_lines_with_usecols(self):
df = self.read_csv(StringIO(csv), usecols=usecols)
tm.assert_frame_equal(df, expected)

usecols = ['a', 1]
usecols = ['a', 'b']
df = self.read_csv(StringIO(csv), usecols=usecols)
tm.assert_frame_equal(df, expected)

usecols = ['a', 'b']
df = self.read_csv(StringIO(csv), usecols=usecols)
def test_usecols_with_parse_dates(self):
    # See gh-9755
    #
    # The header row supplies the names. Columns 'c' and 'd' are
    # expected to be merged into a single 'c_d' date column, and the
    # result must not depend on the order in which 'usecols' lists
    # the selected positions.
    csv_text = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
    date_spec = [[1, 2]]

    stamps = [Timestamp('2014-01-01 09:00:00'),
              Timestamp('2014-01-02 10:00:00')]
    expected = DataFrame({'a': [0, 0], 'c_d': stamps},
                         columns=['c_d', 'a'])

    for selection in ([0, 2, 3], [3, 0, 2]):
        result = self.read_csv(StringIO(csv_text), usecols=selection,
                               parse_dates=date_spec)
        tm.assert_frame_equal(result, expected)

def test_usecols_with_parse_dates_and_full_names(self):
    # See gh-9755
    #
    # No header row here: 'names' labels every column of the
    # document, while 'usecols' keeps only three of them. The same
    # frame is expected for both orderings of 'usecols'.
    csv_text = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
    date_spec = [[1, 2]]
    all_names = list('abcde')

    stamps = [Timestamp('2014-01-01 09:00:00'),
              Timestamp('2014-01-02 10:00:00')]
    expected = DataFrame({'a': [0, 0], 'c_d': stamps},
                         columns=['c_d', 'a'])

    for selection in ([0, 2, 3], [3, 0, 2]):
        result = self.read_csv(StringIO(csv_text), names=all_names,
                               usecols=selection,
                               parse_dates=date_spec)
        tm.assert_frame_equal(result, expected)

def test_usecols_with_parse_dates_and_usecol_names(self):
    # See gh-9755
    #
    # No header row here, and 'names' labels only the three columns
    # kept by 'usecols' rather than all five document columns. The
    # same frame is expected for both orderings of 'usecols'.
    csv_text = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
    date_spec = [[1, 2]]
    kept_names = list('acd')

    stamps = [Timestamp('2014-01-01 09:00:00'),
              Timestamp('2014-01-02 10:00:00')]
    expected = DataFrame({'a': [0, 0], 'c_d': stamps},
                         columns=['c_d', 'a'])

    for selection in ([0, 2, 3], [3, 0, 2]):
        result = self.read_csv(StringIO(csv_text), names=kept_names,
                               usecols=selection,
                               parse_dates=date_spec)
        tm.assert_frame_equal(result, expected)

def test_mixed_dtype_usecols(self):
    # See gh-12678
    #
    # Mixing positions and names in 'usecols' must be rejected
    # with a ValueError carrying the message below.
    data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
    msg = ("The elements of 'usecols' "
           "must either be all strings "
           "or all integers")
    usecols = [0, 'b', 2]

    # The call is expected to raise, so its result is deliberately
    # not bound to a name.
    with tm.assertRaisesRegexp(ValueError, msg):
        self.read_csv(StringIO(data), usecols=usecols)

def test_usecols_with_integer_like_header(self):
    # The header labels look like integers, so this checks that
    # integers in 'usecols' are taken as positions while strings
    # are taken as column names.
    csv_text = """2,0,1
1000,2000,3000
4000,5000,6000
"""

    cases = [
        # column selection by index
        ([0, 1], [[1000, 2000], [4000, 5000]], ['2', '0']),
        # column selection by name
        (['0', '1'], [[2000, 3000], [5000, 6000]], ['0', '1']),
    ]

    for usecols, rows, labels in cases:
        expected = DataFrame(data=rows, columns=labels)
        result = self.read_csv(StringIO(csv_text), usecols=usecols)
        tm.assert_frame_equal(result, expected)


Expand Down

0 comments on commit c6c201e

Please sign in to comment.