Skip to content

Commit

Permalink
Fix bug in read_csv for high cardinality category types (pandas-dev#1…
Browse files Browse the repository at this point in the history
  • Loading branch information
sam-cohan committed Nov 21, 2017
1 parent 509e03c commit 009311a
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 3 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ Bug Fixes
- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
- Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
- Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
- Bug in ``pd.read_csv`` when reading numeric category fields with high cardinality (:issue:`18186`)

Conversion
^^^^^^^^^^
Expand Down
7 changes: 4 additions & 3 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2227,9 +2227,10 @@ def _concatenate_chunks(list chunks):
for name in names:
arrs = [chunk.pop(name) for chunk in chunks]
# Check each arr for consistent types.
dtypes = set(a.dtype for a in arrs)
if len(dtypes) > 1:
common_type = np.find_common_type(dtypes, [])
dtypes = set([a.dtype for a in arrs])
numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
if len(numpy_dtypes) > 1:
common_type = np.find_common_type(numpy_dtypes, [])
if common_type == np.object:
warning_columns.append(str(name))

Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/io/parser/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,17 @@ def test_categorical_dtype(self):
actual = self.read_csv(StringIO(data), dtype='category')
tm.assert_frame_equal(actual, expected)

@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(self):
    """Read 10**6 distinct numeric strings with ``dtype='category'``.

    Regression test for GH 18186. The CSV has a single column ``a``
    holding the string form of every integer in ``range(10**6)``, in
    lexicographically sorted order. The parsed frame is compared with
    a frame built directly from an ordered ``Categorical`` of the same
    values.
    """
    # GH 18186
    data = sorted(str(i) for i in range(10**6))
    expected = pd.DataFrame({'a': Categorical(data, ordered=True)})
    actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
                           dtype='category')
    # Put the parsed categories into sorted order and mark them ordered
    # before comparing with ``expected``.
    # NOTE(review): presumably needed because the parser's category order
    # is not guaranteed to be sorted -- confirm.
    # The result is assigned back instead of passing ``inplace=True``, so
    # the test does not depend on in-place mutation through the ``.cat``
    # accessor.
    actual['a'] = actual.a.cat.reorder_categories(
        sorted(actual.a.cat.categories), ordered=True)
    tm.assert_frame_equal(actual, expected)

def test_categorical_dtype_encoding(self):
# GH 10153
pth = tm.get_data_path('unicode_series.csv')
Expand Down

0 comments on commit 009311a

Please sign in to comment.