Fix bug in read_csv for high cardinality category types (pandas-dev#18186)
sam-cohan · Nov 21, 2017 · 009311a · 009311a
1 parent 509e03c
commit 009311a
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 3 deletions.
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
@@ -64,6 +64,7 @@ Bug Fixes
 - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
 - Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
 - Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
+- Bug in ``pd.read_csv`` when reading numeric category fields with high cardinality (:issue:`18186`)
 
 Conversion
 ^^^^^^^^^^

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -2227,9 +2227,10 @@ def _concatenate_chunks(list chunks):
     for name in names:
         arrs = [chunk.pop(name) for chunk in chunks]
         # Check each arr for consistent types.
-        dtypes = set(a.dtype for a in arrs)
-        if len(dtypes) > 1:
-            common_type = np.find_common_type(dtypes, [])
+        dtypes = set([a.dtype for a in arrs])
+        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
+        if len(numpy_dtypes) > 1:
+            common_type = np.find_common_type(numpy_dtypes, [])
             if common_type == np.object:
                 warning_columns.append(str(name))
 

diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
@@ -114,6 +114,17 @@ def test_categorical_dtype(self):
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
 
+    @pytest.mark.slow
+    def test_categorical_dtype_high_cardinality_numeric(self):
+        # GH 18186
+        data = sorted([str(i) for i in range(10**6)])
+        expected = pd.DataFrame({'a': Categorical(data, ordered=True)})
+        actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
+                               dtype='category')
+        actual.a.cat.reorder_categories(sorted(actual.a.cat.categories),
+                                        ordered=True, inplace=True)
+        tm.assert_frame_equal(actual, expected)
+
     def test_categorical_dtype_encoding(self):
         # GH 10153
         pth = tm.get_data_path('unicode_series.csv')