Skip to content
This repository
Browse code

BUG: implement 64-bit int overflowing case in merge. close #2690

  • Loading branch information...
commit 78d090af266049bad913ac344f38a796686560a5 1 parent 5163bc2
Wes McKinney authored January 19, 2013
3  RELEASE.rst
Source Rendered
@@ -55,6 +55,8 @@ pandas 0.10.1
55 55
   - Add ``logx`` option to DataFrame/Series.plot (GH2327_, #2565)
56 56
   - Support reading gzipped data from file-like object
57 57
   - ``pivot_table`` aggfunc can be anything used in GroupBy.aggregate (GH2643_)
  58
+  - Implement DataFrame merges in the case where set cardinalities might overflow
  59
+    a 64-bit integer (GH2690_)
58 60
 
59 61
 **Bug fixes**
60 62
 
@@ -99,6 +101,7 @@ pandas 0.10.1
99 101
 .. _GH2625: https://github.com/pydata/pandas/issues/2625
100 102
 .. _GH2643: https://github.com/pydata/pandas/issues/2643
101 103
 .. _GH2637: https://github.com/pydata/pandas/issues/2637
  104
+.. _GH2690: https://github.com/pydata/pandas/issues/2690
102 105
 .. _GH2692: https://github.com/pydata/pandas/issues/2692
103 106
 
104 107
 pandas 0.10.0
14  pandas/tools/merge.py
@@ -425,18 +425,20 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
425 425
         right_labels.append(rlab)
426 426
         group_sizes.append(count)
427 427
 
428  
-    left_group_key = get_group_index(left_labels, group_sizes)
429  
-    right_group_key = get_group_index(right_labels, group_sizes)
430  
-
431 428
     max_groups = 1L
432 429
     for x in group_sizes:
433 430
         max_groups *= long(x)
434 431
 
435 432
     if max_groups > 2 ** 63:  # pragma: no cover
436  
-        raise MergeError('Combinatorial explosion! (boom)')
  433
+        left_group_key, right_group_key, max_groups = \
  434
+            _factorize_keys(lib.fast_zip(left_labels),
  435
+                            lib.fast_zip(right_labels))
  436
+    else:
  437
+        left_group_key = get_group_index(left_labels, group_sizes)
  438
+        right_group_key = get_group_index(right_labels, group_sizes)
437 439
 
438  
-    left_group_key, right_group_key, max_groups = \
439  
-        _factorize_keys(left_group_key, right_group_key, sort=sort)
  440
+        left_group_key, right_group_key, max_groups = \
  441
+            _factorize_keys(left_group_key, right_group_key, sort=sort)
440 442
 
441 443
     join_func = _join_functions[how]
442 444
     return join_func(left_group_key, right_group_key, max_groups)
10  pandas/tools/tests/test_merge.py
@@ -849,6 +849,16 @@ def test_merge_na_keys(self):
849 849
 
850 850
         tm.assert_frame_equal(result, expected)
851 851
 
  852
+    def test_int64_overflow_issues(self):
  853
+        # GH #2690: combined key cardinality overflows int64 (combinatorial explosion)
  854
+        df1 = DataFrame(np.random.randn(1000, 7),
  855
+                        columns=list('ABCDEF') + ['G1'])
  856
+        df2 = DataFrame(np.random.randn(1000, 7),
  857
+                        columns=list('ABCDEF') + ['G2'])
  858
+
  859
+        # it works!
  860
+        result = merge(df1, df2, how='outer')
  861
+        self.assertTrue(len(result) == 2000)
852 862
 
853 863
 def _check_join(left, right, result, join_col, how='left',
854 864
                 lsuffix='_x', rsuffix='_y'):

0 notes on commit 78d090a

Please sign in to comment.
Something went wrong with that request. Please try again.