Skip to content

Commit

Permalink
BUG: reimplement MultiIndex.remove_unused_levels
Browse files Browse the repository at this point in the history
* Add a large random test case for remove_unused_levels that failed the
previous implementation

* Fix pandas-dev#16556, a performance issue with the previous implementation

* Always return at least a view instead of the original index
  • Loading branch information
rhendric committed Jun 1, 2017
1 parent fb47ee5 commit 83bdc59
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 20 deletions.
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/indexing.py
Expand Up @@ -204,6 +204,12 @@ def setup(self):
[np.arange(100), list('A'), list('A')],
names=['one', 'two', 'three'])

rng = np.random.RandomState(4)
size = 1 << 16
self.mi_unused_levels = pd.MultiIndex.from_arrays([
rng.randint(0, 1 << 13, size),
rng.randint(0, 1 << 10, size)])[rng.rand(size) < 0.1]

def time_series_xs_mi_ix(self):
    """Time a single ``.ix[999]`` lookup on ``self.s``."""
    target = self.s
    # The looked-up value is discarded; only the lookup itself is timed.
    target.ix[999]

Expand Down Expand Up @@ -248,6 +254,9 @@ def time_multiindex_small_get_loc_warm(self):
def time_is_monotonic(self):
    """Time reading the ``is_monotonic`` attribute of ``self.miint``."""
    index = self.miint
    # Attribute access only; the result is intentionally thrown away.
    index.is_monotonic

def time_remove_unused_levels(self):
    """Time ``remove_unused_levels()`` on ``self.mi_unused_levels``."""
    index = self.mi_unused_levels
    # The returned index is discarded; only the call is being measured.
    index.remove_unused_levels()


class IntervalIndexing(object):
goal_time = 0.2
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.2.txt
Expand Up @@ -32,6 +32,7 @@ Performance Improvements
- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
- Improved performance of groupby with categorical groupers (:issue:`16413`)
- Improved performance of ``MultiIndex.remove_unused_levels()`` (:issue:`16556`)

.. _whatsnew_0202.bug_fixes:

Expand Down Expand Up @@ -61,6 +62,7 @@ Indexing

- Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`)
- Bug in partial string indexing with a monotonic, but not strictly-monotonic, index incorrectly reversing the slice bounds (:issue:`16515`)
- Bug in ``MultiIndex.remove_unused_levels()`` that would not return a ``MultiIndex`` equal to the original (:issue:`16556`)

I/O
^^^
Expand Down
34 changes: 15 additions & 19 deletions pandas/core/indexes/multi.py
Expand Up @@ -1290,42 +1290,38 @@ def remove_unused_levels(self):
new_levels = []
new_labels = []

changed = np.ones(self.nlevels, dtype=bool)
for i, (lev, lab) in enumerate(zip(self.levels, self.labels)):
changed = False
for lev, lab in zip(self.levels, self.labels):

uniques = algos.unique(lab)

# nothing unused
if len(uniques) == len(lev):
new_levels.append(lev)
new_labels.append(lab)
changed[i] = False
continue

# set difference, then reverse sort
diff = Index(np.arange(len(lev))).difference(uniques)
unused = diff.sort_values(ascending=False)
changed = True

# labels get mapped from uniques to 0:len(uniques)
label_mapping = np.zeros(len(lev))
label_mapping[uniques] = np.arange(len(uniques))
lab = label_mapping[lab]

# new levels are simple
lev = lev.take(uniques)

# new labels: we remove the unused ones
# by decrementing the labels above each unused value
# (there is probably a better way)
for u in unused:

lab = np.where(lab > u, lab - 1, lab)

new_levels.append(lev)
new_labels.append(lab)

# nothing changed
if not changed.any():
return self
result = self._shallow_copy()

return MultiIndex(new_levels, new_labels,
names=self.names, sortorder=self.sortorder,
verify_integrity=False)
if changed:
result._reset_identity()
result._set_levels(new_levels, validate=False)
result._set_labels(new_labels, validate=False)

return result

@property
def nlevels(self):
Expand Down
29 changes: 28 additions & 1 deletion pandas/tests/indexes/test_multi.py
Expand Up @@ -2515,7 +2515,34 @@ def test_reconstruct_remove_unused(self):
# idempotent
result2 = result.remove_unused_levels()
tm.assert_index_equal(result2, expected)
assert result2 is result
assert result2.is_(result)

@pytest.mark.parametrize('first_type,second_type', [
    ('int64', 'int64'),
    ('datetime64[D]', 'str')])
def test_remove_unused_levels_large(self, first_type, second_type):
    """Check ``remove_unused_levels`` on a large, seeded random MultiIndex.

    A two-level index is built by grouping random data, then rows are
    filtered out so that some level values are no longer referenced.
    The result must have fewer values in both levels, still compare equal
    to the filtered index, and match an index rebuilt from scratch.
    """
    # GH16556

    # A fixed seed keeps the test deterministic. It also matters because
    # not every random input leaves unused values in both levels; the
    # seed below is an arbitrary value that does.
    rng = np.random.RandomState(4)
    size = 1 << 16

    # Draw order (first, second, third) must stay fixed: all three arrays
    # come from the same generator.
    first = rng.randint(0, 1 << 13, size).astype(first_type)
    second = rng.randint(0, 1 << 10, size).astype(second_type)
    third = rng.rand(size)

    frame = DataFrame(dict(first=first, second=second, third=third))
    grouped = frame.groupby(['first', 'second']).sum()
    filtered = grouped[grouped.third < 0.1]
    index = filtered.index

    result = index.remove_unused_levels()
    for position in (0, 1):
        assert len(result.levels[position]) < len(index.levels[position])
    assert result.equals(index)

    # Rebuilding the index from its columns yields only the used levels.
    rebuilt = filtered.reset_index().set_index(['first', 'second'])
    expected = rebuilt.index
    tm.assert_index_equal(result, expected)

def test_isin(self):
values = [('foo', 2), ('bar', 3), ('quux', 4)]
Expand Down

0 comments on commit 83bdc59

Please sign in to comment.