diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 31af56b3715a52..e1676715853a4f 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -193,9 +193,15 @@ def setup(self): np.arange(1000)], names=['one', 'two']) import string - self.mistring = MultiIndex.from_product( - [np.arange(1000), - np.arange(20), list(string.ascii_letters)], + + self.mi_large = MultiIndex.from_product( + [np.arange(1000), np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) + self.mi_med = MultiIndex.from_product( + [np.arange(1000), np.arange(10), list('A')], + names=['one', 'two', 'three']) + self.mi_small = MultiIndex.from_product( + [np.arange(100), list('A'), list('A')], names=['one', 'two', 'three']) def time_series_xs_mi_ix(self): @@ -218,8 +224,14 @@ def time_multiindex_get_indexer(self): (0, 16), (0, 17), (0, 18), (0, 19)], dtype=object)) + def time_multiindex_large_get_loc(self): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_multiindex_med_get_loc(self): + self.mi_med.get_loc((999, 9, 'A')) + def time_multiindex_string_get_loc(self): - self.mistring.get_loc((999, 19, 'Z')) + self.mi_small.get_loc((99, 'A', 'A')) def time_is_monotonic(self): self.miint.is_monotonic diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 9a3035e4334c7a..1b899dfc8abd5e 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -27,7 +27,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance regression fix when indexing with a list-like (:issue:`16285`) - +- Performance regression fix for small MultiIndexes (:issuse:`16319`) .. _whatsnew_0202.bug_fixes: diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c7a537acf5d6fd..21680fb0b39219 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -553,7 +553,34 @@ cdef inline bint _is_utc(object tz): return tz is UTC or isinstance(tz, _du_utc) -cdef class MultiIndexEngine(IndexEngine): +cdef class MultiIndexObjectEngine(ObjectEngine): + """ + provide the same interface as the MultiIndexEngine + but use the IndexEngine for computation + + This provides good performance with samller MI's + """ + def get_indexer(self, values): + # convert a MI to an ndarray + if hasattr(values, 'values'): + values = values.values + return super(MultiIndexObjectEngine, self).get_indexer(values) + + cpdef get_loc(self, object val): + + # convert a MI to an ndarray + if hasattr(val, 'values'): + val = val.values + return super(MultiIndexObjectEngine, self).get_loc(val) + + +cdef class MultiIndexHashEngine(ObjectEngine): + """ + Use a hashing based MultiIndex impl + but use the IndexEngine for computation + + This provides good performance with larger MI's + """ def _call_monotonic(self, object mi): # defer these back to the mi iteself @@ -584,6 +611,10 @@ cdef class MultiIndexEngine(IndexEngine): except TypeError: raise KeyError(val) + def get_indexer(self, values): + self._ensure_mapping_populated() + return self.mapping.lookup(values) + cdef _make_hash_table(self, n): return _hash.MultiIndexHashTable(n) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 561f1951a4151b..dc2c56ea476f9d 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -88,12 +88,12 @@ def is_dtype(cls, dtype): """ if hasattr(dtype, 'dtype'): dtype = dtype.dtype - if isinstance(dtype, cls): - return True - elif isinstance(dtype, np.dtype): + if isinstance(dtype, np.dtype): return False elif dtype is None: return False + elif isinstance(dtype, cls): + return True try: return cls.construct_from_string(dtype) is not None except: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7ef037d8f3536f..3db5633ec30bda 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -75,7 +75,6 @@ class MultiIndex(Index): _levels = FrozenList() _labels = FrozenList() _comparables = ['names'] - _engine_type = libindex.MultiIndexEngine rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, @@ -629,7 +628,16 @@ def _get_level_number(self, level): @cache_readonly def _engine(self): - return self._engine_type(lambda: self, len(self)) + + # choose our engine based on our size + # the hashing based MultiIndex for larger + # sizes, and the MultiIndexOjbect for smaller + # xref: https://github.com/pandas-dev/pandas/pull/16324 + l = len(self) + if l > 10000: + return libindex.MultiIndexHashEngine(lambda: self, l) + + return libindex.MultiIndexObjectEngine(lambda: self.values, l) @property def values(self): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 6a5343e8a8e252..f0829adc94500d 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -5,16 +5,13 @@ import numpy as np from pandas._libs import hashing -from pandas._libs.lib import is_bool_array from pandas.core.dtypes.generic import ( ABCMultiIndex, ABCIndexClass, ABCSeries, ABCDataFrame) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_list_like) + is_categorical_dtype, is_list_like) # 16 byte long hashing key _default_hash_key = '0123456789123456' @@ -136,7 +133,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): ------- ndarray of hashed values array """ - is_tuple = False if isinstance(vals, tuple): vals = [vals] @@ -231,6 +227,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): if not hasattr(vals, 'dtype'): raise TypeError("must pass a ndarray-like") + dtype = vals.dtype if hash_key is None: hash_key = _default_hash_key @@ -238,22 +235,21 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. - if is_categorical_dtype(vals.dtype): + if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early - if np.issubdtype(vals.dtype, np.complex128): + elif np.issubdtype(dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. - if is_bool_array(vals): + elif isinstance(dtype, np.bool): vals = vals.astype('u8') - elif (is_datetime64_dtype(vals) or - is_timedelta64_dtype(vals)): + elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view('i8').astype('u8', copy=False) - elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): + elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes,