In [1]:
import numpy as np
import pandas as pd

# Creating a MultiIndex (hierarchical index) object

In [2]:
# Two parallel label lists: position i of the first list pairs with
# position i of the second list to form one two-level key.
arrays = [
    [label for label in ("bar", "baz", "foo", "qux") for _ in range(2)],
    ["one", "two"] * 4,
]
arrays

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [3]:
# Turn the two label lists into one (first, second) tuple per position.
tuples = [*zip(*arrays)]
print(tuples)
print(type(tuples))

[('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]
<class 'list'>


In [4]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])  # build the MultiIndex from the list of tuples
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [5]:
# Eight random integers labelled by the two-level index.
# No seed is set in this cell, so the values differ from run to run.
s = pd.Series(
    data=np.random.randint(low=-10, high=10, size=8),
    index=index,
)
s

first  second
bar    one       7
       two       3
baz    one       7
       two       5
foo    one      -6
       two       6
qux    one      -4
       two      -9
dtype: int32

![结果](https://cdn.jsdelivr.net/gh/pumpbumb/pictures/img/20220407211913.png)

---

# Basic indexing on axis with MultiIndex

In [6]:
# 3 x 8 frame of random normal draws: plain row labels A/B/C,
# with the two-level index used as the column labels.
df = pd.DataFrame(
    data=np.random.randn(3, 8),
    index=list("ABC"),
    columns=index,
)
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.711737,0.589683,0.59092,0.966168,-1.258487,0.422992,0.54305,-1.649523
B,0.983604,1.240281,0.10947,0.349078,0.610523,-1.039872,-0.192901,0.757224
C,-1.151463,-0.3576,0.460098,0.196811,0.428927,2.162378,-0.690727,-0.544239


In [7]:
# A first-level label alone returns the sub-frame of its second-level columns.
df['bar']

second,one,two
A,0.711737,0.589683
B,0.983604,1.240281
C,-1.151463,-0.3576


In [8]:
# Both levels at once select a single column; the Series is named by the full tuple.
df['bar', 'one']

A    0.711737
B    0.983604
C   -1.151463
Name: (bar, one), dtype: float64

In [9]:
# Same values via two successive selections; here the Series is named 'one' only.
df['bar']['one']

A    0.711737
B    0.983604
C   -1.151463
Name: one, dtype: float64

In [10]:
# The distinct labels held by each level of the column MultiIndex.
df.columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [11]:
# Level-0 ('first') label of every column kept after selecting 'foo' and 'qux'.
df[["foo", "qux"]].columns.get_level_values(0)

Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [12]:
# Level-1 ('second') label of every column kept after selecting 'foo' and 'qux'.
df[["foo", "qux"]].columns.get_level_values(1)

Index(['one', 'two', 'one', 'two'], dtype='object', name='second')

# Data alignment and using `reindex`

Operations between differently-indexed objects having MultiIndex on the axes will work as you expect; data alignment will work the same as an Index of tuples:

In [13]:
# Same label layout as before, this time held in NumPy string arrays:
# each outer label appears twice, and the inner labels alternate.
arrays = [
    np.repeat(["bar", "baz", "foo", "qux"], 2),
    np.tile(["one", "two"], 4),
]

# Passing the pair of arrays as `index` yields a two-level (unnamed) index.
s = pd.Series(np.random.randn(8), index=arrays)
s

bar  one   -1.266242
     two   -1.296507
baz  one   -0.748485
     two   -2.340093
foo  one   -0.403013
     two   -0.088235
qux  one   -0.608905
     two    2.741266
dtype: float64

In [14]:
# Positional slice: everything except the last two entries (the 'qux' rows).
s[:-2]

bar  one   -1.266242
     two   -1.296507
baz  one   -0.748485
     two   -2.340093
foo  one   -0.403013
     two   -0.088235
dtype: float64

In [16]:
# Addition aligns on the full two-level key; keys absent from one operand give NaN.
s + s[:-2]

bar  one   -2.532484
     two   -2.593014
baz  one   -1.496971
     two   -4.680185
foo  one   -0.806026
     two   -0.176471
qux  one         NaN
     two         NaN
dtype: float64

In [17]:
# The right operand keeps only every other row, so the skipped keys give NaN.
s + s[::2]

bar  one   -2.532484
     two         NaN
baz  one   -1.496971
     two         NaN
foo  one   -0.806026
     two         NaN
qux  one   -1.217810
     two         NaN
dtype: float64

### The `reindex()` method of Series/DataFrames can be called with another MultiIndex, or even a list or array of tuples:

In [20]:
# Show the named MultiIndex again before using part of it as a reindex target.
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [24]:
# Reindex with the first four keys of `index`; the result shows that index's level names.
s.reindex(index[:4])

first  second
bar    one      -1.266242
       two      -1.296507
baz    one      -0.748485
       two      -2.340093
dtype: float64

In [19]:
# Reindex with a plain list of tuples; rows come back in the requested order.
s.reindex([("foo", "two"), ("bar", "one"), ("qux", "one"), ("baz", "one")])

foo  two   -0.088235
bar  one   -1.266242
qux  one   -0.608905
baz  one   -0.748485
dtype: float64

In [26]:
# Same request, with the tuples wrapped in a NumPy array.
# NOTE(review): np.array() on equal-length tuples presumably builds a 2-D array;
# the recorded output shows each row was still treated as one key - confirm on your pandas version.
s.reindex(np.array([("foo", "two"), ("bar", "one"), ("qux", "one"), ("baz", "one")]))

foo  two   -0.088235
bar  one   -1.266242
qux  one   -0.608905
baz  one   -0.748485
dtype: float64

# Advanced indexing with hierarchical index

Syntactically integrating MultiIndex in advanced indexing with .loc is a bit challenging, but we’ve made every effort to do so. In general, MultiIndex keys take the form of tuples. For example, the following works as you would expect:

In [33]:
# Redisplay the frame with MultiIndex columns before transposing it.
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.711737,0.589683,0.59092,0.966168,-1.258487,0.422992,0.54305,-1.649523
B,0.983604,1.240281,0.10947,0.349078,0.610523,-1.039872,-0.192901,0.757224
C,-1.151463,-0.3576,0.460098,0.196811,0.428927,2.162378,-0.690727,-0.544239


In [37]:
# Transpose so the two-level index labels the rows instead of the columns.
df1 = df.transpose()
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.711737,0.983604,-1.151463
bar,two,0.589683,1.240281,-0.3576
baz,one,0.59092,0.10947,0.460098
baz,two,0.966168,0.349078,0.196811
foo,one,-1.258487,0.610523,0.428927
foo,two,0.422992,-1.039872,2.162378
qux,one,0.54305,-0.192901,-0.690727
qux,two,-1.649523,0.757224,-0.544239


In [42]:
# Returns a Series whose name is the tuple (bar, two); this is valid because
# a Series name is allowed to be a tuple.
df1.loc[("bar", "two")]

A    0.589683
B    1.240281
C   -0.357600
Name: (bar, two), dtype: float64

In [43]:
# Confirm that a full-key row lookup returns a pandas Series.
print(type(df1.loc[("bar", "two")]))

<class 'pandas.core.series.Series'>


In [60]:
df1.loc[('bar', 'two'), 'A']  # full row key plus a column label narrows down to a single element

0.5896826211307166

In [47]:
# "Partial" indexing: giving only the first-level label `bar` returns all of its rows,
# which are then labelled by the remaining `second` level alone.
df1.loc['bar']

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.711737,0.983604,-1.151463
two,0.589683,1.240281,-0.3576


In [49]:
# After partial indexing, the row index is a plain Index holding only the 'second' level.
print(df1.loc['bar'].index)

Index(['one', 'two'], dtype='object', name='second')


In [50]:
# "Partial" slicing also works: a slice over first-level labels returns every row
# from 'bar' through 'foo', with both endpoints included.
df1.loc['bar':'foo']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.711737,0.983604,-1.151463
bar,two,0.589683,1.240281,-0.3576
baz,one,0.59092,0.10947,0.460098
baz,two,0.966168,0.349078,0.196811
foo,one,-1.258487,0.610523,0.428927
foo,two,0.422992,-1.039872,2.162378


In [52]:
# You can slice with a "range" of values by providing a slice of tuples.
df1.loc[("baz", "two"):("qux", "one")]  # note: the two tuples are joined by a colon (a slice), not a comma!

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,0.966168,0.349078,0.196811
foo,one,-1.258487,0.610523,0.428927
foo,two,0.422992,-1.039872,2.162378
qux,one,0.54305,-0.192901,-0.690727


In [53]:
df1.loc[("baz", "two"):"foo"]  # the inner level may be omitted from a slice bound; 'foo' then covers all 'foo' rows

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,0.966168,0.349078,0.196811
foo,one,-1.258487,0.610523,0.428927
foo,two,0.422992,-1.039872,2.162378


In [57]:
# Passing a list of labels or tuples works similarly to reindexing:
# each tuple is one complete row key, and rows are returned in the listed order.
df1.loc[[("bar", "two"), ("qux", "one")]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,two,0.589683,1.240281,-0.3576
qux,one,0.54305,-0.192901,-0.690727


In [61]:
# Six integers keyed by every combination of an outer label (A, B)
# with an inner label (c, d, e).
s = pd.Series(
    list(range(1, 7)),
    index=pd.MultiIndex.from_product([list("AB"), list("cde")]),
)
s

A  c    1
   d    2
   e    3
B  c    4
   d    5
   e    6
dtype: int64

### Importantly, a list of tuples indexes several complete MultiIndex keys, whereas a tuple of lists refers to several values within a level:

In [65]:
# Each tuple is one complete (outer, inner) key, so exactly two rows are selected.
s.loc[[("A", "c"), ("B", "d")]]  # list of tuples

A  c    1
B  d    5
dtype: int64

In [66]:
# One list per level: every row whose outer label is A or B and whose inner label is c or d.
s.loc[(["A", "B"], ["c", "d"])]  # tuple of lists

A  c    1
   d    2
B  c    4
   d    5
dtype: int64

# Using slicers 使用切片器
>这部分往后内容都比较难，需要深度思考和实践！