# MultiIndex / advanced indexing
> https://pandas.pydata.org/docs/user_guide/advanced.html#advanced-indexing-with-hierarchical-index

## Hierarchical indexing (MultiIndex)

### Creating a MultiIndex (hierarchical index) object

In [2]:
import numpy as np
import pandas as pd

In [3]:
arrays = [
   ...:     ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
   ...:     ["one", "two", "one", "two", "one", "two", "one", "two"],
   ...: ]

In [4]:
tuples = list(zip(*arrays))

In [5]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [6]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [7]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [8]:
s = pd.Series(np.random.randn(8), index=index)

In [9]:
s

first  second
bar    one       1.147143
       two      -0.243880
baz    one      -0.203135
       two       0.287454
foo    one       0.101170
       two      -1.725604
qux    one      -0.574243
       two      -0.457899
dtype: float64

In [10]:
iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]]

In [11]:
pd.MultiIndex.from_product(iterables, names=["first", "second"])

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [12]:
df = pd.DataFrame(
   ....:     [["bar", "one"], ["bar", "two"], ["foo", "one"], ["foo", "two"]],
   ....:     columns=["first", "second"],
   ....: )

In [13]:
df

Unnamed: 0,first,second
0,bar,one
1,bar,two
2,foo,one
3,foo,two


In [14]:
pd.MultiIndex.from_frame(df)

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('foo', 'one'),
            ('foo', 'two')],
           names=['first', 'second'])

In [15]:
arrays = [
   ....:     np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]),
   ....:     np.array(["one", "two", "one", "two", "one", "two", "one", "two"]),
   ....: ]

In [16]:
s = pd.Series(np.random.randn(8), index=arrays)

In [17]:
s

bar  one   -0.008686
     two   -0.040822
baz  one    0.320482
     two   -1.208865
foo  one   -0.237236
     two   -0.382782
qux  one   -0.077091
     two    0.256874
dtype: float64

In [18]:
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)

In [19]:
df

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-0.665317,0.431389,1.245766,-0.054652
bar,two,-0.163111,-0.274178,-0.615384,-0.796229
baz,one,0.579002,-0.603986,-0.480271,0.122055
baz,two,-0.80065,0.017826,-0.104425,-0.282021
foo,one,-1.238522,-0.077817,-0.129949,-0.438459
foo,two,0.165208,0.941833,-0.038788,-2.139613
qux,one,2.782822,-0.432597,2.084745,-1.056715
qux,two,-1.398033,0.939473,-2.473552,-1.049366


In [20]:
df.index.names

FrozenList([None, None])

In [21]:
df = pd.DataFrame(np.random.randn(3,8), index=["A", "B", "C"], columns=index)

In [22]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,1.056486,0.125929,1.668292,0.667327,-1.379603,1.198458,-0.009858,0.675545
B,2.00317,1.287787,-0.101521,1.034863,1.262768,0.939427,0.869639,1.735534
C,0.166065,-0.107968,-0.792366,-0.362612,0.401966,-0.42889,0.329877,-0.345733


In [23]:
pd.DataFrame(np.random.randn(6,6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,1.613947,-0.274439,0.603092,-0.767093,-0.087154,0.80608
bar,two,-0.711183,0.804088,0.431496,-0.907052,0.208003,-1.137661
baz,one,-0.910068,-1.732214,-0.847777,0.382509,-1.471256,0.1478
baz,two,2.219388,1.763585,-0.671203,-0.213083,-0.573633,1.357328
foo,one,-0.341647,-0.069292,-2.326493,-0.125164,0.311547,0.046713
foo,two,1.002345,-0.901687,0.074765,0.02581,-0.113199,0.60357


In [24]:
with pd.option_context("display.multi_sparse", False):
    print(df)

first        bar       bar       baz       baz       foo       foo       qux  \
second       one       two       one       two       one       two       one   
A       1.056486  0.125929  1.668292  0.667327 -1.379603  1.198458 -0.009858   
B       2.003170  1.287787 -0.101521  1.034863  1.262768  0.939427  0.869639   
C       0.166065 -0.107968 -0.792366 -0.362612  0.401966 -0.428890  0.329877   

first        qux  
second       two  
A       0.675545  
B       1.735534  
C      -0.345733  


In [25]:
pd.Series(np.random.randn(8), index=tuples)

(bar, one)    1.651703
(bar, two)    0.349619
(baz, one)   -0.048165
(baz, two)   -1.088204
(foo, one)    0.675343
(foo, two)    1.513576
(qux, one)    0.177487
(qux, two)   -0.369777
dtype: float64

### Rescontructing the level labels

In [26]:
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [27]:
index.get_level_values(1)

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

In [28]:
index.get_level_values("second")

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

### Basic indexing on axis with MultiIndex

In [29]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,1.056486,0.125929,1.668292,0.667327,-1.379603,1.198458,-0.009858,0.675545
B,2.00317,1.287787,-0.101521,1.034863,1.262768,0.939427,0.869639,1.735534
C,0.166065,-0.107968,-0.792366,-0.362612,0.401966,-0.42889,0.329877,-0.345733


In [30]:
df["bar"]

second,one,two
A,1.056486,0.125929
B,2.00317,1.287787
C,0.166065,-0.107968


In [31]:
type(df["bar"])

pandas.core.frame.DataFrame

In [32]:
df["bar", "one"]

A    1.056486
B    2.003170
C    0.166065
Name: (bar, one), dtype: float64

In [33]:
s

bar  one   -0.008686
     two   -0.040822
baz  one    0.320482
     two   -1.208865
foo  one   -0.237236
     two   -0.382782
qux  one   -0.077091
     two    0.256874
dtype: float64

In [34]:
s.index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           )

In [35]:
type(s)

pandas.core.series.Series

In [36]:
s["qux"]

one   -0.077091
two    0.256874
dtype: float64

### Defined levels

In [37]:
df.columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [38]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,1.056486,0.125929,1.668292,0.667327,-1.379603,1.198458,-0.009858,0.675545
B,2.00317,1.287787,-0.101521,1.034863,1.262768,0.939427,0.869639,1.735534
C,0.166065,-0.107968,-0.792366,-0.362612,0.401966,-0.42889,0.329877,-0.345733


In [39]:
df[["foo", "qux"]].columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [40]:
df[["foo", "qux"]]

first,foo,foo,qux,qux
second,one,two,one,two
A,-1.379603,1.198458,-0.009858,0.675545
B,1.262768,0.939427,0.869639,1.735534
C,0.401966,-0.42889,0.329877,-0.345733


In [41]:
df[["foo", "qux"]].columns.to_numpy()

array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],
      dtype=object)

In [42]:
df[["foo", "qux"]].columns.get_level_values(0)

Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [43]:
df[["foo", "qux"]].columns.get_level_values(1)

Index(['one', 'two', 'one', 'two'], dtype='object', name='second')

In [44]:
new_mi = df[["foo", "qux"]].columns.remove_unused_levels()

In [45]:
new_mi.levels

FrozenList([['foo', 'qux'], ['one', 'two']])

In [46]:
new_mi

MultiIndex([('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [47]:
type(new_mi)

pandas.core.indexes.multi.MultiIndex

### Data alignment and using `reindex`

In [48]:
s

bar  one   -0.008686
     two   -0.040822
baz  one    0.320482
     two   -1.208865
foo  one   -0.237236
     two   -0.382782
qux  one   -0.077091
     two    0.256874
dtype: float64

In [49]:
type(s)

pandas.core.series.Series

In [50]:
s[:2]

bar  one   -0.008686
     two   -0.040822
dtype: float64

In [51]:
s + s[:2]

bar  one   -0.017372
     two   -0.081644
baz  one         NaN
     two         NaN
foo  one         NaN
     two         NaN
qux  one         NaN
     two         NaN
dtype: float64

In [52]:
s + s[::2]

bar  one   -0.017372
     two         NaN
baz  one    0.640963
     two         NaN
foo  one   -0.474472
     two         NaN
qux  one   -0.154182
     two         NaN
dtype: float64

In [53]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [54]:
index[:3]

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one')],
           names=['first', 'second'])

In [55]:
s.reindex(index[:3])

first  second
bar    one      -0.008686
       two      -0.040822
baz    one       0.320482
dtype: float64

In [56]:
s.reindex([("foo", "two"), ("bar", "one"), ("qux", "one"), ("baz", "one")])

foo  two   -0.382782
bar  one   -0.008686
qux  one   -0.077091
baz  one    0.320482
dtype: float64

## Advanced indexing with hierarchical index

In [57]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,1.056486,0.125929,1.668292,0.667327,-1.379603,1.198458,-0.009858,0.675545
B,2.00317,1.287787,-0.101521,1.034863,1.262768,0.939427,0.869639,1.735534
C,0.166065,-0.107968,-0.792366,-0.362612,0.401966,-0.42889,0.329877,-0.345733


In [58]:
df = df.T

In [59]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,1.056486,2.00317,0.166065
bar,two,0.125929,1.287787,-0.107968
baz,one,1.668292,-0.101521,-0.792366
baz,two,0.667327,1.034863,-0.362612
foo,one,-1.379603,1.262768,0.401966
foo,two,1.198458,0.939427,-0.42889
qux,one,-0.009858,0.869639,0.329877
qux,two,0.675545,1.735534,-0.345733


In [60]:
df.loc[("bar", "two")]

A    0.125929
B    1.287787
C   -0.107968
Name: (bar, two), dtype: float64

In [61]:
type(df.loc[("bar", "two")])

pandas.core.series.Series

In [62]:
df.loc["bar", "two"]  # lead to ambiguity!!!

A    0.125929
B    1.287787
C   -0.107968
Name: (bar, two), dtype: float64

In [63]:
df.loc[("bar", "two"), "A"]

0.12592905694029427

In [64]:
df.loc["bar"]

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1.056486,2.00317,0.166065
two,0.125929,1.287787,-0.107968


In [65]:
df["A"]

first  second
bar    one       1.056486
       two       0.125929
baz    one       1.668292
       two       0.667327
foo    one      -1.379603
       two       1.198458
qux    one      -0.009858
       two       0.675545
Name: A, dtype: float64

In [66]:
df.loc[("bar",),]

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1.056486,2.00317,0.166065
two,0.125929,1.287787,-0.107968


In [67]:
df.loc["baz":"foo"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,one,1.668292,-0.101521,-0.792366
baz,two,0.667327,1.034863,-0.362612
foo,one,-1.379603,1.262768,0.401966
foo,two,1.198458,0.939427,-0.42889


In [68]:
df.loc[("baz", "two"):("qux", "one")]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,0.667327,1.034863,-0.362612
foo,one,-1.379603,1.262768,0.401966
foo,two,1.198458,0.939427,-0.42889
qux,one,-0.009858,0.869639,0.329877


In [69]:
df.loc[("baz", "two"):"foo"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,0.667327,1.034863,-0.362612
foo,one,-1.379603,1.262768,0.401966
foo,two,1.198458,0.939427,-0.42889


In [70]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,1.056486,2.00317,0.166065
bar,two,0.125929,1.287787,-0.107968
baz,one,1.668292,-0.101521,-0.792366
baz,two,0.667327,1.034863,-0.362612
foo,one,-1.379603,1.262768,0.401966
foo,two,1.198458,0.939427,-0.42889
qux,one,-0.009858,0.869639,0.329877
qux,two,0.675545,1.735534,-0.345733


In [71]:
df.loc[[("bar", "two"), ("qux", "one")]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,two,0.125929,1.287787,-0.107968
qux,one,-0.009858,0.869639,0.329877


> It is important to note that tuples and lists are not treated identically in pandas when it comes to indexing. Whereas a tuple is interpreted as one multi-level key, a list is used to specify several keys. Or in other words, tuples go horizontally (traversing levels), lists go vertically (scanning levels).

In [72]:
s = pd.Series([1, 2, 3, 4, 5, 6],
    index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]),)

In [73]:
s

A  c    1
   d    2
   e    3
B  c    4
   d    5
   e    6
dtype: int64

In [74]:
s.loc[[("A", "c"), ("B", "d")]] # list of tuples

A  c    1
B  d    5
dtype: int64

In [75]:
s.loc[(["A", "B"], ["c", "d"])] # tuple of lists

A  c    1
   d    2
B  c    4
   d    5
dtype: int64

### Using slicers

In [76]:
def mklbl(prefix, n):
    return ['%s%s' % (prefix, i) for i in range(n)]

In [77]:
miindex = pd.MultiIndex.from_product([mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)])

In [78]:
pd.MultiIndex.from_product([mklbl("A", 4), mklbl("B", 2)])

MultiIndex([('A0', 'B0'),
            ('A0', 'B1'),
            ('A1', 'B0'),
            ('A1', 'B1'),
            ('A2', 'B0'),
            ('A2', 'B1'),
            ('A3', 'B0'),
            ('A3', 'B1')],
           )

In [79]:
pd.MultiIndex.from_product([mklbl("A", 4), mklbl("B", 2), mklbl("C", 4)])

MultiIndex([('A0', 'B0', 'C0'),
            ('A0', 'B0', 'C1'),
            ('A0', 'B0', 'C2'),
            ('A0', 'B0', 'C3'),
            ('A0', 'B1', 'C0'),
            ('A0', 'B1', 'C1'),
            ('A0', 'B1', 'C2'),
            ('A0', 'B1', 'C3'),
            ('A1', 'B0', 'C0'),
            ('A1', 'B0', 'C1'),
            ('A1', 'B0', 'C2'),
            ('A1', 'B0', 'C3'),
            ('A1', 'B1', 'C0'),
            ('A1', 'B1', 'C1'),
            ('A1', 'B1', 'C2'),
            ('A1', 'B1', 'C3'),
            ('A2', 'B0', 'C0'),
            ('A2', 'B0', 'C1'),
            ('A2', 'B0', 'C2'),
            ('A2', 'B0', 'C3'),
            ('A2', 'B1', 'C0'),
            ('A2', 'B1', 'C1'),
            ('A2', 'B1', 'C2'),
            ('A2', 'B1', 'C3'),
            ('A3', 'B0', 'C0'),
            ('A3', 'B0', 'C1'),
            ('A3', 'B0', 'C2'),
            ('A3', 'B0', 'C3'),
            ('A3', 'B1', 'C0'),
            ('A3', 'B1', 'C1'),
            ('A3', 'B1', 'C2'),
        

In [80]:
miindex

MultiIndex([('A0', 'B0', 'C0', 'D0'),
            ('A0', 'B0', 'C0', 'D1'),
            ('A0', 'B0', 'C1', 'D0'),
            ('A0', 'B0', 'C1', 'D1'),
            ('A0', 'B0', 'C2', 'D0'),
            ('A0', 'B0', 'C2', 'D1'),
            ('A0', 'B0', 'C3', 'D0'),
            ('A0', 'B0', 'C3', 'D1'),
            ('A0', 'B1', 'C0', 'D0'),
            ('A0', 'B1', 'C0', 'D1'),
            ('A0', 'B1', 'C1', 'D0'),
            ('A0', 'B1', 'C1', 'D1'),
            ('A0', 'B1', 'C2', 'D0'),
            ('A0', 'B1', 'C2', 'D1'),
            ('A0', 'B1', 'C3', 'D0'),
            ('A0', 'B1', 'C3', 'D1'),
            ('A1', 'B0', 'C0', 'D0'),
            ('A1', 'B0', 'C0', 'D1'),
            ('A1', 'B0', 'C1', 'D0'),
            ('A1', 'B0', 'C1', 'D1'),
            ('A1', 'B0', 'C2', 'D0'),
            ('A1', 'B0', 'C2', 'D1'),
            ('A1', 'B0', 'C3', 'D0'),
            ('A1', 'B0', 'C3', 'D1'),
            ('A1', 'B1', 'C0', 'D0'),
            ('A1', 'B1', 'C0', 'D1'),
            

In [81]:
len(miindex)

64

In [82]:
4 * 2 * 4 * 2

64

In [83]:
micolumns = pd.MultiIndex.from_tuples([("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lv10", "lv11"])

In [84]:
dfmi = (
    pd.DataFrame(
        np.arange(len(miindex) * len(micolumns)).reshape(
            (len(miindex), len(micolumns))
        ),
        index=miindex,
        columns=micolumns,
    )
    .sort_index()
    .sort_index(axis=1)
)

In [85]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [86]:
dfmi.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78
A1,B0,C3,D0,89,88,91,90
A1,B0,C3,D1,93,92,95,94
A1,B1,C1,D0,105,104,107,106
A1,B1,C1,D1,109,108,111,110
A1,B1,C3,D0,121,120,123,122
A1,B1,C3,D1,125,124,127,126
A2,B0,C1,D0,137,136,139,138
A2,B0,C1,D1,141,140,143,142


In [87]:
idx = pd.IndexSlice

In [88]:
dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [89]:
dfmi.loc["A1", (slice(None), "foo")]

Unnamed: 0_level_0,Unnamed: 1_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,lv11,foo,foo
B0,C0,D0,64,66
B0,C0,D1,68,70
B0,C1,D0,72,74
B0,C1,D1,76,78
B0,C2,D0,80,82
B0,C2,D1,84,86
B0,C3,D0,88,90
B0,C3,D1,92,94
B1,C0,D0,96,98
B1,C0,D1,100,102


In [90]:
dfmi.loc[idx[:, :,["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [91]:
dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [92]:
mask = dfmi[("a", "foo")] > 200

In [93]:
dfmi[("a", "foo")]

A0  B0  C0  D0      0
            D1      4
        C1  D0      8
            D1     12
        C2  D0     16
                 ... 
A3  B1  C1  D1    236
        C2  D0    240
            D1    244
        C3  D0    248
            D1    252
Name: (a, foo), Length: 64, dtype: int64

In [94]:
dfmi.loc[idx[mask, :, ["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,foo,foo
A3,B0,C1,D1,204,206
A3,B0,C3,D0,216,218
A3,B0,C3,D1,220,222
A3,B1,C1,D0,232,234
A3,B1,C1,D1,236,238
A3,B1,C3,D0,248,250
A3,B1,C3,D1,252,254


In [95]:
dfmi.loc(axis=0)[:, :, ["C1", "C3"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C3,D0,25,24,27,26
A0,B0,C3,D1,29,28,31,30
A0,B1,C1,D0,41,40,43,42
A0,B1,C1,D1,45,44,47,46
A0,B1,C3,D0,57,56,59,58
A0,B1,C3,D1,61,60,63,62
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78


In [96]:
df2 = dfmi.copy()

In [97]:
df2.loc(axis=0)[:, :, ["C1", "C3"]] = -10

In [98]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,-10,-10,-10,-10
A0,B0,C1,D1,-10,-10,-10,-10
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,-10,-10,-10,-10
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,-10,-10,-10,-10


In [99]:
df2 = dfmi.copy()

In [100]:
df2.loc[idx[:, :, ["C1", "C3"]], :] = df2 * 1000

In [101]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9000,8000,11000,10000
A0,B0,C1,D1,13000,12000,15000,14000
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237000,236000,239000,238000
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249000,248000,251000,250000


### Cross-section

In [102]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,1.056486,2.00317,0.166065
bar,two,0.125929,1.287787,-0.107968
baz,one,1.668292,-0.101521,-0.792366
baz,two,0.667327,1.034863,-0.362612
foo,one,-1.379603,1.262768,0.401966
foo,two,1.198458,0.939427,-0.42889
qux,one,-0.009858,0.869639,0.329877
qux,two,0.675545,1.735534,-0.345733


In [103]:
df.xs("one", level="second")

Unnamed: 0_level_0,A,B,C
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,1.056486,2.00317,0.166065
baz,1.668292,-0.101521,-0.792366
foo,-1.379603,1.262768,0.401966
qux,-0.009858,0.869639,0.329877


In [104]:
df.loc[(slice(None), "one"), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,1.056486,2.00317,0.166065
baz,one,1.668292,-0.101521,-0.792366
foo,one,-1.379603,1.262768,0.401966
qux,one,-0.009858,0.869639,0.329877


In [105]:
df = df.T

In [106]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,1.056486,0.125929,1.668292,0.667327,-1.379603,1.198458,-0.009858,0.675545
B,2.00317,1.287787,-0.101521,1.034863,1.262768,0.939427,0.869639,1.735534
C,0.166065,-0.107968,-0.792366,-0.362612,0.401966,-0.42889,0.329877,-0.345733


In [107]:
df.xs("one", level="second", axis=1)

first,bar,baz,foo,qux
A,1.056486,1.668292,-1.379603,-0.009858
B,2.00317,-0.101521,1.262768,0.869639
C,0.166065,-0.792366,0.401966,0.329877


In [108]:
df.loc[:, (slice(None), "one")]

first,bar,baz,foo,qux
second,one,one,one,one
A,1.056486,1.668292,-1.379603,-0.009858
B,2.00317,-0.101521,1.262768,0.869639
C,0.166065,-0.792366,0.401966,0.329877


In [109]:
df.xs(("one", "bar"), level=("second", "first"), axis=1)

first,bar
second,one
A,1.056486
B,2.00317
C,0.166065


In [110]:
df.loc[:, ("bar", "one")]

A    1.056486
B    2.003170
C    0.166065
Name: (bar, one), dtype: float64

In [111]:
df.xs("one", level="second", axis=1, drop_level=False)

first,bar,baz,foo,qux
second,one,one,one,one
A,1.056486,1.668292,-1.379603,-0.009858
B,2.00317,-0.101521,1.262768,0.869639
C,0.166065,-0.792366,0.401966,0.329877


In [112]:
df.xs("one", level="second", axis=1, drop_level=True)

first,bar,baz,foo,qux
A,1.056486,1.668292,-1.379603,-0.009858
B,2.00317,-0.101521,1.262768,0.869639
C,0.166065,-0.792366,0.401966,0.329877


### Advanced reindexing and alignment

In [113]:
midx = pd.MultiIndex(
    levels=[["zero", "one"], ["x", "y"]], codes=[[1,1,0,0], [1,0,1,0]]
)

In [114]:
midx

MultiIndex([( 'one', 'y'),
            ( 'one', 'x'),
            ('zero', 'y'),
            ('zero', 'x')],
           )

In [115]:
pd.MultiIndex(
    levels=[["zero", "one"], ["x", "y"]], codes=[[1,0,0, 1], [1,0,1,0]]
)

MultiIndex([( 'one', 'y'),
            ('zero', 'x'),
            ('zero', 'y'),
            ( 'one', 'x')],
           )

In [116]:
df = pd.DataFrame(np.random.randn(4,2), index=midx)

In [117]:
df

Unnamed: 0,Unnamed: 1,0,1
one,y,1.686035,-1.931286
one,x,-0.192168,-0.8295
zero,y,1.011643,0.36747
zero,x,-0.428094,0.557708


In [118]:
df2 = df.mean(level=0)

In [119]:
df2

Unnamed: 0,0,1
one,0.746934,-1.380393
zero,0.291774,0.462589


In [120]:
(-0.055193 -0.642253) / 2

-0.348723

In [121]:
df2.reindex(df.index, level=0)

Unnamed: 0,Unnamed: 1,0,1
one,y,0.746934,-1.380393
one,x,0.746934,-1.380393
zero,y,0.291774,0.462589
zero,x,0.291774,0.462589


In [122]:
df_ailgned, df2_aligned = df.align(df2, level=0)

### Swapping levels with swaplevel

In [123]:
df[:5]

Unnamed: 0,Unnamed: 1,0,1
one,y,1.686035,-1.931286
one,x,-0.192168,-0.8295
zero,y,1.011643,0.36747
zero,x,-0.428094,0.557708


In [124]:
df[:5].swaplevel(0,1,axis=0)

Unnamed: 0,Unnamed: 1,0,1
y,one,1.686035,-1.931286
x,one,-0.192168,-0.8295
y,zero,1.011643,0.36747
x,zero,-0.428094,0.557708


In [125]:
df[:5].reorder_levels([1,0], axis=0)

Unnamed: 0,Unnamed: 1,0,1
y,one,1.686035,-1.931286
x,one,-0.192168,-0.8295
y,zero,1.011643,0.36747
x,zero,-0.428094,0.557708


### Renaming names of an Index or MultiIndex

In [126]:
df.rename(columns={0: "col0", 1:"col1"})

Unnamed: 0,Unnamed: 1,col0,col1
one,y,1.686035,-1.931286
one,x,-0.192168,-0.8295
zero,y,1.011643,0.36747
zero,x,-0.428094,0.557708


In [127]:
df.rename(index={"one": "two", "y": "z"})

Unnamed: 0,Unnamed: 1,0,1
two,z,1.686035,-1.931286
two,x,-0.192168,-0.8295
zero,z,1.011643,0.36747
zero,x,-0.428094,0.557708


In [128]:
df.rename_axis(index=["abc", "def"])

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
abc,def,Unnamed: 2_level_1,Unnamed: 3_level_1
one,y,1.686035,-1.931286
one,x,-0.192168,-0.8295
zero,y,1.011643,0.36747
zero,x,-0.428094,0.557708


In [129]:
df.rename_axis(columns="Cols").columns

RangeIndex(start=0, stop=2, step=1, name='Cols')

In [130]:
df

Unnamed: 0,Unnamed: 1,0,1
one,y,1.686035,-1.931286
one,x,-0.192168,-0.8295
zero,y,1.011643,0.36747
zero,x,-0.428094,0.557708


In [131]:
mi = pd.MultiIndex.from_product([[1,2], ["a", "b"]], names=["x", "y"])

In [132]:
mi

MultiIndex([(1, 'a'),
            (1, 'b'),
            (2, 'a'),
            (2, 'b')],
           names=['x', 'y'])

In [133]:
mi.names

FrozenList(['x', 'y'])

In [134]:
mi2 = mi.rename("new name", level=0)

In [135]:
mi2

MultiIndex([(1, 'a'),
            (1, 'b'),
            (2, 'a'),
            (2, 'b')],
           names=['new name', 'y'])

In [136]:
# You cannot set the names of the MultiIndex via a level.
# mi.levels[0].name = "name via level"

## Sorting a MultiIndex

In [137]:
import random

In [138]:
random.shuffle(tuples)

In [139]:
tuples

[('bar', 'two'),
 ('baz', 'one'),
 ('bar', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('baz', 'two'),
 ('qux', 'two'),
 ('foo', 'one')]

In [140]:
s  = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))

In [141]:
s.sort_index()

bar  one    1.588487
     two   -1.104189
baz  one   -0.298863
     two    0.025731
foo  one    0.075274
     two    0.399557
qux  one    0.856413
     two    1.721948
dtype: float64

In [142]:
s.sort_index(level=0)

bar  one    1.588487
     two   -1.104189
baz  one   -0.298863
     two    0.025731
foo  one    0.075274
     two    0.399557
qux  one    0.856413
     two    1.721948
dtype: float64

In [143]:
s.sort_index(level=1)

bar  one    1.588487
baz  one   -0.298863
foo  one    0.075274
qux  one    0.856413
bar  two   -1.104189
baz  two    0.025731
foo  two    0.399557
qux  two    1.721948
dtype: float64

In [144]:
s.index.set_names(["L1", "L2"], inplace=True)

In [145]:
s.sort_index(level="L1")

L1   L2 
bar  one    1.588487
     two   -1.104189
baz  one   -0.298863
     two    0.025731
foo  one    0.075274
     two    0.399557
qux  one    0.856413
     two    1.721948
dtype: float64

In [146]:
s.sort_index(level="L2")

L1   L2 
bar  one    1.588487
baz  one   -0.298863
foo  one    0.075274
qux  one    0.856413
bar  two   -1.104189
baz  two    0.025731
foo  two    0.399557
qux  two    1.721948
dtype: float64

In [147]:
df.T.sort_index(level=1, axis=1)

Unnamed: 0_level_0,one,zero,one,zero
Unnamed: 0_level_1,x,x,y,y
0,-0.192168,-0.428094,1.686035,1.011643
1,-0.8295,0.557708,-1.931286,0.36747


In [148]:
dfm = pd.DataFrame(
    {"jim": [0,0,1,1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)}
)

In [149]:
dfm = dfm.set_index(["jim", "joe"])

In [150]:
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.244344
0,x,0.656066
1,z,0.961588
1,y,0.847174


In [151]:
dfm.loc[((1,'z'))]

  dfm.loc[((1,'z'))]


Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
1,z,0.961588


In [152]:
# dfm.loc[(0, 'y'):(1,'z')]

In [153]:
dfm.index.is_lexsorted()

False

In [154]:
dfm.index.lexsort_depth

1

In [155]:
dfm = dfm.sort_index()

In [156]:
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.244344
0,x,0.656066
1,y,0.847174
1,z,0.961588


In [157]:
df.index.is_lexsorted()

False

In [158]:
dfm.index.lexsort_depth

2

In [159]:
dfm.loc[(0, 'y'):(1,'z')]

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
1,y,0.847174
1,z,0.961588


## Take methods

In [160]:
index = pd.Index(np.random.randint(0,1000,10))

In [161]:
index

Int64Index([918, 42, 421, 454, 647, 266, 385, 924, 992, 713], dtype='int64')

In [162]:
positions = [0,9,3]

In [163]:
index[positions]

Int64Index([918, 713, 454], dtype='int64')

In [164]:
index.take(positions)

Int64Index([918, 713, 454], dtype='int64')

In [165]:
ser = pd.Series(np.random.randn(10))

In [166]:
ser.iloc[positions]

0   -0.421720
9   -0.965177
3   -0.054751
dtype: float64

In [167]:
ser.take(positions)

0   -0.421720
9   -0.965177
3   -0.054751
dtype: float64

In [168]:
frm = pd.DataFrame(np.random.randn(5,3))

In [169]:
frm

Unnamed: 0,0,1,2
0,-0.29892,0.911416,0.251053
1,0.129913,-0.956189,-1.776143
2,-1.465286,1.999764,-1.30365
3,-1.134783,0.559337,-1.586181
4,1.115177,-1.540541,0.051761


In [170]:
frm.take([1,4,3])

Unnamed: 0,0,1,2
1,0.129913,-0.956189,-1.776143
4,1.115177,-1.540541,0.051761
3,-1.134783,0.559337,-1.586181


In [171]:
frm.take([0,2], axis=1)

Unnamed: 0,0,2
0,-0.29892,0.251053
1,0.129913,-1.776143
2,-1.465286,-1.30365
3,-1.134783,-1.586181
4,1.115177,0.051761


In [175]:
arr = np.random.randn(10)

In [176]:
arr

array([-1.05860411,  0.02335861, -1.74817545, -0.88844715, -1.57449312,
        0.40657269,  0.05599295, -1.11693399, -0.76971779,  0.06798818])

In [177]:
arr.take([False, False, True, True])

array([-1.05860411, -1.05860411,  0.02335861,  0.02335861])

In [179]:
arr[[0,1]]

array([-1.05860411,  0.02335861])

In [180]:
ser = pd.Series(np.random.randn(10))

In [181]:
ser.take([False, False, True, True])

0   -0.076486
0   -0.076486
1   -1.931413
1   -1.931413
dtype: float64

In [182]:
ser.iloc[[0,1]]

0   -0.076486
1   -1.931413
dtype: float64

In [183]:
arr = np.random.randn(10000, 5)

In [187]:
indexer = np.arange(10000)

In [188]:
random.shuffle(indexer)

In [189]:
%timeit arr[indexer]

176 µs ± 17.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [190]:
%timeit arr.take(indexer, axis=0)

64.6 µs ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [191]:
ser = pd.Series(arr[:, 0])

In [192]:
%timeit ser.iloc[indexer]

159 µs ± 19.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [193]:
%timeit ser.take(indexer)

146 µs ± 18.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Index types

### CategoricalIndex

In [194]:
from pandas.api.types import CategoricalDtype

In [195]:
df = pd.DataFrame({"A": np.arange(6), "B": list("aabbca")})

In [196]:
df

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,c
5,5,a


In [197]:
df["B"] = df["B"].astype(CategoricalDtype(list("cab")))

In [198]:
df

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,c
5,5,a


In [199]:
df.dtypes

A       int64
B    category
dtype: object

In [200]:
df["B"].cat.categories

Index(['c', 'a', 'b'], dtype='object')

In [201]:
df2 = df.set_index("B")

In [202]:
df2.index

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [204]:
df2

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
b,2
b,3
c,4
a,5


In [203]:
df2.loc["a"]

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
a,5


In [205]:
df2.loc["a"].index

CategoricalIndex(['a', 'a', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [206]:
df2.sort_index()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
c,4
a,0
a,1
a,5
b,2
b,3


In [207]:
df2.groupby(level=0).sum()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
c,4
a,6
b,5


In [208]:
df2.groupby(level=0).sum().index

CategoricalIndex(['c', 'a', 'b'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [209]:
df3 = pd.DataFrame(
    {"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")}
)

In [210]:
df3

Unnamed: 0,A,B
0,0,a
1,1,b
2,2,c


In [211]:
df3 = df3.set_index("B")

In [212]:
df3

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
b,1
c,2


In [213]:
df3.reindex(["a", "e"])

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0.0
e,


In [214]:
df3.reindex(["a", "e"]).index

Index(['a', 'e'], dtype='object', name='B')

In [215]:
df3.reindex(pd.Categorical(["a", "e"], categories=list("abe")))

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0.0
e,


In [216]:
df3.reindex(pd.Categorical(["a", "e"], categories=list("abe"))).index

CategoricalIndex(['a', 'e'], categories=['a', 'b', 'e'], ordered=False, name='B', dtype='category')

In [217]:
df4 = pd.DataFrame({"A": np.arange(2), "B": list("ba")})