# MultiIndex / advanced indexing
> https://pandas.pydata.org/docs/user_guide/advanced.html#advanced-indexing-with-hierarchical-index

## Hierarchical indexing (MultiIndex)

### Creating a MultiIndex (hierarchical index) object

In [1]:
import numpy as np
import pandas as pd

In [2]:
arrays = [
   ...:     ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
   ...:     ["one", "two", "one", "two", "one", "two", "one", "two"],
   ...: ]

In [3]:
tuples = list(zip(*arrays))

In [4]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [5]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [6]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [7]:
s = pd.Series(np.random.randn(8), index=index)

In [8]:
s

first  second
bar    one      -1.305103
       two      -0.871037
baz    one       0.441388
       two       1.045646
foo    one      -3.267235
       two      -0.159993
qux    one      -0.951188
       two      -1.591583
dtype: float64

In [9]:
iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]]

In [10]:
pd.MultiIndex.from_product(iterables, names=["first", "second"])

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [11]:
df = pd.DataFrame(
   ....:     [["bar", "one"], ["bar", "two"], ["foo", "one"], ["foo", "two"]],
   ....:     columns=["first", "second"],
   ....: )

In [12]:
df

Unnamed: 0,first,second
0,bar,one
1,bar,two
2,foo,one
3,foo,two


In [13]:
pd.MultiIndex.from_frame(df)

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('foo', 'one'),
            ('foo', 'two')],
           names=['first', 'second'])

In [14]:
arrays = [
   ....:     np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]),
   ....:     np.array(["one", "two", "one", "two", "one", "two", "one", "two"]),
   ....: ]

In [15]:
s = pd.Series(np.random.randn(8), index=arrays)

In [16]:
s

bar  one    0.779771
     two   -0.921293
baz  one    1.346612
     two    0.398405
foo  one    0.331960
     two    1.692337
qux  one   -0.948225
     two   -1.106752
dtype: float64

In [17]:
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)

In [18]:
df

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,0.119003,0.187999,-0.31457,0.059115
bar,two,-1.288561,0.728717,1.188854,-1.078912
baz,one,-0.986029,1.045426,-0.925135,1.632275
baz,two,2.174691,0.578743,-0.266996,-0.733817
foo,one,0.802504,-0.384544,1.314185,-1.474942
foo,two,-1.526377,-0.271441,1.332812,1.064613
qux,one,-0.755631,0.125257,0.438465,1.909822
qux,two,-2.248991,-0.666946,0.829409,0.142948


In [19]:
df.index.names

FrozenList([None, None])

In [20]:
df = pd.DataFrame(np.random.randn(3,8), index=["A", "B", "C"], columns=index)

In [21]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,-1.129645,-1.148067,0.383512,0.175234,1.682195,0.424873,0.205087,-0.57935
B,-0.953406,-0.196163,-0.726828,0.221658,-0.85063,2.03903,-2.8652,0.998255
C,-0.757268,2.049972,0.728863,-0.829424,1.063771,-0.091753,0.387784,0.538309


In [22]:
pd.DataFrame(np.random.randn(6,6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,0.748823,0.395316,1.235987,-0.199643,-0.522501,0.048851
bar,two,-0.078967,-0.249171,-0.537105,-0.408189,0.05492,0.915509
baz,one,-1.331309,-1.350481,-1.711529,-0.756529,2.010548,-0.168475
baz,two,0.515114,-0.513228,1.570405,0.216595,0.505545,0.608356
foo,one,0.357755,-0.905813,0.552185,0.102509,-1.350912,-0.712057
foo,two,0.027545,1.909855,-0.319934,-1.261065,-1.743929,-0.608337


In [23]:
with pd.option_context("display.multi_sparse", False):
    print(df)

first        bar       bar       baz       baz       foo       foo       qux  \
second       one       two       one       two       one       two       one   
A      -1.129645 -1.148067  0.383512  0.175234  1.682195  0.424873  0.205087   
B      -0.953406 -0.196163 -0.726828  0.221658 -0.850630  2.039030 -2.865200   
C      -0.757268  2.049972  0.728863 -0.829424  1.063771 -0.091753  0.387784   

first        qux  
second       two  
A      -0.579350  
B       0.998255  
C       0.538309  


In [24]:
pd.Series(np.random.randn(8), index=tuples)

(bar, one)    0.415960
(bar, two)    0.187235
(baz, one)    0.557852
(baz, two)   -1.091558
(foo, one)    1.502113
(foo, two)    1.271822
(qux, one)    1.025286
(qux, two)    0.009874
dtype: float64

### Reconstructing the level labels

In [25]:
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [26]:
index.get_level_values(1)

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

In [27]:
index.get_level_values("second")

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

### Basic indexing on axis with MultiIndex

In [28]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,-1.129645,-1.148067,0.383512,0.175234,1.682195,0.424873,0.205087,-0.57935
B,-0.953406,-0.196163,-0.726828,0.221658,-0.85063,2.03903,-2.8652,0.998255
C,-0.757268,2.049972,0.728863,-0.829424,1.063771,-0.091753,0.387784,0.538309


In [29]:
df["bar"]

second,one,two
A,-1.129645,-1.148067
B,-0.953406,-0.196163
C,-0.757268,2.049972


In [30]:
type(df["bar"])

pandas.core.frame.DataFrame

In [31]:
df["bar", "one"]

A   -1.129645
B   -0.953406
C   -0.757268
Name: (bar, one), dtype: float64

In [32]:
s

bar  one    0.779771
     two   -0.921293
baz  one    1.346612
     two    0.398405
foo  one    0.331960
     two    1.692337
qux  one   -0.948225
     two   -1.106752
dtype: float64

In [33]:
s.index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           )

In [34]:
type(s)

pandas.core.series.Series

In [35]:
s["qux"]

one   -0.948225
two   -1.106752
dtype: float64

### Defined levels

In [36]:
df.columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [37]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,-1.129645,-1.148067,0.383512,0.175234,1.682195,0.424873,0.205087,-0.57935
B,-0.953406,-0.196163,-0.726828,0.221658,-0.85063,2.03903,-2.8652,0.998255
C,-0.757268,2.049972,0.728863,-0.829424,1.063771,-0.091753,0.387784,0.538309


In [38]:
df[["foo", "qux"]].columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [39]:
df[["foo", "qux"]]

first,foo,foo,qux,qux
second,one,two,one,two
A,1.682195,0.424873,0.205087,-0.57935
B,-0.85063,2.03903,-2.8652,0.998255
C,1.063771,-0.091753,0.387784,0.538309


In [40]:
df[["foo", "qux"]].columns.to_numpy()

array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],
      dtype=object)

In [41]:
df[["foo", "qux"]].columns.get_level_values(0)

Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [42]:
df[["foo", "qux"]].columns.get_level_values(1)

Index(['one', 'two', 'one', 'two'], dtype='object', name='second')

In [43]:
new_mi = df[["foo", "qux"]].columns.remove_unused_levels()

In [44]:
new_mi.levels

FrozenList([['foo', 'qux'], ['one', 'two']])

In [45]:
new_mi

MultiIndex([('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [46]:
type(new_mi)

pandas.core.indexes.multi.MultiIndex

### Data alignment and using `reindex`

In [47]:
s

bar  one    0.779771
     two   -0.921293
baz  one    1.346612
     two    0.398405
foo  one    0.331960
     two    1.692337
qux  one   -0.948225
     two   -1.106752
dtype: float64

In [48]:
type(s)

pandas.core.series.Series

In [49]:
s[:2]

bar  one    0.779771
     two   -0.921293
dtype: float64

In [50]:
s + s[:2]

bar  one    1.559542
     two   -1.842586
baz  one         NaN
     two         NaN
foo  one         NaN
     two         NaN
qux  one         NaN
     two         NaN
dtype: float64

In [51]:
s + s[::2]

bar  one    1.559542
     two         NaN
baz  one    2.693224
     two         NaN
foo  one    0.663919
     two         NaN
qux  one   -1.896450
     two         NaN
dtype: float64

In [52]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [53]:
index[:3]

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one')],
           names=['first', 'second'])

In [54]:
s.reindex(index[:3])

first  second
bar    one       0.779771
       two      -0.921293
baz    one       1.346612
dtype: float64

In [55]:
s.reindex([("foo", "two"), ("bar", "one"), ("qux", "one"), ("baz", "one")])

foo  two    1.692337
bar  one    0.779771
qux  one   -0.948225
baz  one    1.346612
dtype: float64

## Advanced indexing with hierarchical index

In [56]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,-1.129645,-1.148067,0.383512,0.175234,1.682195,0.424873,0.205087,-0.57935
B,-0.953406,-0.196163,-0.726828,0.221658,-0.85063,2.03903,-2.8652,0.998255
C,-0.757268,2.049972,0.728863,-0.829424,1.063771,-0.091753,0.387784,0.538309


In [57]:
df = df.T

In [58]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,-1.129645,-0.953406,-0.757268
bar,two,-1.148067,-0.196163,2.049972
baz,one,0.383512,-0.726828,0.728863
baz,two,0.175234,0.221658,-0.829424
foo,one,1.682195,-0.85063,1.063771
foo,two,0.424873,2.03903,-0.091753
qux,one,0.205087,-2.8652,0.387784
qux,two,-0.57935,0.998255,0.538309


In [59]:
df.loc[("bar", "two")]

A   -1.148067
B   -0.196163
C    2.049972
Name: (bar, two), dtype: float64

In [60]:
type(df.loc[("bar", "two")])

pandas.core.series.Series

In [61]:
df.loc["bar", "two"]  # shorthand without the tuple; this form can lead to ambiguity

A   -1.148067
B   -0.196163
C    2.049972
Name: (bar, two), dtype: float64

In [62]:
df.loc[("bar", "two"), "A"]

-1.1480665754013264

In [63]:
df.loc["bar"]

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,-1.129645,-0.953406,-0.757268
two,-1.148067,-0.196163,2.049972


In [64]:
df["A"]

first  second
bar    one      -1.129645
       two      -1.148067
baz    one       0.383512
       two       0.175234
foo    one       1.682195
       two       0.424873
qux    one       0.205087
       two      -0.579350
Name: A, dtype: float64

In [65]:
df.loc[("bar",),]

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,-1.129645,-0.953406,-0.757268
two,-1.148067,-0.196163,2.049972


In [66]:
df.loc["baz":"foo"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,one,0.383512,-0.726828,0.728863
baz,two,0.175234,0.221658,-0.829424
foo,one,1.682195,-0.85063,1.063771
foo,two,0.424873,2.03903,-0.091753


In [67]:
df.loc[("baz", "two"):("qux", "one")]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,0.175234,0.221658,-0.829424
foo,one,1.682195,-0.85063,1.063771
foo,two,0.424873,2.03903,-0.091753
qux,one,0.205087,-2.8652,0.387784


In [68]:
df.loc[("baz", "two"):"foo"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,0.175234,0.221658,-0.829424
foo,one,1.682195,-0.85063,1.063771
foo,two,0.424873,2.03903,-0.091753


In [69]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,-1.129645,-0.953406,-0.757268
bar,two,-1.148067,-0.196163,2.049972
baz,one,0.383512,-0.726828,0.728863
baz,two,0.175234,0.221658,-0.829424
foo,one,1.682195,-0.85063,1.063771
foo,two,0.424873,2.03903,-0.091753
qux,one,0.205087,-2.8652,0.387784
qux,two,-0.57935,0.998255,0.538309


In [70]:
df.loc[[("bar", "two"), ("qux", "one")]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,two,-1.148067,-0.196163,2.049972
qux,one,0.205087,-2.8652,0.387784


> It is important to note that tuples and lists are not treated identically in pandas when it comes to indexing. Whereas a tuple is interpreted as one multi-level key, a list is used to specify several keys. Or in other words, tuples go horizontally (traversing levels), lists go vertically (scanning levels).

In [71]:
# Integer Series whose index is the 2 x 3 product of outer and inner labels.
s = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]),
)

In [72]:
s

A  c    1
   d    2
   e    3
B  c    4
   d    5
   e    6
dtype: int64

In [73]:
s.loc[[("A", "c"), ("B", "d")]] # list of tuples

A  c    1
B  d    5
dtype: int64

In [74]:
s.loc[(["A", "B"], ["c", "d"])] # tuple of lists

A  c    1
   d    2
B  c    4
   d    5
dtype: int64

### Using slicers

In [75]:
def mklbl(prefix, n):
    """Return ``n`` labels made of ``prefix`` followed by 0, 1, ..., n - 1.

    For example, ``mklbl("A", 3)`` gives ``["A0", "A1", "A2"]``. A
    non-positive ``n`` yields an empty list because ``range(n)`` is empty.
    """
    return [f"{prefix}{i}" for i in range(n)]

In [76]:
miindex = pd.MultiIndex.from_product([mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)])

In [77]:
pd.MultiIndex.from_product([mklbl("A", 4), mklbl("B", 2)])

MultiIndex([('A0', 'B0'),
            ('A0', 'B1'),
            ('A1', 'B0'),
            ('A1', 'B1'),
            ('A2', 'B0'),
            ('A2', 'B1'),
            ('A3', 'B0'),
            ('A3', 'B1')],
           )

In [78]:
pd.MultiIndex.from_product([mklbl("A", 4), mklbl("B", 2), mklbl("C", 4)])

MultiIndex([('A0', 'B0', 'C0'),
            ('A0', 'B0', 'C1'),
            ('A0', 'B0', 'C2'),
            ('A0', 'B0', 'C3'),
            ('A0', 'B1', 'C0'),
            ('A0', 'B1', 'C1'),
            ('A0', 'B1', 'C2'),
            ('A0', 'B1', 'C3'),
            ('A1', 'B0', 'C0'),
            ('A1', 'B0', 'C1'),
            ('A1', 'B0', 'C2'),
            ('A1', 'B0', 'C3'),
            ('A1', 'B1', 'C0'),
            ('A1', 'B1', 'C1'),
            ('A1', 'B1', 'C2'),
            ('A1', 'B1', 'C3'),
            ('A2', 'B0', 'C0'),
            ('A2', 'B0', 'C1'),
            ('A2', 'B0', 'C2'),
            ('A2', 'B0', 'C3'),
            ('A2', 'B1', 'C0'),
            ('A2', 'B1', 'C1'),
            ('A2', 'B1', 'C2'),
            ('A2', 'B1', 'C3'),
            ('A3', 'B0', 'C0'),
            ('A3', 'B0', 'C1'),
            ('A3', 'B0', 'C2'),
            ('A3', 'B0', 'C3'),
            ('A3', 'B1', 'C0'),
            ('A3', 'B1', 'C1'),
            ('A3', 'B1', 'C2'),
            ('A3', 'B1', 'C3')],
           )

In [79]:
miindex

MultiIndex([('A0', 'B0', 'C0', 'D0'),
            ('A0', 'B0', 'C0', 'D1'),
            ('A0', 'B0', 'C1', 'D0'),
            ('A0', 'B0', 'C1', 'D1'),
            ('A0', 'B0', 'C2', 'D0'),
            ('A0', 'B0', 'C2', 'D1'),
            ('A0', 'B0', 'C3', 'D0'),
            ('A0', 'B0', 'C3', 'D1'),
            ('A0', 'B1', 'C0', 'D0'),
            ('A0', 'B1', 'C0', 'D1'),
            ('A0', 'B1', 'C1', 'D0'),
            ('A0', 'B1', 'C1', 'D1'),
            ('A0', 'B1', 'C2', 'D0'),
            ('A0', 'B1', 'C2', 'D1'),
            ('A0', 'B1', 'C3', 'D0'),
            ('A0', 'B1', 'C3', 'D1'),
            ('A1', 'B0', 'C0', 'D0'),
            ('A1', 'B0', 'C0', 'D1'),
            ('A1', 'B0', 'C1', 'D0'),
            ('A1', 'B0', 'C1', 'D1'),
            ('A1', 'B0', 'C2', 'D0'),
            ('A1', 'B0', 'C2', 'D1'),
            ('A1', 'B0', 'C3', 'D0'),
            ('A1', 'B0', 'C3', 'D1'),
            ('A1', 'B1', 'C0', 'D0'),
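            ('A1', 'B1', 'C0', 'D1'),
            ('A1', 'B1', 'C1', 'D0'),
            ('A1', 'B1', 'C1', 'D1'),
            ('A1', 'B1', 'C2', 'D0'),
            ('A1', 'B1', 'C2', 'D1'),
            ('A1', 'B1', 'C3', 'D0'),
            ('A1', 'B1', 'C3', 'D1'),
            ('A2', 'B0', 'C0', 'D0'),
            ('A2', 'B0', 'C0', 'D1'),
            ('A2', 'B0', 'C1', 'D0'),
            ('A2', 'B0', 'C1', 'D1'),
            ('A2', 'B0', 'C2', 'D0'),
            ('A2', 'B0', 'C2', 'D1'),
            ('A2', 'B0', 'C3', 'D0'),
            ('A2', 'B0', 'C3', 'D1'),
            ('A2', 'B1', 'C0', 'D0'),
            ('A2', 'B1', 'C0', 'D1'),
            ('A2', 'B1', 'C1', 'D0'),
            ('A2', 'B1', 'C1', 'D1'),
            ('A2', 'B1', 'C2', 'D0'),
            ('A2', 'B1', 'C2', 'D1'),
            ('A2', 'B1', 'C3', 'D0'),
            ('A2', 'B1', 'C3', 'D1'),
            ('A3', 'B0', 'C0', 'D0'),
            ('A3', 'B0', 'C0', 'D1'),
            ('A3', 'B0', 'C1', 'D0'),
            ('A3', 'B0', 'C1', 'D1'),
            ('A3', 'B0', 'C2', 'D0'),
            ('A3', 'B0', 'C2', 'D1'),
            ('A3', 'B0', 'C3', 'D0'),
            ('A3', 'B0', 'C3', 'D1'),
            ('A3', 'B1', 'C0', 'D0'),
            ('A3', 'B1', 'C0', 'D1'),
            ('A3', 'B1', 'C1', 'D0'),
            ('A3', 'B1', 'C1', 'D1'),
            ('A3', 'B1', 'C2', 'D0'),
            ('A3', 'B1', 'C2', 'D1'),
            ('A3', 'B1', 'C3', 'D0'),
            ('A3', 'B1', 'C3', 'D1')],
           )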

In [80]:
len(miindex)

64

In [81]:
4 * 2 * 4 * 2

64

In [82]:
# Two-level column index; the levels are named "lv10" and "lv11".
micolumns = pd.MultiIndex.from_tuples(
    [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
    names=["lv10", "lv11"],
)

In [83]:
dfmi = (
    pd.DataFrame(
        np.arange(len(miindex) * len(micolumns)).reshape(
            (len(miindex), len(micolumns))
        ),
        index=miindex,
        columns=micolumns,
    )
    .sort_index()
    .sort_index(axis=1)
)

In [84]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237,236,239,238
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249,248,251,250


In [85]:
dfmi.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78
A1,B0,C3,D0,89,88,91,90
A1,B0,C3,D1,93,92,95,94
A1,B1,C1,D0,105,104,107,106
A1,B1,C1,D1,109,108,111,110
A1,B1,C3,D0,121,120,123,122
A1,B1,C3,D1,125,124,127,126
A2,B0,C1,D0,137,136,139,138
A2,B0,C1,D1,141,140,143,142


In [86]:
idx = pd.IndexSlice

In [87]:
dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [88]:
dfmi.loc["A1", (slice(None), "foo")]

Unnamed: 0_level_0,Unnamed: 1_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,lv11,foo,foo
B0,C0,D0,64,66
B0,C0,D1,68,70
B0,C1,D0,72,74
B0,C1,D1,76,78
B0,C2,D0,80,82
B0,C2,D1,84,86
B0,C3,D0,88,90
B0,C3,D1,92,94
B1,C0,D0,96,98
B1,C0,D1,100,102


In [89]:
dfmi.loc[idx[:, :,["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [90]:
dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,foo,foo
A0,B0,C1,D0,8,10
A0,B0,C1,D1,12,14
A0,B0,C3,D0,24,26
A0,B0,C3,D1,28,30
A0,B1,C1,D0,40,42
A0,B1,C1,D1,44,46
A0,B1,C3,D0,56,58
A0,B1,C3,D1,60,62
A1,B0,C1,D0,72,74
A1,B0,C1,D1,76,78


In [91]:
mask = dfmi[("a", "foo")] > 200

In [92]:
dfmi[("a", "foo")]

A0  B0  C0  D0      0
            D1      4
        C1  D0      8
            D1     12
        C2  D0     16
                 ... 
A3  B1  C1  D1    236
        C2  D0    240
            D1    244
        C3  D0    248
            D1    252
Name: (a, foo), Length: 64, dtype: int64

In [93]:
dfmi.loc[idx[mask, :, ["C1", "C3"]], idx[:, "foo"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,foo,foo
A3,B0,C1,D1,204,206
A3,B0,C3,D0,216,218
A3,B0,C3,D1,220,222
A3,B1,C1,D0,232,234
A3,B1,C1,D1,236,238
A3,B1,C3,D0,248,250
A3,B1,C3,D1,252,254


In [94]:
dfmi.loc(axis=0)[:, :, ["C1", "C3"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A0,B0,C1,D0,9,8,11,10
A0,B0,C1,D1,13,12,15,14
A0,B0,C3,D0,25,24,27,26
A0,B0,C3,D1,29,28,31,30
A0,B1,C1,D0,41,40,43,42
A0,B1,C1,D1,45,44,47,46
A0,B1,C3,D0,57,56,59,58
A0,B1,C3,D1,61,60,63,62
A1,B0,C1,D0,73,72,75,74
A1,B0,C1,D1,77,76,79,78


In [95]:
df2 = dfmi.copy()

In [96]:
df2.loc(axis=0)[:, :, ["C1", "C3"]] = -10

In [97]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,-10,-10,-10,-10
A0,B0,C1,D1,-10,-10,-10,-10
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,-10,-10,-10,-10
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,-10,-10,-10,-10


In [98]:
df2 = dfmi.copy()

In [99]:
df2.loc[idx[:, :, ["C1", "C3"]], :] = df2 * 1000

In [100]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lv10,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,lv11,bar,foo,bah,foo
A0,B0,C0,D0,1,0,3,2
A0,B0,C0,D1,5,4,7,6
A0,B0,C1,D0,9000,8000,11000,10000
A0,B0,C1,D1,13000,12000,15000,14000
A0,B0,C2,D0,17,16,19,18
...,...,...,...,...,...,...,...
A3,B1,C1,D1,237000,236000,239000,238000
A3,B1,C2,D0,241,240,243,242
A3,B1,C2,D1,245,244,247,246
A3,B1,C3,D0,249000,248000,251000,250000


### Cross-section

In [101]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,-1.129645,-0.953406,-0.757268
bar,two,-1.148067,-0.196163,2.049972
baz,one,0.383512,-0.726828,0.728863
baz,two,0.175234,0.221658,-0.829424
foo,one,1.682195,-0.85063,1.063771
foo,two,0.424873,2.03903,-0.091753
qux,one,0.205087,-2.8652,0.387784
qux,two,-0.57935,0.998255,0.538309


In [102]:
df.xs("one", level="second")

Unnamed: 0_level_0,A,B,C
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-1.129645,-0.953406,-0.757268
baz,0.383512,-0.726828,0.728863
foo,1.682195,-0.85063,1.063771
qux,0.205087,-2.8652,0.387784


In [103]:
df.loc[(slice(None), "one"), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,-1.129645,-0.953406,-0.757268
baz,one,0.383512,-0.726828,0.728863
foo,one,1.682195,-0.85063,1.063771
qux,one,0.205087,-2.8652,0.387784


In [104]:
df = df.T

In [105]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,-1.129645,-1.148067,0.383512,0.175234,1.682195,0.424873,0.205087,-0.57935
B,-0.953406,-0.196163,-0.726828,0.221658,-0.85063,2.03903,-2.8652,0.998255
C,-0.757268,2.049972,0.728863,-0.829424,1.063771,-0.091753,0.387784,0.538309


In [106]:
df.xs("one", level="second", axis=1)

first,bar,baz,foo,qux
A,-1.129645,0.383512,1.682195,0.205087
B,-0.953406,-0.726828,-0.85063,-2.8652
C,-0.757268,0.728863,1.063771,0.387784


In [107]:
df.loc[:, (slice(None), "one")]

first,bar,baz,foo,qux
second,one,one,one,one
A,-1.129645,0.383512,1.682195,0.205087
B,-0.953406,-0.726828,-0.85063,-2.8652
C,-0.757268,0.728863,1.063771,0.387784


In [108]:
df.xs(("one", "bar"), level=("second", "first"), axis=1)

first,bar
second,one
A,-1.129645
B,-0.953406
C,-0.757268


In [109]:
df.loc[:, ("bar", "one")]

A   -1.129645
B   -0.953406
C   -0.757268
Name: (bar, one), dtype: float64

In [110]:
df.xs("one", level="second", axis=1, drop_level=False)

first,bar,baz,foo,qux
second,one,one,one,one
A,-1.129645,0.383512,1.682195,0.205087
B,-0.953406,-0.726828,-0.85063,-2.8652
C,-0.757268,0.728863,1.063771,0.387784


In [111]:
df.xs("one", level="second", axis=1, drop_level=True)

first,bar,baz,foo,qux
A,-1.129645,0.383512,1.682195,0.205087
B,-0.953406,-0.726828,-0.85063,-2.8652
C,-0.757268,0.728863,1.063771,0.387784


### Advanced reindexing and alignment

In [112]:
midx = pd.MultiIndex(
    levels=[["zero", "one"], ["x", "y"]], codes=[[1,1,0,0], [1,0,1,0]]
)

In [113]:
midx

MultiIndex([( 'one', 'y'),
            ( 'one', 'x'),
            ('zero', 'y'),
            ('zero', 'x')],
           )

In [114]:
pd.MultiIndex(
    levels=[["zero", "one"], ["x", "y"]], codes=[[1,0,0, 1], [1,0,1,0]]
)

MultiIndex([( 'one', 'y'),
            ('zero', 'x'),
            ('zero', 'y'),
            ( 'one', 'x')],
           )

In [115]:
df = pd.DataFrame(np.random.randn(4,2), index=midx)

In [116]:
df

Unnamed: 0,Unnamed: 1,0,1
one,y,0.87292,-0.117557
one,x,-1.04709,0.768476
zero,y,-1.057378,0.715885
zero,x,-0.830331,0.97582


In [117]:
# Mean of each group of the first index level. The ``level=`` keyword of
# DataFrame.mean was deprecated in pandas 1.3 and removed in 2.0, so group
# explicitly; sort=False keeps the groups in order of first appearance.
df2 = df.groupby(level=0, sort=False).mean()

In [118]:
df2

Unnamed: 0,0,1
one,-0.087085,0.325459
zero,-0.943854,0.845853


In [119]:
# NOTE(review): these operands do not match the `df` displayed above, whose
# "one" rows hold 0.87292 and -1.04709 in column 0 (mean -0.087085, as shown
# in df2); they appear to be left over from an earlier random draw.
(-0.055193 -0.642253) / 2

-0.348723

In [120]:
df2.reindex(df.index, level=0)

Unnamed: 0,Unnamed: 1,0,1
one,y,-0.087085,0.325459
one,x,-0.087085,0.325459
zero,y,-0.943854,0.845853
zero,x,-0.943854,0.845853


In [121]:
# Align df and df2 on index level 0 and unpack the two results.
# NOTE(review): "df_ailgned" looks like a typo for "df_aligned"; rename it
# once it is confirmed that no later cell depends on the misspelled name.
df_ailgned, df2_aligned = df.align(df2, level=0)

### Swapping levels with swaplevel

In [122]:
df[:5]

Unnamed: 0,Unnamed: 1,0,1
one,y,0.87292,-0.117557
one,x,-1.04709,0.768476
zero,y,-1.057378,0.715885
zero,x,-0.830331,0.97582


In [123]:
df[:5].swaplevel(0,1,axis=0)

Unnamed: 0,Unnamed: 1,0,1
y,one,0.87292,-0.117557
x,one,-1.04709,0.768476
y,zero,-1.057378,0.715885
x,zero,-0.830331,0.97582


In [124]:
df[:5].reorder_levels([1,0], axis=0)

Unnamed: 0,Unnamed: 1,0,1
y,one,0.87292,-0.117557
x,one,-1.04709,0.768476
y,zero,-1.057378,0.715885
x,zero,-0.830331,0.97582


### Renaming names of an Index or MultiIndex

In [125]:
df.rename(columns={0: "col0", 1:"col1"})

Unnamed: 0,Unnamed: 1,col0,col1
one,y,0.87292,-0.117557
one,x,-1.04709,0.768476
zero,y,-1.057378,0.715885
zero,x,-0.830331,0.97582


In [126]:
df.rename(index={"one": "two", "y": "z"})

Unnamed: 0,Unnamed: 1,0,1
two,z,0.87292,-0.117557
two,x,-1.04709,0.768476
zero,z,-1.057378,0.715885
zero,x,-0.830331,0.97582


In [127]:
df.rename_axis(index=["abc", "def"])

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
abc,def,Unnamed: 2_level_1,Unnamed: 3_level_1
one,y,0.87292,-0.117557
one,x,-1.04709,0.768476
zero,y,-1.057378,0.715885
zero,x,-0.830331,0.97582


In [128]:
df.rename_axis(columns="Cols").columns

RangeIndex(start=0, stop=2, step=1, name='Cols')

In [129]:
df

Unnamed: 0,Unnamed: 1,0,1
one,y,0.87292,-0.117557
one,x,-1.04709,0.768476
zero,y,-1.057378,0.715885
zero,x,-0.830331,0.97582


In [130]:
mi = pd.MultiIndex.from_product([[1,2], ["a", "b"]], names=["x", "y"])

In [131]:
mi

MultiIndex([(1, 'a'),
            (1, 'b'),
            (2, 'a'),
            (2, 'b')],
           names=['x', 'y'])

In [132]:
mi.names

FrozenList(['x', 'y'])

In [133]:
mi2 = mi.rename("new name", level=0)

In [134]:
mi2

MultiIndex([(1, 'a'),
            (1, 'b'),
            (2, 'a'),
            (2, 'b')],
           names=['new name', 'y'])

In [135]:
# You cannot set the names of the MultiIndex via a level.
# mi.levels[0].name = "name via level"

## Sorting a MultiIndex

In [136]:
import random

In [137]:
random.shuffle(tuples)

In [138]:
tuples

[('foo', 'one'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('qux', 'one'),
 ('qux', 'two'),
 ('foo', 'two'),
 ('bar', 'two'),
 ('bar', 'one')]

In [139]:
s  = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))

In [140]:
s.sort_index()

bar  one    0.280066
     two    0.248930
baz  one   -0.373707
     two   -0.772975
foo  one   -0.667239
     two   -0.878750
qux  one   -0.913404
     two    0.846176
dtype: float64

In [141]:
s.sort_index(level=0)

bar  one    0.280066
     two    0.248930
baz  one   -0.373707
     two   -0.772975
foo  one   -0.667239
     two   -0.878750
qux  one   -0.913404
     two    0.846176
dtype: float64

In [142]:
s.sort_index(level=1)

bar  one    0.280066
baz  one   -0.373707
foo  one   -0.667239
qux  one   -0.913404
bar  two    0.248930
baz  two   -0.772975
foo  two   -0.878750
qux  two    0.846176
dtype: float64

In [143]:
s.index.set_names(["L1", "L2"], inplace=True)

In [144]:
s.sort_index(level="L1")

L1   L2 
bar  one    0.280066
     two    0.248930
baz  one   -0.373707
     two   -0.772975
foo  one   -0.667239
     two   -0.878750
qux  one   -0.913404
     two    0.846176
dtype: float64

In [145]:
s.sort_index(level="L2")

L1   L2 
bar  one    0.280066
baz  one   -0.373707
foo  one   -0.667239
qux  one   -0.913404
bar  two    0.248930
baz  two   -0.772975
foo  two   -0.878750
qux  two    0.846176
dtype: float64

In [146]:
df.T.sort_index(level=1, axis=1)

Unnamed: 0_level_0,one,zero,one,zero
Unnamed: 0_level_1,x,x,y,y
0,-1.04709,-0.830331,0.87292,-1.057378
1,0.768476,0.97582,-0.117557,0.715885


In [147]:
dfm = pd.DataFrame(
    {"jim": [0,0,1,1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)}
)

In [148]:
dfm = dfm.set_index(["jim", "joe"])

In [149]:
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.203767
0,x,0.597239
1,z,0.804807
1,y,0.971974


In [150]:
dfm.loc[((1,'z'))]

  dfm.loc[((1,'z'))]


Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
1,z,0.804807


In [151]:
# dfm.loc[(0, 'y'):(1,'z')]

In [152]:
# MultiIndex.is_lexsorted() was deprecated in pandas 1.3 and removed in 2.0;
# is_monotonic_increasing is the documented replacement.
dfm.index.is_monotonic_increasing

False

In [153]:
dfm.index.lexsort_depth

1

In [154]:
dfm = dfm.sort_index()

In [155]:
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.203767
0,x,0.597239
1,y,0.971974
1,z,0.804807


In [156]:
# Check the index of the freshly sorted dfm (the original cell inspected the
# unrelated df). is_monotonic_increasing replaces the removed is_lexsorted().
dfm.index.is_monotonic_increasing

True

In [157]:
dfm.index.lexsort_depth

2

In [158]:
dfm.loc[(0, 'y'):(1,'z')]

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
1,y,0.971974
1,z,0.804807


## Take methods

In [159]:
index = pd.Index(np.random.randint(0,1000,10))

In [160]:
index

Int64Index([593, 244, 657, 832, 481, 110, 392, 942, 455, 62], dtype='int64')

In [161]:
positions = [0,9,3]

In [162]:
index[positions]

Int64Index([593, 62, 832], dtype='int64')

In [163]:
index.take(positions)

Int64Index([593, 62, 832], dtype='int64')

In [164]:
ser = pd.Series(np.random.randn(10))

In [165]:
ser.iloc[positions]

0   -0.396233
9   -1.757009
3    0.056865
dtype: float64

In [166]:
ser.take(positions)

0   -0.396233
9   -1.757009
3    0.056865
dtype: float64

In [167]:
frm = pd.DataFrame(np.random.randn(5,3))

In [168]:
frm

Unnamed: 0,0,1,2
0,-1.401067,0.031451,0.302358
1,-0.115064,1.558759,0.68706
2,0.029768,1.212268,0.239396
3,-0.616763,0.424589,1.093377
4,-2.469385,-0.743334,-0.709936


In [169]:
frm.take([1,4,3])

Unnamed: 0,0,1,2
1,-0.115064,1.558759,0.68706
4,-2.469385,-0.743334,-0.709936
3,-0.616763,0.424589,1.093377


In [170]:
frm.take([0,2], axis=1)

Unnamed: 0,0,2
0,-1.401067,0.302358
1,-0.115064,0.68706
2,0.029768,0.239396
3,-0.616763,1.093377
4,-2.469385,-0.709936


In [171]:
arr = np.random.randn(10)

In [172]:
arr

array([ 0.18943073,  0.52784783, -1.00779419,  0.55038571,  1.0012323 ,
        0.9713963 ,  0.93073501, -0.14717055, -2.36356997,  0.04641709])

In [173]:
arr.take([False, False, True, True])

array([0.18943073, 0.18943073, 0.52784783, 0.52784783])

In [174]:
arr[[0,1]]

array([0.18943073, 0.52784783])

In [175]:
ser = pd.Series(np.random.randn(10))

In [176]:
ser.take([False, False, True, True])

0   -0.531163
0   -0.531163
1    0.240218
1    0.240218
dtype: float64

In [177]:
ser.iloc[[0,1]]

0   -0.531163
1    0.240218
dtype: float64

In [178]:
arr = np.random.randn(10000, 5)

In [179]:
indexer = np.arange(10000)

In [180]:
random.shuffle(indexer)

In [181]:
%timeit arr[indexer]

148 µs ± 1.29 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [182]:
%timeit arr.take(indexer, axis=0)

60.9 µs ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [183]:
ser = pd.Series(arr[:, 0])

In [184]:
%timeit ser.iloc[indexer]

128 µs ± 1.31 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [185]:
%timeit ser.take(indexer)

122 µs ± 3.42 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Index types

### CategoricalIndex

In [186]:
from pandas.api.types import CategoricalDtype

In [187]:
df = pd.DataFrame({"A": np.arange(6), "B": list("aabbca")})

In [188]:
df

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,c
5,5,a


In [189]:
df["B"] = df["B"].astype(CategoricalDtype(list("cab")))

In [190]:
df

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,c
5,5,a


In [191]:
df.dtypes

A       int64
B    category
dtype: object

In [192]:
df["B"].cat.categories

Index(['c', 'a', 'b'], dtype='object')

In [193]:
df2 = df.set_index("B")

In [194]:
df2.index

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [195]:
df2

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
b,2
b,3
c,4
a,5


In [196]:
df2.loc["a"]

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
a,5


In [197]:
df2.loc["a"].index

CategoricalIndex(['a', 'a', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [198]:
df2.sort_index()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
c,4
a,0
a,1
a,5
b,2
b,3


In [199]:
df2.groupby(level=0).sum()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
c,4
a,6
b,5


In [200]:
df2.groupby(level=0).sum().index

CategoricalIndex(['c', 'a', 'b'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

In [201]:
df3 = pd.DataFrame(
    {"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")}
)

In [202]:
df3

Unnamed: 0,A,B
0,0,a
1,1,b
2,2,c


In [203]:
df3 = df3.set_index("B")

In [204]:
df3

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
b,1
c,2


In [205]:
df3.reindex(["a", "e"])

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0.0
e,


In [206]:
df3.reindex(["a", "e"]).index

Index(['a', 'e'], dtype='object', name='B')

In [207]:
df3.reindex(pd.Categorical(["a", "e"], categories=list("abe")))

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0.0
e,


In [208]:
df3.reindex(pd.Categorical(["a", "e"], categories=list("abe"))).index

CategoricalIndex(['a', 'e'], categories=['a', 'b', 'e'], ordered=False, name='B', dtype='category')

In [209]:
df4 = pd.DataFrame({"A": np.arange(2), "B": list("ba")})

In [210]:
df4["B"] = df4["B"].astype(CategoricalDtype(list("ab")))

In [211]:
list("ab")

['a', 'b']

In [212]:
df4

Unnamed: 0,A,B
0,0,b
1,1,a


In [213]:
df4 = df4.set_index("B")

In [214]:
df4

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
b,0
a,1


In [215]:
df4.index

CategoricalIndex(['b', 'a'], categories=['a', 'b'], ordered=False, name='B', dtype='category')

In [216]:
df5 = pd.DataFrame({"A": np.arange(2), "B": list("bc")})

In [217]:
df5["B"] = df5["B"].astype(CategoricalDtype(list("bc")))

In [218]:
df5 = df5.set_index("B")

In [219]:
df5.index

CategoricalIndex(['b', 'c'], categories=['b', 'c'], ordered=False, name='B', dtype='category')

In [222]:
pd.concat([df4,df5])

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
b,0
a,1
b,0
c,1


### Int64Index, RangeIndex and Float64Index

In [223]:
indexf = pd.Index([1.5, 2, 3, 4.5, 5])

In [224]:
indexf

Float64Index([1.5, 2.0, 3.0, 4.5, 5.0], dtype='float64')

In [225]:
sf = pd.Series(range(5), index=indexf)

In [226]:
sf

1.5    0
2.0    1
3.0    2
4.5    3
5.0    4
dtype: int64

In [227]:
sf[3]

2

In [229]:
sf[3.0]

2

In [231]:
sf.loc[3]

2

In [233]:
sf.loc[3.0]

2

In [234]:
sf.iloc[3]

3

In [235]:
sf[2:4]

2.0    1
3.0    2
dtype: int64

In [237]:
sf.loc[2:4]

2.0    1
3.0    2
dtype: int64

In [238]:
sf.iloc[2:4]

3.0    2
4.5    3
dtype: int64

In [242]:
sf[2.1:4.6]

3.0    2
4.5    3
dtype: int64

In [243]:
sf.loc[2.1:4.6]

3.0    2
4.5    3
dtype: int64

In [244]:
pd.Series(range(5))[3.5]

KeyError: 3.5

In [245]:
pd.Series(range(5))[3.5:4.5]

TypeError: cannot do slice indexing on RangeIndex with these indexers [3.5] of type float

In [248]:
dfir = pd.concat(
    [
        pd.DataFrame(
            np.random.randn(5,2), index=np.arange(5) * 250.0, columns=list("AB")
        ),
        pd.DataFrame(
            np.random.randn(6,2),
            index=np.arange(4,10) * 250.1,
            columns=list("AB")
        )
    ]
)

In [249]:
dfir

Unnamed: 0,A,B
0.0,-0.783996,-1.155068
250.0,0.037051,-1.789661
500.0,0.023925,-1.059183
750.0,-0.465191,-0.632338
1000.0,0.687721,-0.073737
1000.4,0.190233,0.891009
1250.5,-0.257994,-1.273639
1500.6,-0.216298,1.114513
1750.7,-1.837628,-1.288847
2000.8,0.457224,0.983914


In [250]:
dfir[0:1000.4]

Unnamed: 0,A,B
0.0,-0.783996,-1.155068
250.0,0.037051,-1.789661
500.0,0.023925,-1.059183
750.0,-0.465191,-0.632338
1000.0,0.687721,-0.073737
1000.4,0.190233,0.891009


In [251]:
dfir.loc[0:1001, "A"]

0.0      -0.783996
250.0     0.037051
500.0     0.023925
750.0    -0.465191
1000.0    0.687721
1000.4    0.190233
Name: A, dtype: float64

In [252]:
dfir[0:1000]

Unnamed: 0,A,B
0.0,-0.783996,-1.155068
250.0,0.037051,-1.789661
500.0,0.023925,-1.059183
750.0,-0.465191,-0.632338
1000.0,0.687721,-0.073737


In [253]:
dfir.iloc[0:5]

Unnamed: 0,A,B
0.0,-0.783996,-1.155068
250.0,0.037051,-1.789661
500.0,0.023925,-1.059183
750.0,-0.465191,-0.632338
1000.0,0.687721,-0.073737


### IntervalIndex

In [254]:
df = pd.DataFrame(
    {"A":[1,2,3,4]}, index=pd.IntervalIndex.from_breaks([0,1,2,3,4])
)

In [255]:
df

Unnamed: 0,A
"(0, 1]",1
"(1, 2]",2
"(2, 3]",3
"(3, 4]",4


In [256]:
df.loc[2]

A    2
Name: (1, 2], dtype: int64

In [257]:
df.loc[[2,3]]

Unnamed: 0,A
"(1, 2]",2
"(2, 3]",3


In [258]:
df.loc[2.5]

A    3
Name: (2, 3], dtype: int64

In [259]:
df.loc[[2.5, 3.5]]

Unnamed: 0,A
"(2, 3]",3
"(3, 4]",4


In [260]:
df.loc[pd.Interval(1,2)]

A    2
Name: (1, 2], dtype: int64

In [261]:
df.loc[pd.Interval(0.5, 2.5)]

KeyError: Interval(0.5, 2.5, closed='right')

In [262]:
idxr = df.index.overlaps(pd.Interval(0.5,2.4))

In [263]:
idxr

array([ True,  True,  True, False])

In [264]:
df[idxr]

Unnamed: 0,A
"(0, 1]",1
"(1, 2]",2
"(2, 3]",3


In [265]:
c = pd.cut(range(4), bins=2)

In [266]:
c

[(-0.003, 1.5], (-0.003, 1.5], (1.5, 3.0], (1.5, 3.0]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]

### Binning data with `cut` and `qcut`

In [267]:
c = pd.cut(range(4), bins=2)

In [268]:
c

[(-0.003, 1.5], (-0.003, 1.5], (1.5, 3.0], (1.5, 3.0]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]

In [269]:
c.categories

IntervalIndex([(-0.003, 1.5], (1.5, 3.0]],
              closed='right',
              dtype='interval[float64]')

In [270]:
pd.cut([0,3,5,1], bins=c.categories)

[(-0.003, 1.5], (1.5, 3.0], NaN, (-0.003, 1.5]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]

### Generating ranges of intervals

In [271]:
pd.interval_range(start=0, end=5)

IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
              closed='right',
              dtype='interval[int64]')

In [272]:
pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4)

IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], (2017-01-03, 2017-01-04], (2017-01-04, 2017-01-05]],
              closed='right',
              dtype='interval[datetime64[ns]]')

In [274]:
pd.interval_range(end=pd.Timedelta("3 days"), periods=3)

IntervalIndex([(0 days 00:00:00, 1 days 00:00:00], (1 days 00:00:00, 2 days 00:00:00], (2 days 00:00:00, 3 days 00:00:00]],
              closed='right',
              dtype='interval[timedelta64[ns]]')

In [275]:
pd.interval_range(start=0, periods=5, freq=1.5)

IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0], (6.0, 7.5]],
              closed='right',
              dtype='interval[float64]')

In [276]:
pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4, freq="W")

IntervalIndex([(2017-01-01, 2017-01-08], (2017-01-08, 2017-01-15], (2017-01-15, 2017-01-22], (2017-01-22, 2017-01-29]],
              closed='right',
              dtype='interval[datetime64[ns]]')

In [277]:
pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9H")

IntervalIndex([(0 days 00:00:00, 0 days 09:00:00], (0 days 09:00:00, 0 days 18:00:00], (0 days 18:00:00, 1 days 03:00:00]],
              closed='right',
              dtype='interval[timedelta64[ns]]')

In [278]:
pd.interval_range(start=0, end=4, closed="both")

IntervalIndex([[0, 1], [1, 2], [2, 3], [3, 4]],
              closed='both',
              dtype='interval[int64]')

In [279]:
pd.interval_range(start=0, end=4, closed="neither")

IntervalIndex([(0, 1), (1, 2), (2, 3), (3, 4)],
              closed='neither',
              dtype='interval[int64]')

In [280]:
pd.interval_range(start=0, end=6, periods = 4)

IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
              closed='right',
              dtype='interval[float64]')

In [281]:
pd.interval_range(pd.Timestamp("2018-01-01"), pd.Timestamp("2018-02-28"), periods=3)

IntervalIndex([(2018-01-01, 2018-01-20 08:00:00], (2018-01-20 08:00:00, 2018-02-08 16:00:00], (2018-02-08 16:00:00, 2018-02-28]],
              closed='right',
              dtype='interval[datetime64[ns]]')

## Miscellaneous indexing FAQ

In [287]:
s = pd.Series(range(5))

In [288]:
s

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [289]:
s[-1]

KeyError: -1

In [290]:
df = pd.DataFrame(np.random.randn(5,4))

In [291]:
df

Unnamed: 0,0,1,2,3
0,0.21411,-0.009059,1.443001,-0.035226
1,0.898497,-1.155351,-0.424064,0.560022
2,0.877879,-0.352413,0.389925,-0.346234
3,2.138098,0.396498,-0.734844,1.669956
4,-3.078316,2.106157,0.9536,-0.301431


In [292]:
df.loc[-2:0]

Unnamed: 0,0,1,2,3
0,0.21411,-0.009059,1.443001,-0.035226


In [293]:
df = pd.DataFrame(index=[1,2,3,4,5], columns=["data"], data=list(range(5)))

In [294]:
df

Unnamed: 0,data
1,0
2,1
3,2
4,3
5,4


In [295]:
df.index.is_monotonic_increasing

True

In [296]:
df.loc[0:5, :]

Unnamed: 0,data
1,0
2,1
3,2
4,3
5,4


In [297]:
df.loc[13:15, :]

Unnamed: 0,data


In [298]:
df = pd.DataFrame(index=[2,3,1,4,3,5], columns=["data"], data=list(range(6)))

In [299]:
df.index.is_monotonic_increasing

False

In [300]:
df.loc[2:4, :]

Unnamed: 0,data
2,0
3,1
1,2
4,3


In [301]:
df.loc[0:4, :]

KeyError: 0

In [302]:
df.loc[2:3, :]

KeyError: 'Cannot get right slice bound for non-unique label: 3'

In [305]:
weakly_monotonic = pd.Index(["a", "b", "c", "c"])

In [306]:
weakly_monotonic

Index(['a', 'b', 'c', 'c'], dtype='object')

In [307]:
weakly_monotonic.is_monotonic_increasing

True

In [308]:
weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique

False

In [309]:
s = pd.Series(np.random.randn(6), index=list("abcdef"))

In [310]:
s

a   -0.462652
b   -0.714209
c    0.016611
d   -1.167894
e   -1.658206
f    0.805133
dtype: float64

In [311]:
s[2:5]

c    0.016611
d   -1.167894
e   -1.658206
dtype: float64

In [312]:
s.loc["c":"e"]

c    0.016611
d   -1.167894
e   -1.658206
dtype: float64

In [315]:
series1 = pd.Series([1,2,3])

In [316]:
series1.dtype

dtype('int64')

In [317]:
res = series1.reindex([0,4])

In [318]:
res.dtype

dtype('float64')

In [319]:
res

0    1.0
4    NaN
dtype: float64

In [320]:
series2 = pd.Series([True])

In [321]:
series2.dtype

dtype('bool')

In [322]:
res = series2.reindex_like(series1)

In [323]:
res.dtype

dtype('O')

In [324]:
res

0    True
1     NaN
2     NaN
dtype: object