In [128]:
%pylab inline
import numpy as np
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


### Index
* Basics
    * [Series](#basics_series)
        * [use unstack to reshape a hierarchical-indexed series into a DataFrame](#unstack)
    * [DataFrame](#basics_frame)
        * [get_level_values](#get_level_values)
* [Summary Statistics by Level](#statistics)
* [set_index and reset_index](#set_index)
* get_level_values

## Basics
<a id="basics_series"></a>
### Series

In [129]:
# Build a Series whose index has two levels: the first list supplies the outer
# labels and the second list supplies the inner labels, position by position.
data = pd.Series(
    np.arange(1, 11),
    index=[
        list("aaabbbccdd"),
        ["one", "two", "three"] * 2 + ["one", "two", "two", "three"],
    ],
)
# In the printed form, a blank in the outer index column means
# "same label as the row directly above".
data

a  one       1
   two       2
   three     3
b  one       4
   two       5
   three     6
c  one       7
   two       8
d  two       9
   three    10
dtype: int32

In [130]:
# Partial indexing: giving only the outer-level label selects that group and
# returns a Series indexed by the remaining inner level.
data["a"]

one      1
two      2
three    3
dtype: int32

In [131]:
# Pass a list of outer-level labels to select several groups at once;
# the result keeps both index levels.
data.loc[["a","d"]]

a  one       1
   two       2
   three     3
d  two       9
   three    10
dtype: int32

<a id="unstack"></a>
<span style="color:green;font-weight:bold;font-size:1.5em">use **unstack** to reshape a hierarchical-indexed series into a DataFrame</span>

In [132]:
# Pivot the inner index level into columns. Combinations that do not exist in
# the Series (c/three and d/one here) appear as NaN in the resulting DataFrame.
data.unstack()

Unnamed: 0,one,three,two
a,1.0,3.0,2
b,4.0,6.0,5
c,7.0,,8
d,,10.0,9


<a id="basics_frame"></a>
### DataFrame

In [133]:
# A MultiIndex is immutable, so it can be built once on its own and reused.
rowindex = pd.MultiIndex.from_arrays(
    [list("aabb"), [1, 2, 1, 2]],
    names=["key1", "key2"],
)
colindex = pd.MultiIndex.from_arrays(
    [["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]],
    names=["state", "color"],
)
# 4x3 frame holding 0..11, with hierarchical labels on both axes.
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=rowindex,
    columns=colindex,
)
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [134]:
# Selecting an outer column label returns every column beneath it,
# labelled by the remaining column level (color).
df["Ohio"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [135]:
# Partial row selection: the outer key "a" returns its rows,
# indexed by the remaining row level (key2).
df.loc["a"]

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2
2,3,4,5


In [136]:
# A complete row key is given as a tuple; "Ohio" is a partial column key, so
# the result is a Series indexed by the remaining column level (color).
df.loc[("a",1),"Ohio"]

color
Green    0
Red      1
Name: (a, 1), dtype: int32

In [137]:
# Swap the two row-index levels so key2 becomes the outer level, then sort the
# rows by that new outer level. `sort_index(level=...)` is used because
# `DataFrame.sortlevel` was deprecated and later removed from pandas.
df.swaplevel("key1", "key2").sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


<a id="get_level_values"></a>
<span style="color:green;font-weight:bold;font-size:1.5em">get all the values in a given index level</span>

In [138]:
# Rebuild the example frame so this section can be run on its own.
rowindex = pd.MultiIndex.from_arrays(
    [["a"] * 2 + ["b"] * 2, [1, 2] * 2],
    names=["key1", "key2"],
)
colindex = pd.MultiIndex.from_arrays(
    [["Ohio"] * 2 + ["Colorado"], ["Green", "Red", "Green"]],
    names=["state", "color"],
)
df = pd.DataFrame(np.arange(12).reshape(4, 3), index=rowindex, columns=colindex)
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [139]:
# Select the level by its integer position: 0 is the outer level (key1),
# so 1 is the inner level (key2).
df.index.get_level_values(1)

Int64Index([1, 2, 1, 2], dtype='int64')

In [140]:
# Select the level by its name. The result has one entry per row, so labels
# repeat; it is not the set of unique labels.
df.index.get_level_values("key1")

Index([u'a', u'a', u'b', u'b'], dtype='object')

<a id="statistics"></a>
## Summary Statistics by Level
Statistics by level: without a `level`, a statistic computed along an axis collapses that axis, giving a single result for each column (or each row). <span style="color:orange;font-weight:bold;font-size:1.2em">With a level, the statistic is computed separately for each label of that level</span>. So when summing down the columns, the level must be a row-index level and the result has one row per label of that level; when summing across the rows, the level must be a column-index level and the result has one column per label of that level.

In [141]:
# Rebuild the example frame so this section can be run on its own.
# The row index is the full product {a, b} x {1, 2}; the column index is not a
# full product (there is no Colorado/Red), so it is spelled out as arrays.
rowindex = pd.MultiIndex.from_product(
    [["a", "b"], [1, 2]],
    names=["key1", "key2"],
)
colindex = pd.MultiIndex.from_arrays(
    [["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]],
    names=["state", "color"],
)
df = pd.DataFrame(np.arange(12).reshape(4, 3), index=rowindex, columns=colindex)
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [142]:
# Column totals: collapse the rows (axis=0), leaving one value per column.
df.sum(axis=0)

state     color
Ohio      Green    18
          Red      22
Colorado  Green    26
dtype: int64

In [143]:
# Column sums computed separately for each label of the row level "key2":
# rows sharing the same key2 value are added together, giving one result row
# per key2 label. Written as groupby(level=...).sum() because the `level=`
# argument of DataFrame.sum was deprecated and later removed from pandas.
df.groupby(level="key2").sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [144]:
# Row totals: collapse the columns, leaving one value per row.
df.sum(axis="columns")

key1  key2
a     1        3
      2       12
b     1       21
      2       30
dtype: int64

In [145]:
# Row sums computed separately for each label of the column level "color":
# columns sharing the same color are added together, giving one result column
# per color. The `level=` argument of DataFrame.sum was deprecated and later
# removed from pandas, so the frame is transposed, grouped on that level,
# summed, and transposed back.
df.T.groupby(level="color").sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


<a id="set_index"></a>
## set_index and reset_index
* **set_index**: use one or more columns from a DataFrame as the index
* **reset_index**: the inverse operation; moves the row index levels back into the DataFrame's columns.

In [146]:
# A flat frame: a counts up, b counts down, and (c, d) together form a
# unique key that can later be used as a two-level index.
df = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 3 + ["two"] * 4,
        "d": [0, 1, 2] + [0, 1, 2, 3],
    }
)
df

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [147]:
# Promote columns c and d to a two-level row index; a new frame is returned
# and the remaining columns are a and b.
newdf = df.set_index(keys=["c", "d"])
newdf

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [148]:
# Move the index levels (c, d) back into ordinary columns; the result is
# labelled with a default integer index instead.
newdf.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
