In [1]:
import pandas as pd
import numpy as np

In [2]:
# First attempt to crete multi index type of implementation!

index = [ ('America','California', 2000), ('America','California', 2010),
          ('Asia','Delhi', 2000), ('Asia','Delhi', 2010),
          ('Asia','HK', 2000), ('Asia','HK', 2010),
          ('America','Texas', 2000), ('America','Texas', 2010)
        ]

populations = [33871648, 37253956,18976457, 19378102,30861820, 35145561,20851820, 25145561]

# Lets create a Series with index containing two informations

pop = pd.Series(populations, index=index)
pop

(America, California, 2000)    33871648
(America, California, 2010)    37253956
(Asia, Delhi, 2000)            18976457
(Asia, Delhi, 2010)            19378102
(Asia, HK, 2000)               30861820
(Asia, HK, 2010)               35145561
(America, Texas, 2000)         20851820
(America, Texas, 2010)         25145561
dtype: int64

In [3]:
# Looks like we have extra dimensions in a Series

# We can get Pop of CA on 2000 like this , i.e supplying the whole Index

pop[('America','California',2000)]

33871648

In [4]:
# same thing like

pop[('America','Texas',2010)]

25145561

In [5]:
# also something like using index slicing
pop[('America','California', 2010):('America','Texas', 2000)]

(America, California, 2010)    37253956
(Asia, Delhi, 2000)            18976457
(Asia, Delhi, 2010)            19378102
(Asia, HK, 2000)               30861820
(Asia, HK, 2010)               35145561
(America, Texas, 2000)         20851820
dtype: int64

In [6]:
# But if we want something like , give me all records for year 2010 ?
# we can do but it will require some python code intervention 

# Step 1
# Extract the index as Tuples

indexes = pop.index
indexes

Index([('America', 'California', 2000), ('America', 'California', 2010),
               ('Asia', 'Delhi', 2000),         ('Asia', 'Delhi', 2010),
                  ('Asia', 'HK', 2000),            ('Asia', 'HK', 2010),
            ('America', 'Texas', 2000),      ('America', 'Texas', 2010)],
      dtype='object')

In [7]:
# Step 2
# Create Boolean Mask
mask = [i for i in indexes if i[2] == 2010]
mask

[('America', 'California', 2010),
 ('Asia', 'Delhi', 2010),
 ('Asia', 'HK', 2010),
 ('America', 'Texas', 2010)]

In [8]:
# Step 3 , pass the mask to pop
pop[mask]

(America, California, 2010)    37253956
(Asia, Delhi, 2010)            19378102
(Asia, HK, 2010)               35145561
(America, Texas, 2010)         25145561
dtype: int64

In [9]:
# The above approach took few lines of code
# Lets create MULTI INDEX

new_index =  pd.MultiIndex.from_tuples(index)
new_index

MultiIndex([('America', 'California', 2000),
            ('America', 'California', 2010),
            (   'Asia',      'Delhi', 2000),
            (   'Asia',      'Delhi', 2010),
            (   'Asia',         'HK', 2000),
            (   'Asia',         'HK', 2010),
            ('America',      'Texas', 2000),
            ('America',      'Texas', 2010)],
           )

In [10]:
# lets reindex the existing pop DF

pop = pop.reindex(new_index)
pop

America  California  2000    33871648
                     2010    37253956
Asia     Delhi       2000    18976457
                     2010    19378102
         HK          2000    30861820
                     2010    35145561
America  Texas       2000    20851820
                     2010    25145561
dtype: int64

In [11]:
# The above looks bit odd, we can sort the index for better understanding

pop.sort_index(inplace=True)
pop

America  California  2000    33871648
                     2010    37253956
         Texas       2000    20851820
                     2010    25145561
Asia     Delhi       2000    18976457
                     2010    19378102
         HK          2000    30861820
                     2010    35145561
dtype: int64

In [12]:
# Now the index part of POP looks different! Now its a multi indexed index
# THis new approach is easy to query
# Lets do the pythonic approach with new technique

pop[:2010] # very simple

America  California  2000    33871648
                     2010    37253956
         Texas       2000    20851820
                     2010    25145561
Asia     Delhi       2000    18976457
                     2010    19378102
         HK          2000    30861820
                     2010    35145561
dtype: int64

In [13]:
pop[:,'California']

America  2000    33871648
         2010    37253956
dtype: int64

In [14]:
# When we use Muiti index, it creates a Levels.
# In this example, There are two levels
# Level 0 -> Continent
# Level 1 -> City
# Level 2 -> Year

# Therefore, to get all info related to Texas we write

pop['America','Texas']
# It shows data for the levels underneath it , i.e Year

2000    20851820
2010    25145561
dtype: int64

In [15]:
# Now If we want to get all values related to a specific year, i.e Level 2
# THerefore its like a tree, we need to specify the path above

pop[:,:,2010] # Here we used : for specifying all from level 0 and 1

America  California    37253956
         Texas         25145561
Asia     Delhi         19378102
         HK            35145561
dtype: int64

In [16]:
# Lets write some examples to understand it better

idx = pd.IndexSlice
pop.loc['America'] # as its the topmost, we dont need to speficy any path
pop.loc['America','Texas',2010]

# its better to use IndexSlice for more control as shown below.
pop.loc[idx['America',:,2010]] # America, any state, but only 2010
pop.loc[idx[:,'HK',2010]]
pop.loc[idx[:,'HK',:]]
pop.loc[idx[:,:,2000:2010]]




America  California  2000    33871648
                     2010    37253956
         Texas       2000    20851820
                     2010    25145561
Asia     Delhi       2000    18976457
                     2010    19378102
         HK          2000    30861820
                     2010    35145561
dtype: int64

In [17]:
# Therefore its easy to retrieve items using multi index , its like selecting a brunch 
# from a tree.

# Now lets explor more, Our DF looks like this now: 

pop

America  California  2000    33871648
                     2010    37253956
         Texas       2000    20851820
                     2010    25145561
Asia     Delhi       2000    18976457
                     2010    19378102
         HK          2000    30861820
                     2010    35145561
dtype: int64

### unstack()

In [18]:
# Unstack is used to remove a level from row index and assign it a column index.lets see example

df = pop.unstack()
print(type(df)) # Therefore from MutiIndex Series, it converted to DF
print(df) # Here the innermost (Default) level's item moved to column (year)

<class 'pandas.core.frame.DataFrame'>
                        2000      2010
America California  33871648  37253956
        Texas       20851820  25145561
Asia    Delhi       18976457  19378102
        HK          30861820  35145561


In [19]:
pop.unstack(level=0)

Unnamed: 0,Unnamed: 1,America,Asia
California,2000,33871648.0,
California,2010,37253956.0,
Delhi,2000,,18976457.0
Delhi,2010,,19378102.0
HK,2000,,30861820.0
HK,2010,,35145561.0
Texas,2000,20851820.0,
Texas,2010,25145561.0,


In [20]:
pop.unstack(level=1)

Unnamed: 0,Unnamed: 1,California,Delhi,HK,Texas
America,2000,33871648.0,,,20851820.0
America,2010,37253956.0,,,25145561.0
Asia,2000,,18976457.0,30861820.0,
Asia,2010,,19378102.0,35145561.0,


In [21]:
pop.unstack(level=2)

Unnamed: 0,Unnamed: 1,2000,2010
America,California,33871648,37253956
America,Texas,20851820,25145561
Asia,Delhi,18976457,19378102
Asia,HK,30861820,35145561


### stack()

In [22]:
# Stack is just opposite, it stacks column index to row index. 

In [23]:
multi_idx_col_df = pop.unstack([1,2])
multi_idx_col_df

Unnamed: 0_level_0,California,California,Texas,Texas,Delhi,Delhi,HK,HK
Unnamed: 0_level_1,2000,2010,2000,2010,2000,2010,2000,2010
America,33871648.0,37253956.0,20851820.0,25145561.0,,,,
Asia,,,,,18976457.0,19378102.0,30861820.0,35145561.0


In [24]:
# lets stack the City Level( 0 ) to row index.

multi_idx_col_df.stack(0)

Unnamed: 0,Unnamed: 1,2000,2010
America,California,33871648.0,37253956.0
America,Texas,20851820.0,25145561.0
Asia,Delhi,18976457.0,19378102.0
Asia,HK,30861820.0,35145561.0


In [25]:
# we can also use stack() in chains.

multi_idx_col_df.stack().stack() # when all physical column index are marked for stack, DF got converted to
                                 # multi index Series

America  2000  California    33871648.0
               Texas         20851820.0
         2010  California    37253956.0
               Texas         25145561.0
Asia     2000  Delhi         18976457.0
               HK            30861820.0
         2010  Delhi         19378102.0
               HK            35145561.0
dtype: float64

In [26]:
# Stack , Unstack are ways to transform a data representation. 

### Pivot

In [27]:
pop

America  California  2000    33871648
                     2010    37253956
         Texas       2000    20851820
                     2010    25145561
Asia     Delhi       2000    18976457
                     2010    19378102
         HK          2000    30861820
                     2010    35145561
dtype: int64

In [28]:
pop.reset_index()

Unnamed: 0,level_0,level_1,level_2,0
0,America,California,2000,33871648
1,America,California,2010,37253956
2,America,Texas,2000,20851820
3,America,Texas,2010,25145561
4,Asia,Delhi,2000,18976457
5,Asia,Delhi,2010,19378102
6,Asia,HK,2000,30861820
7,Asia,HK,2010,35145561
