### Hierarchical Indexing:
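    Hierarchical indexing (also known as multi-indexing) is a way of storing higher-dimensional data, that is, data indexed by more than one or two keys, by incorporating multiple index levels within a single index.
    Older versions of Pandas also provided Panel and Panel4D objects that natively handled three-dimensional and four-dimensional data; these have since been removed from the library, so hierarchical indexing is the standard approach.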

In [1]:
# import numpy as np
# import pandas as pd
import numpy as np
import pandas as pd

In [3]:
# The bad way: plain Python tuples of (state, year) used directly as index keys.
index = [
    (state, year)
    for state in ('California', 'New York', 'Texas')
    for year in (2000, 2001)
]
populations = [
    33871648, 37253956,  # California 2000, 2001
    18976457, 19378102,  # New York   2000, 2001
    20851820, 25145561,  # Texas      2000, 2001
]
pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2001)    37253956
(New York, 2000)      18976457
(New York, 2001)      19378102
(Texas, 2000)         20851820
(Texas, 2001)         25145561
dtype: int64

In [4]:
# The better way: Pandas MultiIndex
# Build a two-level MultiIndex from the list of (state, year) tuples. Note that
# this rebinds `index` from the plain tuple list to the MultiIndex object.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2001]],
           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [5]:
# Reindex our series with this MultiIndex
# The values stay the same; only the index becomes hierarchical, so each state
# label is displayed once with its years listed underneath.
pop = pop.reindex(index)
pop

California  2000    33871648
            2001    37253956
New York    2000    18976457
            2001    19378102
Texas       2000    20851820
            2001    25145561
dtype: int64

In [13]:
# Now to access all data for which the second index (the year) is 2000
pop[:, 2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

### MultiIndex as extra dimension
    The unstack() method will quickly convert a multiply-indexed Series into a conventionally indexed DataFrame.

In [14]:
# MultiIndex as extra dimension
# unstack() moves the inner index level (the year) into the columns, giving one
# row per state and one column per year.
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2001
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [15]:
# Applying stack() method on pop_df
# stack() is the opposite operation: the year columns are folded back into an
# inner index level, giving back the multiply indexed Series.
pop_df.stack()

California  2000    33871648
            2001    37253956
New York    2000    18976457
            2001    19378102
Texas       2000    20851820
            2001    25145561
dtype: int64

In [16]:
# Adding another column to the DataFrame: the existing series supplies the
# 'total' column, and the under-18 counts are listed in the same row order
# (two years for each state).
pop_df = pd.DataFrame(
    {
        'total': pop,
        'under18': [
            9267089, 9284094,
            4687374, 4318033,
            5906301, 6879014,
        ],
    }
)
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2001,37253956,9284094
New York,2000,18976457,4687374
New York,2001,19378102,4318033
Texas,2000,20851820,5906301
Texas,2001,25145561,6879014


In [18]:
# Compute the fraction of people under 18 by year:
# element-wise division of the two columns, then pivot the year level into
# columns for a state-by-year table.
f_u18 = pop_df['under18'].div(pop_df['total'])
f_u18.unstack()

Unnamed: 0,2000,2001
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Methods of MultiIndex Creation
    The most straightforward way to construct a multiply indexed Series or DataFrame is to simply pass a list of two or more index arrays to the constructor.

In [32]:
# Creating a MultiIndex DataFrame using a list of index arrays:
# the two inner lists are paired element-wise to form the (letter, number) row labels.
# A seeded generator is used so the displayed values are the same on every re-run.
rng = np.random.default_rng(0)
df = pd.DataFrame(
    rng.random((4, 2)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['Data1', 'Data2'],
)
df

Unnamed: 0,Unnamed: 1,Data1,Data2
a,1,0.012506,0.314613
a,2,0.268693,0.513909
b,1,0.983802,0.416267
b,2,0.42524,0.879471


In [35]:
# Creating a MultiIndex Series using dictionary with appropriate tuples as keys:
# each key is a (state, year) pair and each value is the population.
data = {
    ('California', 2000): 33871648,
    ('California', 2001): 37253956,
    ('Texas', 2000): 20851820,
    ('Texas', 2001): 25145561,
    ('New York', 2000): 18976457,
    ('New York', 2001): 19378102,
}
pd.Series(data)

Unnamed: 0,2000,2001
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


### MultiIndex level names
    Sometimes it is convenient to name the levels of the MultiIndex.
    You can accomplish this by passing the names argument to any of the above MultiIndex constructors, or by setting the names attribute of the index after the fact.

In [41]:
# Name the two index levels after the fact by assigning to the `names`
# attribute of the existing index; the names then appear as a header row above
# the index in the display.
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2001    37253956
New York    2000    18976457
            2001    19378102
Texas       2000    20851820
            2001    25145561
dtype: int64

### MultiIndex for columns
    In a DataFrame, the rows and columns are completely symmetric, and just as the rows can have multiple levels of indices, the columns can have multiple levels as well.

In [51]:
# Hierarchical indices & columns:
# rows are every (Year, Visits) combination, columns every (Subject, Type) combination.
index = pd.MultiIndex.from_product(
    iterables=[[2012, 2013], [1, 2]],
    names=['Year', 'Visits'],
)
columns = pd.MultiIndex.from_product(
    iterables=[['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['Subject', 'Type'],
)

In [52]:
# Mock some data: a 4x6 array of standard-normal noise rounded to one decimal.
# A seeded generator is used so the values are the same on every re-run.
rng = np.random.default_rng(0)
data = np.round(rng.standard_normal((4, 6)), 1)
# Every other column (positions 0, 2, 4) gets a 10x wider spread ...
data[:, ::2] *= 10
# ... and all values are shifted so they are centred on 37.
data += 37

In [53]:
# Create the DataFrame
# Rows use the (Year, Visits) MultiIndex and columns use the (Subject, Type)
# MultiIndex, so both axes are hierarchical.
health_data = pd.DataFrame(data, index = index, columns = columns)
health_data

Unnamed: 0_level_0,Subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
Year,Visits,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2012,1,25.0,37.3,38.0,35.4,34.0,37.2
2012,2,42.0,36.8,34.0,37.7,41.0,37.7
2013,1,36.0,38.2,26.0,37.2,51.0,37.3
2013,2,29.0,37.7,36.0,38.2,42.0,36.1


In [64]:
# Extracting the 'HR' column for subject 'Guido': the two terms address the
# two column levels (Subject, Type), so this is a column selection.
health_data['Guido', 'HR']

Year  Visits
2012  1         38.0
      2         34.0
2013  1         26.0
      2         36.0
Name: (Guido, HR), dtype: float64

In [59]:
# Indexing and Slicing a MultiIndex
# Multiply indexed Series: re-display `pop`, which now carries the level names
# 'state' and 'year', as the reference for the indexing examples.
pop

state       year
California  2000    33871648
            2001    37253956
New York    2000    18976457
            2001    19378102
Texas       2000    20851820
            2001    25145561
dtype: int64

In [61]:
# We can access single elements by indexing with multiple terms:
# one label per index level (state, then year) yields a single scalar value.
pop['California', 2000]

33871648

In [65]:
# Partial slicing in MultiIndex
# The slice is applied to the outer (state) level only, and both endpoint
# labels are included in the result.
pop.loc['New York' : 'Texas']

state     year
New York  2000    18976457
          2001    19378102
Texas     2000    20851820
          2001    25145561
dtype: int64

In [66]:
# Partial indexing on lower levels
# The empty slice in the first position keeps every state; 2001 selects the
# year, which is then dropped from the resulting index.
pop[:, 2001]

state
California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [67]:
# Multiply indexed DataFrames
# Re-display health_data (hierarchical rows and columns) as the reference for
# the DataFrame indexing examples.
health_data

Unnamed: 0_level_0,Subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
Year,Visits,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2012,1,25.0,37.3,38.0,35.4,34.0,37.2
2012,2,42.0,36.8,34.0,37.7,41.0,37.7
2013,1,36.0,38.2,26.0,37.2,51.0,37.3
2013,2,29.0,37.7,36.0,38.2,42.0,36.1


In [68]:
# Indexing with a (Subject, Type) pair selects a single column, which comes
# back as a Series indexed by (Year, Visits).
health_data['Guido', 'HR']

Year  Visits
2012  1         38.0
      2         34.0
2013  1         26.0
      2         36.0
Name: (Guido, HR), dtype: float64

In [70]:
# Extracting health_data using loc:
# the row key is the full (Year, Visits) tuple, while the column key names only
# the outer 'Subject' level, so both of Guido's measurements are returned.
health_data.loc[(2013, 1), 'Guido']

Type
HR      26.0
Temp    37.2
Name: (2013, 1), dtype: float64

In [73]:
# Extracting health_data using iloc
# Purely positional selection: the first two rows and the first two columns.
health_data.iloc[:2, :2]

Unnamed: 0_level_0,Subject,Bob,Bob
Unnamed: 0_level_1,Type,HR,Temp
Year,Visits,Unnamed: 2_level_2,Unnamed: 3_level_2
2012,1,25.0,37.3
2012,2,42.0,36.8


In [80]:
# Sorted and unsorted indices:
# the outer labels are deliberately given out of order ('a', 'c', 'b'), so the
# resulting MultiIndex is not lexicographically sorted.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
# A seeded generator is used so the values are the same on every re-run.
rng = np.random.default_rng(0)
data = pd.Series(rng.standard_normal(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.422598
      2      0.361320
c     1      0.912390
      2     -0.994197
b     1     -1.372256
      2      0.931445
dtype: float64

In [81]:
# perform partial slicing in unordered Series
# Slicing a MultiIndex that is not sorted raises UnsortedIndexError. The error
# is the point of this cell, so it is caught and printed rather than left to
# abort a "Restart & Run All".
try:
    data['a':'b']
except pd.errors.UnsortedIndexError as err:
    print(f"{type(err).__name__}: {err}")

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [82]:
# Sorting the Series with the sort_index() method so that its MultiIndex is in
# sorted order; note that this rebinds `data` to the sorted result.
data = data.sort_index()

In [83]:
# perform partial slicing in ordered Series
# With the index sorted, slicing on the outer level works and returns the
# 'a' and 'b' groups.
data['a' : 'b']

char  int
a     1      0.422598
      2      0.361320
b     1     -1.372256
      2      0.931445
dtype: float64

In [84]:
# Re-display the multiply indexed population Series before reshaping it.
pop

state       year
California  2000    33871648
            2001    37253956
New York    2000    18976457
            2001    19378102
Texas       2000    20851820
            2001    25145561
dtype: int64

In [86]:
# Stacking and unstacking indices
# level = 1 picks the inner level ('year') as the one to pivot into columns,
# leaving 'state' as the row index.
pop.unstack(level = 1)

year,2000,2001
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [88]:
# Round trip: unstacking the 'year' level and then stacking it back recovers
# the original multiply indexed Series.
pop.unstack(level = 1).stack()

state       year
California  2000    33871648
            2001    37253956
New York    2000    18976457
            2001    19378102
Texas       2000    20851820
            2001    25145561
dtype: int64

### Index setting and resetting:
    Another way to rearrange hierarchical data is to turn the index labels into columns;
    this can be accomplished with the reset_index method.

In [90]:
# Removing the index
# reset_index turns the 'state' and 'year' levels into ordinary columns and
# replaces the index with a default integer one; the `name` argument labels the
# column that holds the Series values.
pop_flat = pop.reset_index(name = 'population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2001,37253956
2,New York,2000,18976457
3,New York,2001,19378102
4,Texas,2000,20851820
5,Texas,2001,25145561


In [93]:
# Adding the index
# set_index builds a MultiIndex from the listed columns; the result is only
# displayed here, and pop_flat itself is not reassigned.
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2001,37253956
New York,2000,18976457
New York,2001,19378102
Texas,2000,20851820
Texas,2001,25145561


### Data Aggregations on Multi-Indices:
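    We’ve previously seen that Pandas has built-in data aggregation methods, such as mean(), sum() and max().
    For hierarchically indexed data, the aggregate can be restricted to a single index level: older Pandas versions accepted a level parameter on these methods, while current versions (where that parameter has been removed) express the same thing with groupby(level=...) followed by the aggregate.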

In [94]:
# Re-display health_data, the input for the per-level aggregations.
health_data

Unnamed: 0_level_0,Subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
Year,Visits,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2012,1,25.0,37.3,38.0,35.4,34.0,37.2
2012,2,42.0,36.8,34.0,37.7,41.0,37.7
2013,1,36.0,38.2,26.0,37.2,51.0,37.3
2013,2,29.0,37.7,36.0,38.2,42.0,36.1


In [98]:
# Finding the mean with 'Year' index:
# average the visits within each year. Grouping on the index level replaces the
# `level=` argument of mean(), which was deprecated and later removed from
# pandas; the result is the same (one row per year).
mean_data = health_data.groupby(level='Year').mean()
mean_data

Subject,Bob,Bob,Guido,Guido,Sue,Sue
Type,HR,Temp,HR,Temp,HR,Temp
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2012,33.5,37.05,36.0,36.55,37.5,37.45
2013,32.5,37.95,31.0,37.7,46.5,36.7


In [101]:
# take the mean among levels on the columns:
# average across subjects for each measurement 'Type'. mean(axis=1, level=...)
# is no longer available in current pandas, so the frame is transposed, grouped
# on the 'Type' level of what is now the row index, averaged, and transposed
# back. Rows stay (Year, Visits); columns become HR and Temp.
# Note: this rebinds `mean_data`, replacing the per-year means computed earlier.
mean_data = health_data.T.groupby(level='Type').mean().T
mean_data

Unnamed: 0_level_0,Type,HR,Temp
Year,Visits,Unnamed: 2_level_1,Unnamed: 3_level_1
2012,1,32.333333,36.633333
2012,2,39.0,37.4
2013,1,37.666667,37.566667
2013,2,35.666667,37.333333
