# Hierarchical Indexing

* Hierarchical indexing is a method of creating structured group relationships in data.
* These hierarchical indexes, or MultiIndexes, are highly flexible and offer a range of options when performing complex data queries.
* A key stage in any data analysis procedure is to split the initial dataset into more meaningful groups, which can be achieved in Pandas using the DataFrame groupby() method.


In [None]:
# Notebook-wide imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## A Multiply Indexed Series
* The bad way: plain Python tuples as index labels
* The better way: a pandas `MultiIndex`


### The Bad Way

In [None]:
# Every label is a plain (state, year) tuple, ordered state-major, then year.
index = [
    (state, year)
    for state in ('California', 'New York', 'Texas')
    for year in (2000, 2010)
]

# Population counts, listed in the same order as ``index``.
populations = [
    33871648, 37253956,  # California 2000, 2010
    18976457, 19378102,  # New York   2000, 2010
    20851820, 25145561,  # Texas      2000, 2010
]

In [None]:
# Index the populations directly by the raw (state, year) tuples; every index
# label is a plain Python tuple.
pop = pd.Series(populations,index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [None]:
# Positional lookup of the first element. ``.iloc`` states the positional intent
# explicitly; a bare ``pop[0]`` on a non-integer index relies on a positional
# fallback that recent pandas versions deprecate.
pop.iloc[0]

33871648

In [None]:
pop[('California', 2000)] # label lookup using the full (state, year) tuple

33871648

In [None]:
# Selecting every 2010 entry means scanning the tuple labels by hand in Python
# and keeping those whose second element is 2010.
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

### The Better Way: Pandas MultiIndex

In [None]:
# Convert the list of (state, year) tuples into a two-level MultiIndex.
# Note: this rebinds ``index``, replacing the original list of tuples.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [None]:
# Re-label the Series with the MultiIndex built from the same (state, year) tuples.
pop = pop.reindex(index)

In [None]:
# Display the Series, now labelled by the two-level MultiIndex.
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [None]:
# Partial indexing: every state (":") for the year 2000; the result is indexed
# by state only.
pop[:,2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [None]:
# A multiply indexed Series still has a single dimension.
pop.ndim

1

In [None]:
pop_df = pop.unstack() # unstack() converts the multiply indexed Series into a DataFrame (states as rows, years as columns)
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [None]:
# The unstacked result is two-dimensional.
pop_df.ndim

2

In [None]:
pop_df_st = pop_df.stack() # stack() goes the other way: from the 2-D DataFrame back to a 1-D Series

In [None]:
# The stacked result is one-dimensional again.
pop_df_st.ndim

1

In [None]:
# Build a two-column DataFrame on the (state, year) MultiIndex: ``total`` comes
# from ``pop`` and ``under18`` is listed in the same row order.
# Note: this rebinds ``pop_df``, replacing the unstacked frame created earlier.
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [None]:
# Fraction of the population under 18 for each (state, year) row;
# unstack() then spreads the years across the columns for display.
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [None]:
# Census counts grouped per state, flattened into a dict keyed by (state, year).
# Passing such a dict to pd.Series yields a multiply indexed Series.
data = {
    (state, year): count
    for state, by_year in (
        ('California', {2000: 33871648, 2010: 37253956}),
        ('Texas', {2000: 20851820, 2010: 25145561}),
        ('New York', {2000: 18976457, 2010: 19378102}),
    )
    for year, count in by_year.items()
}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [None]:
# Iterating a dict yields its keys, so the (state, year) key tuples become the
# index entries.
pd.MultiIndex.from_tuples(data)

MultiIndex([('California', 2000),
            ('California', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010),
            (  'New York', 2000),
            (  'New York', 2010)],
           )

In [None]:
# Show the Series once more, before its index levels are given names.
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [None]:
# Name the two index levels by assigning to the existing index's ``names``.
pop.index.names = ["State","year"]

In [None]:
# The level names now appear as a header above the index columns.
pop

State       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [None]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Raju', 'Rani', 'Reddy'], ['HG', 'Temp']],
                                     names=['subject', 'type'])

# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Raju,Raju,Rani,Rani,Reddy,Reddy
Unnamed: 0_level_1,type,HG,Temp,HG,Temp,HG,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,50.0,36.7,19.0,35.7,35.0,37.6
2013,2,23.0,37.2,40.0,37.1,37.0,35.7
2014,1,21.0,37.4,21.0,37.5,29.0,37.1
2014,2,53.0,37.3,34.0,37.2,32.0,36.4


In [None]:
# Selecting a top-level column label returns that subject's sub-frame
# (its HG and Temp columns).
health_data['Raju']

Unnamed: 0_level_0,type,HG,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,50.0,36.7
2013,2,23.0,37.2
2014,1,21.0,37.4
2014,2,53.0,37.3


## MultiIndex from a DataFrame

In [None]:
# Small frame of (state, observation) pairs; each row becomes one index entry
# when the frame is converted to a MultiIndex.
# The last label was misspelled "Perecip"; it now matches the "Precip" label
# used for HI.
df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
                    ["NJ","Temp"],["NJ","Precip"]],
                    columns=['a','b'])
df

Unnamed: 0,a,b
0,HI,Temp
1,HI,Precip
2,NJ,Temp
3,NJ,Perecip


In [None]:
# Each DataFrame column becomes one index level; the level names default to
# the column names.
pd.MultiIndex.from_frame(df)

MultiIndex([('HI',    'Temp'),
            ('HI',  'Precip'),
            ('NJ',    'Temp'),
            ('NJ', 'Perecip')],
           names=['a', 'b'])

In [None]:
# Override the level names instead of inheriting the column names 'a' and 'b'.
# (Stray trailing characters after the call made this cell a SyntaxError.)
pd.MultiIndex.from_frame(df, names=['state', 'observation'])

MultiIndex([('HI',    'Temp'),
            ('HI',  'Precip'),
            ('NJ',    'Temp'),
            ('NJ', 'Perecip')],
           names=['state', 'observation'])

In [None]:
# One array per level, keyed by the level name; entry i of the index is built
# from element i of every array.
level_arrays = {"x": [1, 2], "y": [3, 4], "z": [5, 6]}
mi = pd.MultiIndex.from_arrays(
    list(level_arrays.values()),
    names=list(level_arrays),
)
mi

MultiIndex([(1, 3, 5),
            (2, 4, 6)],
           names=['x', 'y', 'z'])

In [None]:
# Names of the three levels.
mi.names

FrozenList(['x', 'y', 'z'])

In [None]:
# The distinct values available in each level.
mi.levels

FrozenList([[1, 2], [3, 4], [5, 6]])

In [None]:
# Integer positions into ``levels`` for every entry, one list per level.
mi.codes

FrozenList([[0, 1], [0, 1], [0, 1]])

In [None]:
# Number of levels in the index.
mi.nlevels

3

In [None]:
# Number of distinct values in each level.
mi.levshape

(2, 2, 2)

In [None]:
# A MultiIndex reports the generic object dtype.
mi.dtype

dtype('O')

In [None]:
# Two-level index pairing each number 1..3 with the words "one" and "two",
# ordered number-major.
idx = pd.MultiIndex.from_tuples(
    [(number, word) for number in (1, 2, 3) for word in ("one", "two")],
    names=["number", "char"],
)

In [None]:
# Display the two-level (number, char) index.
idx

MultiIndex([(1, 'one'),
            (1, 'two'),
            (2, 'one'),
            (2, 'two'),
            (3, 'one'),
            (3, 'two')],
           names=['number', 'char'])

In [None]:
# Replace the values of both levels at once. The result is a new index;
# ``idx`` itself is not reassigned here.
idx.set_levels([['a', 'b', 'c'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           names=['number', 'char'])

In [None]:
# Replace only level 0 ('number'); the 'char' level keeps its values.
idx.set_levels(['a', 'b', 'c'], level=0)

MultiIndex([('a', 'one'),
            ('a', 'two'),
            ('b', 'one'),
            ('b', 'two'),
            ('c', 'one'),
            ('c', 'two')],
           names=['number', 'char'])

In [None]:
# Replace the values of a single level selected by name. The levels of ``idx``
# are named 'number' and 'char'; the previous level='bar' named a level that
# does not exist and raised KeyError. 'char' has two values, matching the two
# replacements supplied.
idx.set_levels(['a', 'b'], level='char')

KeyError: 'Level bar not found'

In [None]:
# Twelve records: three regiments with four rows each, every regiment having
# two rows for its 1st company followed by two rows for its 2nd.
raw_data = {
    "regiment": ["Nighthawks"] * 4 + ["Dragoons"] * 4 + ["Scouts"] * 4,
    "company": ["1st", "1st", "2nd", "2nd"] * 3,
    "Rating_Score": [4, 24, 94, 25, 4, 24, 24, 31, 2, 3, 2, 3],
    "Public_Score": [25, 94, 31, 2, 70, 25, 4, 24, 31, 2, 3, 4],
}
raw_data

{'regiment': ['Nighthawks',
  'Nighthawks',
  'Nighthawks',
  'Nighthawks',
  'Dragoons',
  'Dragoons',
  'Dragoons',
  'Dragoons',
  'Scouts',
  'Scouts',
  'Scouts',
  'Scouts'],
 'company': ['1st',
  '1st',
  '2nd',
  '2nd',
  '1st',
  '1st',
  '2nd',
  '2nd',
  '1st',
  '1st',
  '2nd',
  '2nd'],
 'Rating_Score': [4, 24, 94, 25, 4, 24, 24, 31, 2, 3, 2, 3],
 'Public_Score': [25, 94, 31, 2, 70, 25, 4, 24, 31, 2, 3, 4]}

In [None]:
# Build the frame from the dict, spelling out the column order explicitly.
column_order = ["regiment", "company", "Rating_Score", "Public_Score"]
df = pd.DataFrame(raw_data, columns=column_order)
df

Unnamed: 0,regiment,company,Rating_Score,Public_Score
0,Nighthawks,1st,4,25
1,Nighthawks,1st,24,94
2,Nighthawks,2nd,94,31
3,Nighthawks,2nd,25,2
4,Dragoons,1st,4,70
5,Dragoons,1st,24,25
6,Dragoons,2nd,24,4
7,Dragoons,2nd,31,24
8,Scouts,1st,2,31
9,Scouts,1st,3,2


In [None]:
# Move regiment and company into a two-level row index.
# The frame is rebuilt from ``raw_data`` instead of from ``df`` itself: with
# ``df = df.set_index(...)`` a second run of this cell fails, because the two
# columns have already been moved into the index.
df = pd.DataFrame(
    raw_data, columns=["regiment", "company", "Rating_Score", "Public_Score"]
).set_index(["regiment", "company"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating_Score,Public_Score
regiment,company,Unnamed: 2_level_1,Unnamed: 3_level_1
Nighthawks,1st,4,25
Nighthawks,1st,24,94
Nighthawks,2nd,94,31
Nighthawks,2nd,25,2
Dragoons,1st,4,70
Dragoons,1st,24,25
Dragoons,2nd,24,4
Dragoons,2nd,31,24
Scouts,1st,2,31
Scouts,1st,3,2


In [None]:
# The row index is a MultiIndex with levels named regiment and company.
df.index

MultiIndex([('Nighthawks', '1st'),
            ('Nighthawks', '1st'),
            ('Nighthawks', '2nd'),
            ('Nighthawks', '2nd'),
            (  'Dragoons', '1st'),
            (  'Dragoons', '1st'),
            (  'Dragoons', '2nd'),
            (  'Dragoons', '2nd'),
            (    'Scouts', '1st'),
            (    'Scouts', '1st'),
            (    'Scouts', '2nd'),
            (    'Scouts', '2nd')],
           names=['regiment', 'company'])

In [None]:
# Swap the two index levels so that company comes first; the rows keep their
# original order.
df.swaplevel("regiment", "company")

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating_Score,Public_Score
company,regiment,Unnamed: 2_level_1,Unnamed: 3_level_1
1st,Nighthawks,4,25
1st,Nighthawks,24,94
2nd,Nighthawks,94,31
2nd,Nighthawks,25,2
1st,Dragoons,4,70
1st,Dragoons,24,25
2nd,Dragoons,24,4
2nd,Dragoons,31,24
1st,Scouts,2,31
1st,Scouts,3,2


In [None]:
# Total scores per regiment. The ``level=`` argument of ``DataFrame.sum`` was
# removed in pandas 2.0; grouping on the index level is the supported spelling.
# ``sort=False`` keeps the regiments in order of first appearance instead of
# sorting them alphabetically. The bare expression (rather than ``print``) lets
# the notebook render the frame as a table.
df.groupby(level="regiment", sort=False).sum()

            Rating_Score  Public_Score
regiment                              
Nighthawks           147           152
Dragoons              83           123
Scouts                10            40


In [None]:
# Sort the rows by their index labels (regiment first, then company).
df.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating_Score,Public_Score
regiment,company,Unnamed: 2_level_1,Unnamed: 3_level_1
Dragoons,1st,4,70
Dragoons,1st,24,25
Dragoons,2nd,24,4
Dragoons,2nd,31,24
Nighthawks,1st,4,25
Nighthawks,1st,24,94
Nighthawks,2nd,94,31
Nighthawks,2nd,25,2
Scouts,1st,2,31
Scouts,1st,3,2


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f5f90ba1-3290-463e-8fc6-44108f4fa21b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>