# Hierarchy in Indexes

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(
    'WSH_HYGIENE_BASIC.csv', 
    header=[0, 2],
    index_col=0,
)  
df.head()

Unnamed: 0_level_0,2015,2015,2015,2014,2014,2014,2013,2013,2013,2012,...,2003,2002,2002,2002,2001,2001,2001,2000,2000,2000
Country,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,...,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total
Afghanistan,29.0,64.0,38.0,29.0,64.0,38.0,29.0,64.0,38.0,29.0,...,,,,,,,,,,
Algeria,73.0,88.0,84.0,73.0,88.0,83.0,73.0,88.0,83.0,73.0,...,,,,,,,,,,
Angola,15.0,37.0,25.0,15.0,37.0,24.0,15.0,37.0,24.0,15.0,...,,,,,,,,,,
Armenia,77.0,93.0,87.0,77.0,93.0,87.0,77.0,93.0,87.0,77.0,...,85.0,68.0,94.0,84.0,67.0,94.0,84.0,66.0,94.0,84.0
Bangladesh,31.0,58.0,40.0,31.0,58.0,40.0,31.0,58.0,40.0,31.0,...,,,,,,,,,,


In [3]:
# How coud we have made something like the chart above with the basics of Pandas
df.loc['Afghanistan':'Bangladesh', '2015']

Country,Rural,Urban,Total
Afghanistan,29.0,64.0,38.0
Algeria,73.0,88.0,84.0
Angola,15.0,37.0,25.0
Armenia,77.0,93.0,87.0
Bangladesh,31.0,58.0,40.0


In [4]:
rural = {
    'Afghanistan': 29.0,
    'Algeria': 73.0,
    'Angola': 15.0,
    'Armenia': 77.0,
    'Bangladesh': 31.0,
}

urban = {
    'Afghanistan': 64.0,
    'Algeria': 88.0,
    'Angola': 37.0,
    'Armenia': 93.0,
    'Bangladesh': 58.0,
}

total = {
    'Afghanistan': 38.0,
    'Algeria': 84.0,
    'Angola': 25.0,
    'Armenia': 87.0,
    'Bangladesh': 40.0,
}

data = {
    'Rural': pd.Series(rural),
    'Urban': pd.Series(urban),
    'Total': pd.Series(total),
}

df = pd.DataFrame(data)  # Not quite it..
df

Unnamed: 0,Rural,Urban,Total
Afghanistan,29.0,64.0,38.0
Algeria,73.0,88.0,84.0
Angola,15.0,37.0,25.0
Armenia,77.0,93.0,87.0
Bangladesh,31.0,58.0,40.0


In [5]:
columns = pd.MultiIndex.from_product([['2015'], ['Rural', 'Total', 'Urban']])
columns

MultiIndex([('2015', 'Rural'),
            ('2015', 'Total'),
            ('2015', 'Urban')],
           )

In [6]:
index = pd.Index(['Afghanistan', 'Algeria', 'Angola', 'Armenia', 'Bangladesh'])
index

Index(['Afghanistan', 'Algeria', 'Angola', 'Armenia', 'Bangladesh'], dtype='object')

In [7]:
values = df.values
values

array([[29., 64., 38.],
       [73., 88., 84.],
       [15., 37., 25.],
       [77., 93., 87.],
       [31., 58., 40.]])

In [8]:
df = pd.DataFrame(values, index=index, columns=columns)
df

Unnamed: 0_level_0,2015,2015,2015
Unnamed: 0_level_1,Rural,Total,Urban
Afghanistan,29.0,64.0,38.0
Algeria,73.0,88.0,84.0
Angola,15.0,37.0,25.0
Armenia,77.0,93.0,87.0
Bangladesh,31.0,58.0,40.0


In [9]:
df.stack()  # Move a level of columns to be part of the row index

Unnamed: 0,Unnamed: 1,2015
Afghanistan,Rural,29.0
Afghanistan,Total,64.0
Afghanistan,Urban,38.0
Algeria,Rural,73.0
Algeria,Total,88.0
Algeria,Urban,84.0
Angola,Rural,15.0
Angola,Total,37.0
Angola,Urban,25.0
Armenia,Rural,77.0


In [10]:
df.stack().stack()  # Move another level of column to be part of the row index

Afghanistan  Rural  2015    29.0
             Total  2015    64.0
             Urban  2015    38.0
Algeria      Rural  2015    73.0
             Total  2015    88.0
             Urban  2015    84.0
Angola       Rural  2015    15.0
             Total  2015    37.0
             Urban  2015    25.0
Armenia      Rural  2015    77.0
             Total  2015    93.0
             Urban  2015    87.0
Bangladesh   Rural  2015    31.0
             Total  2015    58.0
             Urban  2015    40.0
dtype: float64

In [11]:
df.stack().stack().unstack()  # Move a level of index to be part of the columns

Unnamed: 0,Unnamed: 1,2015
Afghanistan,Rural,29.0
Afghanistan,Total,64.0
Afghanistan,Urban,38.0
Algeria,Rural,73.0
Algeria,Total,88.0
Algeria,Urban,84.0
Angola,Rural,15.0
Angola,Total,37.0
Angola,Urban,25.0
Armenia,Rural,77.0


In [12]:
df.stack().stack().unstack().unstack()  # Move a level of index to be part of the columns

Unnamed: 0_level_0,2015,2015,2015
Unnamed: 0_level_1,Rural,Total,Urban
Afghanistan,29.0,64.0,38.0
Algeria,73.0,88.0,84.0
Angola,15.0,37.0,25.0
Armenia,77.0,93.0,87.0
Bangladesh,31.0,58.0,40.0


### Indexing

In [13]:
df = pd.read_csv(
    'WSH_HYGIENE_BASIC.csv', 
    header=[0, 2],
    index_col=0,
)  
df.head()

Unnamed: 0_level_0,2015,2015,2015,2014,2014,2014,2013,2013,2013,2012,...,2003,2002,2002,2002,2001,2001,2001,2000,2000,2000
Country,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,...,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total
Afghanistan,29.0,64.0,38.0,29.0,64.0,38.0,29.0,64.0,38.0,29.0,...,,,,,,,,,,
Algeria,73.0,88.0,84.0,73.0,88.0,83.0,73.0,88.0,83.0,73.0,...,,,,,,,,,,
Angola,15.0,37.0,25.0,15.0,37.0,24.0,15.0,37.0,24.0,15.0,...,,,,,,,,,,
Armenia,77.0,93.0,87.0,77.0,93.0,87.0,77.0,93.0,87.0,77.0,...,85.0,68.0,94.0,84.0,67.0,94.0,84.0,66.0,94.0,84.0
Bangladesh,31.0,58.0,40.0,31.0,58.0,40.0,31.0,58.0,40.0,31.0,...,,,,,,,,,,


In [14]:
df.loc[:, '2015'].head()  # First index operation operates on the outer layer of the hierarchical index.

Country,Rural,Urban,Total
Afghanistan,29.0,64.0,38.0
Algeria,73.0,88.0,84.0
Angola,15.0,37.0,25.0
Armenia,77.0,93.0,87.0
Bangladesh,31.0,58.0,40.0


In [15]:
df.loc[:, ('2015', 'Rural')].head()  # Second moves us down through the second level of the index, use tuple

Afghanistan    29.0
Algeria        73.0
Angola         15.0
Armenia        77.0
Bangladesh     31.0
Name: (2015, Rural), dtype: float64

In [16]:
df.loc['Algeria', ('2015', 'Rural')]

73.0

### Slicing

In [17]:
df.loc['Chad':'Costa Rica']  # Slicing still works

Unnamed: 0_level_0,2015,2015,2015,2014,2014,2014,2013,2013,2013,2012,...,2003,2002,2002,2002,2001,2001,2001,2000,2000,2000
Country,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,...,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total
Chad,2.0,18.0,6.0,2.0,18.0,6.0,2.0,18.0,6.0,2.0,...,,,,,,,,,,
Comoros,15.0,18.0,16.0,15.0,18.0,16.0,15.0,18.0,16.0,15.0,...,,,,,,,,,,
Costa Rica,83.0,84.0,84.0,83.0,84.0,84.0,83.0,84.0,84.0,83.0,...,,,,,,,,,,


In [18]:
df.loc['Chad':'Costa Rica', '2010':'2015']  # UnsortedIndexError, hmmm

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [19]:
df.columns.lexsort_depth  # Looks like the column indexes are not sorted by Pandas terms

0

In [20]:
df = df.sort_index(axis=1)  # Sort the column index with axis = 1
df.head()  # Notice years ascend now

Unnamed: 0_level_0,2000,2000,2000,2001,2001,2001,2002,2002,2002,2003,...,2012,2013,2013,2013,2014,2014,2014,2015,2015,2015
Country,Rural,Total,Urban,Rural,Total,Urban,Rural,Total,Urban,Rural,...,Urban,Rural,Total,Urban,Rural,Total,Urban,Rural,Total,Urban
Afghanistan,,,,,,,,,,,...,64.0,29.0,38.0,64.0,29.0,38.0,64.0,29.0,38.0,64.0
Algeria,,,,,,,,,,,...,88.0,73.0,83.0,88.0,73.0,83.0,88.0,73.0,84.0,88.0
Angola,,,,,,,,,,,...,37.0,15.0,24.0,37.0,15.0,24.0,37.0,15.0,25.0,37.0
Armenia,66.0,84.0,94.0,67.0,84.0,94.0,68.0,84.0,94.0,69.0,...,93.0,77.0,87.0,93.0,77.0,87.0,93.0,77.0,87.0,93.0
Bangladesh,,,,,,,,,,,...,58.0,31.0,40.0,58.0,31.0,40.0,58.0,31.0,40.0,58.0


In [21]:
df.columns.lexsort_depth

2

In [22]:
df.loc['Chad':'Costa Rica', '2010':'2015']

Unnamed: 0_level_0,2010,2010,2010,2011,2011,2011,2012,2012,2012,2013,2013,2013,2014,2014,2014,2015,2015,2015
Country,Rural,Total,Urban,Rural,Total,Urban,Rural,Total,Urban,Rural,Total,Urban,Rural,Total,Urban,Rural,Total,Urban
Chad,,,,2.0,6.0,18.0,2.0,6.0,18.0,2.0,6.0,18.0,2.0,6.0,18.0,2.0,6.0,18.0
Comoros,15.0,16.0,18.0,15.0,16.0,18.0,15.0,16.0,18.0,15.0,16.0,18.0,15.0,16.0,18.0,15.0,16.0,18.0
Costa Rica,83.0,84.0,84.0,83.0,84.0,84.0,83.0,84.0,84.0,83.0,84.0,84.0,83.0,84.0,84.0,83.0,84.0,84.0


In [23]:
df.loc['Chad':'Costa Rica', ('2010':'2015', 'Rural')]  # SyntaxErrors

SyntaxError: invalid syntax (<ipython-input-23-2650bc3e9274>, line 1)

In [24]:
years = pd.IndexSlice['2010':'2015', 'Rural']
df.loc['Chad':'Costa Rica', years]

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015
Country,Rural,Rural,Rural,Rural,Rural,Rural
Chad,,2.0,2.0,2.0,2.0,2.0
Comoros,15.0,15.0,15.0,15.0,15.0,15.0
Costa Rica,83.0,83.0,83.0,83.0,83.0,83.0


In [25]:
# Setting, Reseting indexes
df = pd.read_csv(
    'WSH_HYGIENE_BASIC.csv', 
    header=[0, 2],
    index_col=0,
)  
df.head()

Unnamed: 0_level_0,2015,2015,2015,2014,2014,2014,2013,2013,2013,2012,...,2003,2002,2002,2002,2001,2001,2001,2000,2000,2000
Country,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,...,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total
Afghanistan,29.0,64.0,38.0,29.0,64.0,38.0,29.0,64.0,38.0,29.0,...,,,,,,,,,,
Algeria,73.0,88.0,84.0,73.0,88.0,83.0,73.0,88.0,83.0,73.0,...,,,,,,,,,,
Angola,15.0,37.0,25.0,15.0,37.0,24.0,15.0,37.0,24.0,15.0,...,,,,,,,,,,
Armenia,77.0,93.0,87.0,77.0,93.0,87.0,77.0,93.0,87.0,77.0,...,85.0,68.0,94.0,84.0,67.0,94.0,84.0,66.0,94.0,84.0
Bangladesh,31.0,58.0,40.0,31.0,58.0,40.0,31.0,58.0,40.0,31.0,...,,,,,,,,,,


In [26]:
df.reset_index(col_fill='Ct').head()  # Turn the country index into a regular column.

Unnamed: 0_level_0,index,2015,2015,2015,2014,2014,2014,2013,2013,2013,...,2003,2002,2002,2002,2001,2001,2001,2000,2000,2000
Country,Ct,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total,...,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total
0,Afghanistan,29.0,64.0,38.0,29.0,64.0,38.0,29.0,64.0,38.0,...,,,,,,,,,,
1,Algeria,73.0,88.0,84.0,73.0,88.0,83.0,73.0,88.0,83.0,...,,,,,,,,,,
2,Angola,15.0,37.0,25.0,15.0,37.0,24.0,15.0,37.0,24.0,...,,,,,,,,,,
3,Armenia,77.0,93.0,87.0,77.0,93.0,87.0,77.0,93.0,87.0,...,85.0,68.0,94.0,84.0,67.0,94.0,84.0,66.0,94.0,84.0
4,Bangladesh,31.0,58.0,40.0,31.0,58.0,40.0,31.0,58.0,40.0,...,,,,,,,,,,


In [27]:
# https://catalog.data.gov/dataset/demographic-statistics-by-zip-code-acfc9
df = pd.read_csv(
    'Demographic_Statistics_By_Zip_Code.csv', 
)  
df.head()

Unnamed: 0,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,...,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL
0,10001,44,22,0.5,22,0.5,0,0,44,100,...,44,100,20,0.45,24,0.55,0,0,44,100
1,10002,35,19,0.54,16,0.46,0,0,35,100,...,35,100,2,0.06,33,0.94,0,0,35,100
2,10003,1,1,1.0,0,0.0,0,0,1,100,...,1,100,0,0.0,1,1.0,0,0,1,100
3,10004,0,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0.0,0,0.0,0,0,0,0
4,10005,2,2,1.0,0,0.0,0,0,2,100,...,2,100,0,0.0,2,1.0,0,0,2,100


In [28]:
df.set_index('JURISDICTION NAME').head()  # Take a current column and make it an index

Unnamed: 0_level_0,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,...,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL
JURISDICTION NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,44,22,0.5,22,0.5,0,0,44,100,0,...,44,100,20,0.45,24,0.55,0,0,44,100
10002,35,19,0.54,16,0.46,0,0,35,100,0,...,35,100,2,0.06,33,0.94,0,0,35,100
10003,1,1,1.0,0,0.0,0,0,1,100,0,...,1,100,0,0.0,1,1.0,0,0,1,100
10004,0,0,0.0,0,0.0,0,0,0,0,0,...,0,0,0,0.0,0,0.0,0,0,0,0
10005,2,2,1.0,0,0.0,0,0,2,100,0,...,2,100,0,0.0,2,1.0,0,0,2,100


In [29]:
# Aggregations
df = pd.read_csv(
    'WSH_HYGIENE_BASIC.csv', 
    header=[0, 2],
    index_col=0,
)  
df.head()

Unnamed: 0_level_0,2015,2015,2015,2014,2014,2014,2013,2013,2013,2012,...,2003,2002,2002,2002,2001,2001,2001,2000,2000,2000
Country,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,...,Total,Rural,Urban,Total,Rural,Urban,Total,Rural,Urban,Total
Afghanistan,29.0,64.0,38.0,29.0,64.0,38.0,29.0,64.0,38.0,29.0,...,,,,,,,,,,
Algeria,73.0,88.0,84.0,73.0,88.0,83.0,73.0,88.0,83.0,73.0,...,,,,,,,,,,
Angola,15.0,37.0,25.0,15.0,37.0,24.0,15.0,37.0,24.0,15.0,...,,,,,,,,,,
Armenia,77.0,93.0,87.0,77.0,93.0,87.0,77.0,93.0,87.0,77.0,...,85.0,68.0,94.0,84.0,67.0,94.0,84.0,66.0,94.0,84.0
Bangladesh,31.0,58.0,40.0,31.0,58.0,40.0,31.0,58.0,40.0,31.0,...,,,,,,,,,,


In [30]:
df.min(axis=1, level=0).head()  # Across all locations...

Unnamed: 0,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
Afghanistan,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,,,,,
Algeria,73.0,73.0,73.0,73.0,73.0,73.0,73.0,,,,,,,,,
Angola,15.0,15.0,15.0,15.0,,,,,,,,,,,,
Armenia,77.0,77.0,77.0,77.0,76.0,75.0,74.0,73.0,73.0,72.0,71.0,70.0,69.0,68.0,67.0,66.0
Bangladesh,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,,,,,


In [31]:
df.mean(axis=1, level=1).head()  # For all years...

Country,Rural,Urban,Total
Afghanistan,29.0,64.0,37.545455
Algeria,73.0,88.0,83.142857
Angola,15.0,37.0,24.25
Armenia,72.625,93.25,85.8125
Bangladesh,31.0,58.0,39.181818


We have enough tools to do some basic analysis!