# Pandas
## DataFrames - MultiIndex - for working with > 2 dimensions



In [11]:
import numpy as np
import pandas as pd

**Create a list of two category indexes**   
City and Year

In [12]:
index = [('California', 2000), ('California', 2010),
         ('Texas', 2000), ('Texas', 2010),
                    ('New York', 2000), ('New York', 2010),
                    ]


**Create a multi-index from tuples, creating multiple levels of indexing**  
City/Year

In [13]:
mul_index = pd.MultiIndex.from_tuples(index)
mul_index

MultiIndex([('California', 2000),
            ('California', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010),
            (  'New York', 2000),
            (  'New York', 2010)],
           )

**Create a dataframe and assigm the multipleindexes**

In [14]:
populations = [33871648, 37253956,
                          18976457, 19378102,
                          20851820, 25145561]
areas = [33871648, 37253956,
                          18976457, 19378102,
                          20851820, 25145561]

pop = pd.DataFrame({'population': populations, "area": areas}, index=mul_index)
pop

Unnamed: 0,Unnamed: 1,population,area
California,2000,33871648,33871648
California,2010,37253956,37253956
Texas,2000,18976457,18976457
Texas,2010,19378102,19378102
New York,2000,20851820,20851820
New York,2010,25145561,25145561


**Rename indexes**

In [16]:
pop.index.names = ['state', 'year']

pop

Unnamed: 0_level_0,Unnamed: 1_level_0,population,area
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1
California,2000,33871648,33871648
California,2010,37253956,37253956
Texas,2000,18976457,18976457
Texas,2010,19378102,19378102
New York,2000,20851820,20851820
New York,2010,25145561,25145561


**Accessing multindex dataframe**

In [21]:
pop.loc['California']

Unnamed: 0_level_0,population,area
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,33871648,33871648
2010,37253956,37253956


In [24]:
pop.loc['California']['population']

year
2000    33871648
2010    37253956
Name: population, dtype: int64

In [25]:
pop.loc['California'].loc[2000]

population    33871648
area          33871648
Name: 2000, dtype: int64

In [27]:
pop.loc['California'].loc[2000]['population']

33871648

**Access an inner level MultiIndex with cross-section (xs)**  
specify level index or name

In [29]:
pop.xs(2010, level=1)

Unnamed: 0_level_0,population,area
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,37253956,37253956
Texas,19378102,19378102
New York,25145561,25145561


In [30]:
pop.xs(2010, level='year')

Unnamed: 0_level_0,population,area
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,37253956,37253956
Texas,19378102,19378102
New York,25145561,25145561


In [11]:
pop.xs('California')

Unnamed: 0_level_0,population,area
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,33871648,33871648
2010,37253956,37253956


In [12]:
pop.loc[('California', 2010)]

population    37253956
area          37253956
Name: (California, 2010), dtype: int64

In [14]:
pop.xs(('California', 2010))

population    37253956
area          37253956
Name: (California, 2010), dtype: int64

**Indexes need to be sorted first to call in alphabetical order**

In [15]:
pop = pop.sort_index()
pop['population']['California':'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    20851820
            2010    25145561
Name: population, dtype: int64

In [16]:
pop2 = pd.DataFrame({'population': populations}, index=index)

**Stack
Convert a multiply- indexed Series into a conventionally indexed DataFrame**


In [17]:
pop.stack()

state       year            
California  2000  population    33871648
                  area          33871648
            2010  population    37253956
                  area          37253956
New York    2000  population    20851820
                  area          20851820
            2010  population    25145561
                  area          25145561
Texas       2000  population    18976457
                  area          18976457
            2010  population    19378102
                  area          19378102
dtype: int64

In [18]:
pop.stack().unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,population,area
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1
California,2000,33871648,33871648
California,2010,37253956,37253956
New York,2000,20851820,20851820
New York,2010,25145561,25145561
Texas,2000,18976457,18976457
Texas,2010,19378102,19378102


**Reset indexes and add previous index as columns**

In [20]:
pop_flat = pop.reset_index()
pop_flat

Unnamed: 0,state,year,population,area
0,California,2000,33871648,33871648
1,California,2010,37253956,37253956
2,New York,2000,20851820,20851820
3,New York,2010,25145561,25145561
4,Texas,2000,18976457,18976457
5,Texas,2010,19378102,19378102


**Create MultiIndex from columns (and drop previous indexes)**

In [22]:
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population,area
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1
California,2000,33871648,33871648
California,2010,37253956,37253956
New York,2000,20851820,20851820
New York,2010,25145561,25145561
Texas,2000,18976457,18976457
Texas,2010,19378102,19378102


**Create a health_data mock dataset with hierarchical indices and columns**

**hierarchical indices**


In [23]:
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],names=['year', 'visit'])

**hierarchical columns**

In [25]:
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

**mock data**

In [27]:
data = np.round(np.random.randn(4, 6), 1) 
data[:, ::2] *= 10
data += 37

**create the DataFrame**

In [29]:
health_data = pd.DataFrame(data, index=index, columns=columns) 
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,41.0,37.6,45.0,36.7,50.0,35.6
2013,2,36.0,37.5,23.0,39.1,18.0,36.2
2014,1,57.0,37.8,37.0,36.7,37.0,37.9
2014,2,48.0,37.5,38.0,37.3,54.0,36.9


**Aggregation at year level (level 0)**

In [31]:
# data_mean = health_data.mean(level=0)
data_mean = health_data.mean(level='year')

data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,38.5,37.55,34.0,37.9,34.0,35.9
2014,52.5,37.65,37.5,37.0,45.5,37.4


**Aggregation at type level (level 1 along axis 1)
data_mean = health_data.mean(axis=1, level=1)**


In [189]:
data_mean.mean(axis=1, level='type')
data_mean

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,50.333333,36.866667
2013,2,42.333333,37.066667
2014,1,35.666667,37.333333
2014,2,30.333333,36.033333
