In [2]:
import pandas as pd
import numpy as np

# <u>Multi-Index and Index Hierarchy</u>

Let us go over how to work with a MultiIndex. First, we'll create a quick example of what a multi-indexed DataFrame looks like:

## <u>zip():</u>

    In Python, zip() is a built-in function that pairs elements from two or more iterables (like lists, tuples, strings) into tuples, element by element.
    
    Basic idea:
    It takes the 1st element from each iterable and makes a tuple.
    Then it takes the 2nd element from each iterable, and so on.
    Stops when the shortest iterable runs out of elements.

    Syntax: zip(iterable1, iterable2, ...)

    Example:
    names = ["Alice", "Bob", "Charlie"]
    scores = [85, 90, 88]
    zipped = zip(names, scores)
    print(list(zipped))
    Output: [('Alice', 85), ('Bob', 90), ('Charlie', 88)]

    zip() itself doesn’t give us a list — it gives us a zip object, which is an iterator. That's why we cast it into a list unless we are using it in a for loop.

In [5]:
# Index Levels

# Outer level: one group label per row, three rows per group.
outside = ['G1'] * 3 + ['G2'] * 3
# Inner level: row numbers 1-3, restarting inside each group.
inside = [1, 2, 3] * 2

# Pair the two levels element by element into (group, number) tuples.
# zip() returns an iterator, so it is materialised with list().
hier_index = list(zip(outside, inside))

# Build the two-level MultiIndex from the list of tuples.
hier_index1 = pd.MultiIndex.from_tuples(hier_index)

In [6]:
# 6 items of 2 different types.

outside

['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

In [7]:
inside

[1, 2, 3, 1, 2, 3]

In [8]:
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [9]:
# from_tuples takes a list of tuples such as hier_index and creates a MultiIndex from it.

hier_index1

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

---

# Create a 6 x 2 DataFrame.
    Assign it to the variable d_f.
    Fill it with values drawn from a uniform random distribution.
    Index labels = hier_index.
    Columns = ['A', 'B']

In [12]:
# hier_index is a plain list of tuples, so each (group, number) pair is kept
# as one tuple label and d_f gets a single flat index level, not a hierarchy.
d_f = pd.DataFrame(
    np.random.rand(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [13]:
d_f

Unnamed: 0,A,B
"(G1, 1)",0.44433,0.998381
"(G1, 2)",0.599366,0.285571
"(G1, 3)",0.538744,0.887508
"(G2, 1)",0.942345,0.670493
"(G2, 2)",0.182045,0.071435
"(G2, 3)",0.379754,0.094962


In [14]:
# hier_index1 is a MultiIndex, so df gets a two-level row index
# (group first, then number).
# NOTE(review): no seed is set in the cells shown here, so the random values
# presumably change between runs.
df = pd.DataFrame(
    np.random.rand(6, 2),
    index=hier_index1,
    columns=['A', 'B'],
)

In [15]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.199797,0.344411
G1,2,0.943783,0.799926
G1,3,0.452616,0.437082
G2,1,0.121998,0.455097
G2,2,0.075449,0.907868
G2,3,0.88939,0.010507


- <u>NOTE:</u>

    - We can see that this DataFrame has two levels of index.
    - The outer level holds the groups G1 and G2.
    - The inner level holds the numbers 1, 2, 3 within each group, and then come the two columns A and B.
    - A DataFrame like this has <u>**multiple levels of index**</u>,
    - otherwise known as an <u>**Index Hierarchy**</u>.

---

# <u>Now let's show how to index this!</u>

### 1) To index into an index hierarchy we use df.loc[]. Selecting a label from the outer level returns the corresponding sub-DataFrame:

In [20]:
# Grab from the very outside index.

df.loc['G1']

Unnamed: 0,A,B
1,0.199797,0.344411
2,0.943783,0.799926
3,0.452616,0.437082


In [21]:
df.loc['G2']

Unnamed: 0,A,B
1,0.121998,0.455097
2,0.075449,0.907868
3,0.88939,0.010507


### 2) If this was on the columns axis, we would just use normal bracket notation df[].

In [23]:
df['A']

G1  1    0.199797
    2    0.943783
    3    0.452616
G2  1    0.121998
    2    0.075449
    3    0.889390
Name: A, dtype: float64

In [24]:
df['B']

G1  1    0.344411
    2    0.799926
    3    0.437082
G2  1    0.455097
    2    0.907868
    3    0.010507
Name: B, dtype: float64

### 3) We can continue indexing from the sub-DataFrame:

In [26]:
# Sub-DataFrame.

df.loc['G1']

Unnamed: 0,A,B
1,0.199797,0.344411
2,0.943783,0.799926
3,0.452616,0.437082


In [27]:
# Grab the first row and all columns.

df.loc['G1'].loc[1]

A    0.199797
B    0.344411
Name: 1, dtype: float64

In [28]:
# Grab the first row and all columns, this time naming the columns explicitly.

df.loc['G1'].loc[1, ['A', 'B']]

A    0.199797
B    0.344411
Name: 1, dtype: float64

In [29]:
# Grab rows 1 and 2 and all columns, naming the columns explicitly.

df.loc['G1'].loc[[1, 2], ['A', 'B']]

Unnamed: 0,A,B
1,0.199797,0.344411
2,0.943783,0.799926


In [30]:
# Grab rows 1 and 2 and all columns, using a slice (:) for the columns.

df.loc['G1'].loc[[1, 2], :]

Unnamed: 0,A,B
1,0.199797,0.344411
2,0.943783,0.799926


In [31]:
# Grab rows 1 and 2 and all columns by position, using the iloc method.

df.loc['G1'].iloc[[0, 1], :]

Unnamed: 0,A,B
1,0.199797,0.344411
2,0.943783,0.799926


- <u>NOTE:</u>

    - So the basic idea is you call from the outside index and continue calling inside deeper.

### 4) What if we want to name these index levels (G1, G2 and 1, 2, 3)?

In [34]:
# Currently unnamed.

df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.199797,0.344411
G1,2,0.943783,0.799926
G1,3,0.452616,0.437082
G2,1,0.121998,0.455097
G2,2,0.075449,0.907868
G2,3,0.88939,0.010507


In [35]:
df.index.names

FrozenList([None, None])

- <u>NOTE:</u>

    - The output is FrozenList([None, None]).
    - FrozenList is just the pandas object type that holds the index names.
    - All it's saying here is that these index levels don't have any names.
    - To name them:
        - Syntax: df.index.names = [outside_indexname, inside_indexname]
    - And the index can have as many levels as we want.

In [37]:
df.index.names = ['Groups', 'Num']

In [38]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.199797,0.344411
G1,2,0.943783,0.799926
G1,3,0.452616,0.437082
G2,1,0.121998,0.455097
G2,2,0.075449,0.907868
G2,3,0.88939,0.010507


- <u>NOTE:</u>

    - Hopefully we can begin to see how this would be useful for multilevel index information that we want to use along with a DataFrame.

In [40]:
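# The assignment below is left commented out because df has only 2 index levels, so passing 3 names raises:
# ValueError: Length of names must match number of levels in MultiIndex.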

# df.index.names = ['Groups', 'Num', '3rd Layer']

### 5) Example: Grab the value 0.907868 (group G2, row 2, column B) from df.

In [42]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.199797,0.344411
G1,2,0.943783,0.799926
G1,3,0.452616,0.437082
G2,1,0.121998,0.455097
G2,2,0.075449,0.907868
G2,3,0.88939,0.010507


In [43]:
df.loc['G2']

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.121998,0.455097
2,0.075449,0.907868
3,0.88939,0.010507


In [44]:
df.loc['G2'].loc[2, 'B']

0.9078678570842457

In [45]:
# OR

df.loc['G2'].iloc[1, 1]

0.9078678570842457

### 6) Example: Grab the values 0.075449, 0.907868, 0.88939 and 0.010507 (rows 2 and 3 of group G2) from df.

In [47]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.199797,0.344411
G1,2,0.943783,0.799926
G1,3,0.452616,0.437082
G2,1,0.121998,0.455097
G2,2,0.075449,0.907868
G2,3,0.88939,0.010507


In [48]:
df.loc['G2'].iloc[[1, 2], :]

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.075449,0.907868
3,0.88939,0.010507


---

# <u>xs()</u>

    Returns a cross-section (rows or columns) from the Series/DataFrame.
    Used when we have a multilevel index.
    What's nice about a cross-section is that it can skip the outer levels and select directly on an inner level of a multilevel index.
    Syntax: df.xs(key, level)
            where;
                    key   : label to select within the chosen level
                    level : the index level (name or position) that key refers to; defaults to the first n levels

In [51]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.199797,0.344411
G1,2,0.943783,0.799926
G1,3,0.452616,0.437082
G2,1,0.121998,0.455097
G2,2,0.075449,0.907868
G2,3,0.88939,0.010507


### 1) Example: Grab the sub-DataFrame G1.

In [53]:
df.loc['G1']

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.199797,0.344411
2,0.943783,0.799926
3,0.452616,0.437082


### 2) Another way would be to use xs function (cross-section).

In [55]:
# NOTE we are not using square brackets here.
# df.xs(key)

df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.199797,0.344411
2,0.943783,0.799926
3,0.452616,0.437082


#### But what if we want to grab row 1 from both G1 and G2? That would be tricky with the loc method, and this is where xs() comes in handy.

In [57]:
df.xs(1, level = 'Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.199797,0.344411
G2,0.121998,0.455097


---