# MultiIndex / advanced indexing

In [7]:
from itertools import product

import numpy as np
import polars as pl
from polars import selectors as cs

from helper.jupyter import row
from helper.polars import align_op

## Hierarchical indexing (MultiIndex)

### Creating a MultiIndex (hierarchical index) object

In [6]:
# Two parallel label lists: position i of each list forms the i-th (first, second) key.
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]
# Pair the labels up so that each tuple is one row of the key frame.
tuples = list(zip(arrays[0], arrays[1]))
# A plain two-column frame stands in for the pandas MultiIndex; each row is one key.
index = pl.DataFrame(data=tuples, schema=["first", "second"], orient="row")
index

first,second
str,str
"""bar""","""one"""
"""bar""","""two"""
"""baz""","""one"""
"""baz""","""two"""
"""foo""","""one"""
"""foo""","""two"""
"""qux""","""one"""
"""qux""","""two"""


In [9]:
# Attach a column of 8 standard-normal draws to the key frame, one value per key row.
s = index.with_columns(pl.Series("value", np.random.randn(8)))
s

first,second,value
str,str,f64
"""bar""","""one""",0.250211
"""bar""","""two""",-1.92903
"""baz""","""one""",-0.926871
"""baz""","""two""",0.119729
"""foo""","""one""",1.100611
"""foo""","""two""",-0.766552
"""qux""","""one""",1.397529
"""qux""","""two""",-0.753019


In [14]:
# Build the same key frame from the Cartesian product of the level values:
# every value of the first level is paired with every value of the second.
# (`product` is imported in the notebook's top import cell.)
iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]]
pl.DataFrame(list(product(*iterables)), schema=["first", "second"], orient="row")

first,second
str,str
"""bar""","""one"""
"""bar""","""two"""
"""baz""","""one"""
"""baz""","""two"""
"""foo""","""one"""
"""foo""","""two"""
"""qux""","""one"""
"""qux""","""two"""


In [17]:
# Four (first, second) keys, written column by column.
df = pl.DataFrame(
    {
        "first": ["bar", "bar", "foo", "foo"],
        "second": ["one", "two", "one", "two"],
    }
)

In [43]:
# The label lists again, this time as NumPy string arrays.
arrays = [
    np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]),
    np.array(["one", "two", "one", "two", "one", "two", "one", "two"]),
]
# Name the two key columns index0/index1 when the frame is built, then add
# a column of 8 random values next to them.
s = pl.DataFrame(arrays, schema=["index0", "index1"]).with_columns(
    value=np.random.randn(8)
)
s

index0,index1,value
str,str,f64
"""bar""","""one""",1.314541
"""bar""","""two""",0.490712
"""baz""","""one""",0.224043
"""baz""","""two""",0.205625
"""foo""","""one""",-0.005724
"""foo""","""two""",-0.894622
"""qux""","""one""",2.179859
"""qux""","""two""",-0.483736


In [22]:
# Without a schema the columns receive the default names column_0, column_1.
pl.DataFrame(data=arrays)

column_0,column_1
str,str
"""bar""","""one"""
"""bar""","""two"""
"""baz""","""one"""
"""baz""","""two"""
"""foo""","""one"""
"""foo""","""two"""
"""qux""","""one"""
"""qux""","""two"""


In [23]:
# An 8x4 float array becomes four f64 columns, auto-named column_0 .. column_3.
pl.DataFrame(data=np.random.randn(8, 4))

column_0,column_1,column_2,column_3
f64,f64,f64,f64
-0.179486,-1.069504,-0.447659,0.048772
-0.636657,-0.348154,-0.008841,0.086617
0.630096,0.744289,-1.200196,0.243811
0.227324,0.311629,0.20198,-0.04592
0.925253,-0.580234,0.504974,0.477518
-1.338663,0.365874,2.511466,0.190634
-0.009257,0.074639,-0.720652,-2.037116
-1.272381,-0.128876,1.754089,0.196494


In [26]:
# Place the two key columns and four columns of random values side by side.
pl.concat(
    items=[
        pl.DataFrame(data=arrays, schema=["index0", "index1"]),
        pl.DataFrame(data=np.random.randn(8, 4)),
    ],
    how="horizontal",
)

index0,index1,column_0,column_1,column_2,column_3
str,str,f64,f64,f64,f64
"""bar""","""one""",0.448595,0.439349,-0.215742,-0.305499
"""bar""","""two""",0.305583,1.749769,1.593628,-0.822102
"""baz""","""one""",2.108508,-0.267871,-0.254204,1.507438
"""baz""","""two""",0.078844,1.204617,0.811816,0.386158
"""foo""","""one""",0.453133,0.944454,0.356213,1.207111
"""foo""","""two""",-0.333459,0.039692,-0.970556,0.445848
"""qux""","""one""",0.427988,-0.12522,1.188076,-0.093413
"""qux""","""two""",0.343967,-0.024648,0.122142,-1.565558


In [38]:
# Column-wise MultiIndex emulation: each (first, second) key of `index` is
# flattened into a single "first-second" column name, and the row labels
# A/B/C are stored as an ordinary leading column called "index".
df = pl.DataFrame(
    np.random.randn(3, 8),
    schema=[
        f"{first}-{second}"
        for first, second in zip(index["first"], index["second"])
    ],
).insert_column(0, pl.Series("index", ["A", "B", "C"]))
df

index,bar-one,bar-two,baz-one,baz-two,foo-one,foo-two,qux-one,qux-two
str,f64,f64,f64,f64,f64,f64,f64,f64
"""A""",-0.348412,1.579732,-1.442768,1.701454,0.108949,1.269859,-1.03049,-0.268433
"""B""",0.260513,0.174605,-0.284583,2.283266,1.444297,-1.031823,0.191269,1.2516
"""C""",0.693662,0.033336,-2.679877,2.060036,-1.424747,-0.597285,1.206023,-0.822538


### Reconstructing the level labels

### Basic indexing on axis with MultiIndex

In [40]:
# df["bar"]
# With the column keys flattened into "<first>-<second>" names, picking the
# outer key "bar" becomes a prefix match on the column names. The row-label
# column is listed explicitly so it is kept in the result.
# (`cs` is polars.selectors, imported in the notebook's top import cell.)
df.select("index", cs.starts_with("bar-"))

index,bar-one,bar-two
str,f64,f64
"""A""",-0.348412,1.579732
"""B""",0.260513,0.174605
"""C""",0.693662,0.033336


In [42]:
# df["bar", "one"]
df.select(
    'index',
    pl.col('bar-one'))

index,bar-one
str,f64
"""A""",-0.348412
"""B""",0.260513
"""C""",0.693662


In [45]:
# s["qux"]
# Selecting by the outer key is a row filter on the first key column.
s.filter(pl.col("index0").eq("qux"))

index0,index1,value
str,str,f64
"""qux""","""one""",2.179859
"""qux""","""two""",-0.483736


### Defined levels

### Data alignment and using reindex

In [50]:
# s + s[:-2]
# Add `s` to a copy of itself that lacks the last two rows, matching rows on
# the two key columns. `head(-2)` is the direct counterpart of the pandas
# slice `[:-2]` and avoids hand-computed lengths, which would go negative for
# a frame with fewer than two rows.
# NOTE(review): align_op is a project helper (helper.polars, imported in the
# top import cell) whose implementation is not shown here; judging by the
# displayed result, keys missing from one operand get a null value — confirm.
align_op(s, s.head(-2), pl.Expr.add, on=["index0", "index1"], fill_value=None)

index0,index1,value
str,str,f64
"""bar""","""one""",2.629082
"""bar""","""two""",0.981425
"""baz""","""one""",0.448086
"""baz""","""two""",0.411249
"""foo""","""one""",-0.011448
"""foo""","""two""",-1.789243
"""qux""","""one""",
"""qux""","""two""",


In [53]:
# s + s[::2]
# Same key-aligned addition, this time against every second row of `s`.
align_op(
    s,
    s.gather_every(n=2),
    pl.Expr.add,
    on=["index0", "index1"],
    fill_value=None,
)

index0,index1,value
str,str,f64
"""bar""","""one""",2.629082
"""bar""","""two""",
"""baz""","""one""",0.448086
"""baz""","""two""",
"""foo""","""one""",-0.011448
"""foo""","""two""",
"""qux""","""one""",4.359717
"""qux""","""two""",


In [58]:
# s.reindex(index[:3])
# Reindexing is driven by the *target* labels, so the label frame goes on the
# left of a left join: every requested key produces a row, and a key that is
# absent from `s` comes back with a null `value` instead of being dropped
# (an inner join anchored on `s` would drop it and follow `s`'s row order).
# The label columns are renamed first so the result keeps `s`'s column names.
# NOTE(review): recent polars releases only guarantee left-row order when
# `maintain_order="left"` is passed to join — add it if the installed version
# supports the parameter.
index[:3].rename({"first": "index0", "second": "index1"}).join(
    s, on=["index0", "index1"], how="left"
)

index0,index1,value
str,str,f64
"""bar""","""one""",1.314541
"""bar""","""two""",0.490712
"""baz""","""one""",0.224043


In [93]:
# Build the binary expression a + b, then take it apart again: meta.pop()
# returns the input expressions of the outermost operation, unpacked here
# into two names.
e = pl.col("a").add(pl.col("b"))
(e1, e2) = e.meta.pop()

In [99]:
# IPython-only introspection (not valid plain Python): `??` prints the object's
# type, string form, defining file and source. Exploratory aid; safe to drop
# from a finished notebook.
e??

[1;31mType:[0m        Expr
[1;31mString form:[0m [(col("a")) + (col("b"))]
[1;31mFile:[0m        c:\micromamba\envs\cad\lib\site-packages\polars\expr\expr.py
[1;31mSource:[0m     
[1;32mclass[0m [0mExpr[0m[1;33m:[0m[1;33m
[0m    [1;34m"""Expressions that can be used in various contexts."""[0m[1;33m
[0m[1;33m
[0m    [0m_pyexpr[0m[1;33m:[0m [0mPyExpr[0m [1;33m=[0m [1;32mNone[0m[1;33m
[0m    [0m_accessors[0m[1;33m:[0m [0mClassVar[0m[1;33m[[0m[0mset[0m[1;33m[[0m[0mstr[0m[1;33m][0m[1;33m][0m [1;33m=[0m [1;33m{[0m[1;33m
[0m        [1;34m"arr"[0m[1;33m,[0m[1;33m
[0m        [1;34m"cat"[0m[1;33m,[0m[1;33m
[0m        [1;34m"dt"[0m[1;33m,[0m[1;33m
[0m        [1;34m"list"[0m[1;33m,[0m[1;33m
[0m        [1;34m"meta"[0m[1;33m,[0m[1;33m
[0m        [1;34m"name"[0m[1;33m,[0m[1;33m
[0m        [1;34m"str"[0m[1;33m,[0m[1;33m
[0m        [1;34m"bin"[0m[1;33m,[0m[1;33m
[0m        [1;34m"struct"[0m[1;33