# Intro to data structures

In [1]:
import numpy as np
import polars as pl

from helper.polars import align_op

## Series

A Polars `Series` has no index, so to reproduce the label-based behaviour of a pandas `Series` we use a two-column DataFrame: an `index` column that holds the labels and a `value` column that holds the data.

In [4]:
# s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
# "index" holds the labels, "value" the data. No seed is set above, so the
# random values change on every fresh run.
s = pl.DataFrame({
    "index": ["a", "b", "c", "d", "e"],
    "value": np.random.randn(5),
})
s

index,value
str,f64
"""a""",0.781795
"""b""",-1.873919
"""c""",1.425785
"""d""",0.318918
"""e""",0.322132


In [5]:
# s.index
# The labels live in an ordinary column, so they are read like any other column.
s['index']

index
str
"""a"""
"""b"""
"""c"""
"""d"""
"""e"""


In [6]:
# pd.Series(np.random.randn(5))
# Without labels there is nothing to emulate: a plain pl.Series is enough.
pl.Series(np.random.randn(5))

2.868949
0.129001
-0.303931
0.733275
0.376526


In [11]:
# pd.Series(d)
d = {"b": 1, "a": 0, "c": 2}
# Every (key, value) pair of the dict becomes one row of the two-column frame.
pl.DataFrame(
    list(d.items()),
    schema=["index", "value"],
    orient="row",
)

index,value
str,i64
"""b""",1
"""a""",0
"""c""",2


In [12]:
# pd.Series(5.0, index=["a", "b", "c", "d", "e"])
# The scalar 5.0 is repeated for every label, as the five-row output shows.
pl.select(
    index=pl.Series(["a", "b", "c", "d", "e"]),
    value=5.0
)

index,value
str,f64
"""a""",5.0
"""b""",5.0
"""c""",5.0
"""d""",5.0
"""e""",5.0


### Series is ndarray-like 

In [13]:
# s.iloc[0]
s['value'][0]

0.7817950540050607

In [14]:
# s.iloc[:3]
s['value'][:3]

value
f64
0.781795
-1.873919
1.425785


In [16]:
# s[s > s.median()]
# Keep only the rows whose value is strictly above the column median.
s.filter(
    pl.col("value") > pl.col("value").median()
)

index,value
str,f64
"""a""",0.781795
"""c""",1.425785


In [18]:
# s.iloc[[4, 3, 1]]
# A list of integers picks rows by position, in the order given (e, d, b below).
s[[4, 3, 1]]

index,value
str,f64
"""e""",0.322132
"""d""",0.318918
"""b""",-1.873919


In [19]:
# np.exp(s)
# with_columns replaces "value" with its exponential and leaves "index" untouched.
s.with_columns(pl.col("value").exp())

index,value
str,f64
"""a""",2.185392
"""b""",0.153521
"""c""",4.161125
"""d""",1.375639
"""e""",1.380066


In [20]:
# s.dtype
s['value'].dtype

Float64

In [22]:
# s.to_numpy()
# Returns the values as a NumPy ndarray (see the array([...]) output below).
s['value'].to_numpy()

array([ 0.78179505, -1.87391946,  1.42578542,  0.31891839,  0.32213168])

### Series is dict-like

In [25]:
# s["a"]
# Filter the value column down to the row labelled "a", then unwrap the
# resulting 1x1 frame into a plain Python scalar.
s.select(
    pl.col("value").filter(pl.col("index") == "a")
).item()

0.7817950540050607

In [30]:
# s["e"] = 12.0
# with_columns returns a new frame, which is rebound to `s`: rows labelled "e"
# get 12.0, every other row keeps its current value.
s = s.with_columns(
    pl.when(pl.col('index') == 'e')
      .then(12.0)
      .otherwise(pl.col('value'))
      # Name the result explicitly: the expression references both "index"
      # and "value", so do not rely on `.name.keep()` picking the right root.
      .alias('value')
)

In [36]:
# "e" in s
# Membership is checked against the label column, not against the values.
"e" in s['index']

True

In [37]:
"f" in s['index']

False

### Vectorized operations and label alignment with Series

In [39]:
# s + s
# align_op is the project helper from helper.polars, imported in the top import cell.
# NOTE(review): presumably it matches the two frames on their label column and
# applies `op` to the value columns — confirm against helper.polars.
align_op(s, s, op=pl.Expr.add)

index,value
str,f64
"""a""",1.56359
"""b""",-3.747839
"""c""",2.851571
"""d""",0.637837
"""e""",24.0


In [41]:
# s * 2
# Carry the labels through unchanged and double only the value column.
s.select("index", pl.col("value") * 2)

index,value
str,f64
"""a""",1.56359
"""b""",-3.747839
"""c""",2.851571
"""d""",0.637837
"""e""",24.0


In [42]:
# np.exp(s)
# Same element-wise exponential as before, now including the updated "e" row.
s.select("index", pl.col("value").exp())

index,value
str,f64
"""a""",2.185392
"""b""",0.153521
"""c""",4.161125
"""d""",1.375639
"""e""",162754.791419


In [48]:
# s.iloc[1:] + s.iloc[:-1]
# The left operand drops the first row and the right operand drops the last,
# so labels "a" and "e" exist on one side only and show up empty in the output.
align_op(
    s.slice(1),
    s.slice(0, len(s) - 1),
    op=pl.Expr.add,
    fill_value=None,
    how="full",
)

index,value
str,f64
"""a""",
"""b""",-3.747839
"""c""",2.851571
"""d""",0.637837
"""e""",


### Name attribute

In [50]:
# s = pd.Series(np.random.randn(5), name="something")
# From here on `s` is a real pl.Series, no longer the two-column frame used above.
s = pl.Series(name="something", values=np.random.randn(5))
s

something
f64
0.946659
-0.22496
1.0293
0.13003
-0.660966


In [51]:
# s.name
s.name

'something'

In [52]:
# s2 = s.rename("different")
s2 = s.rename('different')
s2.name

'different'

## DataFrame

In [54]:
# df = pd.DataFrame({
#     "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
#     "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
# })
s1 = pl.DataFrame({"index": ["a", "b", "c"], "one": [1.0, 2.0, 3.0]})
s2 = pl.DataFrame({"index": ["a", "b", "c", "d"], "two": [1.0, 2.0, 3.0, 4.0]})
# A full join on the label column keeps "d", which only s2 has; with
# coalesce=True the result carries a single "index" column (see output).
df = s1.join(s2, on="index", how="full", coalesce=True)
df

index,one,two
str,f64,f64
"""a""",1.0,1.0
"""b""",2.0,2.0
"""c""",3.0,3.0
"""d""",,4.0


In [55]:
# df.index
df['index']

index
str
"""a"""
"""b"""
"""c"""
"""d"""


In [56]:
# The label column is an ordinary column here, so it is listed with the data columns.
df.columns

['index', 'one', 'two']

In [57]:
# df.columns
# Drop the label column first to get only the data columns, as pandas would report.
df.drop('index').columns

['one', 'two']

### From dict of ndarrays / lists

In [59]:
# pd.DataFrame(d)
d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}
pl.DataFrame(d)

one,two
f64,f64
1.0,4.0
2.0,3.0
3.0,2.0
4.0,1.0


In [60]:
# pd.DataFrame(d, index=["a", "b", "c", "d"])
# The labels are inserted as the first column (position 0) of the new frame.
pl.DataFrame(d).insert_column(0, pl.Series('index', ["a", "b", "c", "d"]))

index,one,two
str,f64,f64
"""a""",1.0,4.0
"""b""",2.0,3.0
"""c""",3.0,2.0
"""d""",4.0,1.0


### From structured or record array