In [1]:
# `iris` is the example dataset used in the pack() demo further down.
from datar.datasets import iris
# Star import: presumably the source of the datar names used unqualified in
# this notebook (tibble, pack, unpack, f, c, starts_with, runif, ...).
from datar.all import *

# Run the local helper script in this namespace; it is expected to define
# nb_header (and presumably try_catch, used in a later cell) — TODO confirm.
%run nb_helpers.py
# Display the reference documentation for the two verbs demonstrated below.
nb_header(pack, unpack)



### # pack  

##### Makes df narrow by collapsing a set of columns into a single df-column.

##### Args:
&emsp;&emsp;`_data`: A data frame  
&emsp;&emsp;`**cols`: Columns to pack  
&emsp;&emsp;`_names_sep`: If `None`, the default, the names will be left as is.  
&emsp;&emsp;&emsp;&emsp;Inner names will come from the former outer names  
&emsp;&emsp;&emsp;&emsp;If a string, the inner and outer names will be used together.  
&emsp;&emsp;&emsp;&emsp;The new inner names will have the outer name plus `_names_sep`  
&emsp;&emsp;&emsp;&emsp;automatically stripped (e.g. packing `x_a` into `x` with  
&emsp;&emsp;&emsp;&emsp;`_names_sep="_"` gives the inner name `a`).  

&emsp;&emsp;`base0_`: Whether `**cols` are 0-based  
&emsp;&emsp;&emsp;&emsp;if not provided, will use `datar.base.get_option('index.base.0')`  


### # unpack  

##### Makes df wider by expanding df-columns back out into individual columns.

For empty columns, the column is kept as is instead of being removed.  

##### Args:
&emsp;&emsp;`data`: A data frame  
&emsp;&emsp;`cols`: Columns to unpack  
&emsp;&emsp;`names_sep`: If `None`, the default, the names will be left as is.  
&emsp;&emsp;&emsp;&emsp;The new outer names will come from the inner names  
&emsp;&emsp;&emsp;&emsp;If a string, the inner and outer names will be used together.  
&emsp;&emsp;&emsp;&emsp;The names of the new outer columns will be formed by pasting  
&emsp;&emsp;&emsp;&emsp;together the outer and the inner column names, separated by  
&emsp;&emsp;&emsp;&emsp;`names_sep`.  

&emsp;&emsp;`name_repair`: treatment of problematic column names:  
&emsp;&emsp;&emsp;&emsp;- "minimal": No name repair or checks, beyond basic existence,

&emsp;&emsp;&emsp;&emsp;- "unique": Make sure names are unique and not empty,

&emsp;&emsp;&emsp;&emsp;- "check_unique": (default value), no name repair,
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;but check they are unique,  

&emsp;&emsp;&emsp;&emsp;- "universal": Make the names unique and syntactic

&emsp;&emsp;&emsp;&emsp;- a function: apply custom name repair

&emsp;&emsp;`base0_`: Whether `cols` are 0-based  
&emsp;&emsp;&emsp;&emsp;if not provided, will use `datar.base.get_option('index.base.0')`  

##### Returns:
&emsp;&emsp;Data frame with given columns unpacked.  


In [2]:
# Example frame: three `x*` columns to be packed, plus a plain `y` column.
# As the output below shows, f[1:3] expands to 1, 2, 3 (the slice end is
# included).
df = tibble(x1 = f[1:3], x2 = f[4:6], x3 = f[7:9], y = f[1:3])
df

Unnamed: 0,x1,x2,x3,y
,<int64>,<int64>,<int64>,<int64>
0.0,1,4,7,1
1.0,2,5,8,2
2.0,3,6,9,3


In [3]:
# Collapse every column whose name starts with 'x' into a single df-column
# named `x`; `y` is not selected and stays a regular column.
df >> pack(x=starts_with('x'))

Unnamed: 0,y,x$x1,x$x2,x$x3
,<int64>,<int64>,<int64>,<int64>
0.0,1,1,4,7
1.0,2,2,5,8
2.0,3,3,6,9


In [4]:
# Several df-columns can be created in one call: x1..x3 are packed into `x`,
# and `y` is packed into its own one-column df-column `y` (shown as `y$y`).
df >> pack(x=c(f.x1, f.x2, f.x3), y=f.y)

Unnamed: 0,x$x1,x$x2,x$x3,y$y
,<int64>,<int64>,<int64>,<int64>
0.0,1,4,7,1
1.0,2,5,8,2
2.0,3,6,9,3


In [5]:
# Pack the Sepal* and Petal* measurements into two df-columns.
# With _names_sep="_", the inner names in the output are `Length`/`Width`
# rather than the full original names — presumably the source columns are
# `Sepal_Length`, `Sepal_Width`, ... and the "<outer>_" prefix is stripped;
# verify against the iris dataset.
iris >> pack(
    Sepal=starts_with("Sepal"),
    Petal=starts_with("Petal"),
    _names_sep="_"
)

Unnamed: 0,Species,Sepal$Length,Sepal$Width,Petal$Length,Petal$Width
,<object>,<float64>,<float64>,<float64>,<float64>
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
...,...,...,...,...,...
4,setosa,5.0,3.6,1.4,0.2
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0


In [6]:
# Unpacking ===========================================================

# Build a frame that already contains two df-columns (`y` and `z`) by passing
# tibbles as column values. Note that this rebinds `df` from the packing
# examples above.
# f[3:1] yields the descending values 3, 2, 1 (see `y$b` in the output).
# NOTE(review): runif(3) is called with no seed visible in this notebook, so
# the `z$Y` values presumably change on every run — confirm whether
# nb_helpers.py seeds the generator.
df = tibble(
  x = f[1:3],
  y = tibble(a = f[1:3], b = f[3:1]),
  z = tibble(X = c("a", "b", "c"), Y = runif(3), Z = c(TRUE, FALSE, NA))
)
df

Unnamed: 0,x,y$a,y$b,z$X,z$Y,z$Z
,<int64>,<int64>,<int64>,<object>,<float64>,<float64>
0.0,1,1,3,a,0.012264,1.0
1.0,2,2,2,b,0.538775,0.0
2.0,3,3,1,c,0.511123,


In [7]:
# Expand only the `y` df-column: its inner columns `a` and `b` become
# top-level columns, while `z` stays packed.
df >> unpack(f.y)

Unnamed: 0,x,a,b,z$X,z$Y,z$Z
,<int64>,<int64>,<int64>,<object>,<float64>,<float64>
0.0,1,1,3,a,0.012264,1.0
1.0,2,2,2,b,0.538775,0.0
2.0,3,3,1,c,0.511123,


In [8]:
# Expand both df-columns at once; the new top-level columns take the inner
# names (a, b, X, Y, Z).
df >> unpack(c(f.y, f.z))

Unnamed: 0,x,a,b,X,Y,Z
,<int64>,<int64>,<int64>,<object>,<float64>,<float64>
0.0,1,1,3,a,0.012264,1.0
1.0,2,2,2,b,0.538775,0.0
2.0,3,3,1,c,0.511123,


In [9]:
# With names_sep="_", each new column name is the former outer name joined to
# the inner name by the separator (y_a, y_b, z_X, z_Y, z_Z).
df >> unpack(c(f.y, f.z), names_sep="_")


Unnamed: 0,x,y_a,y_b,z_X,z_Y,z_Z
,<int64>,<int64>,<int64>,<object>,<float64>,<float64>
0.0,1,1,3,a,0.012264,1.0
1.0,2,2,2,b,0.538775,0.0
2.0,3,3,1,c,0.511123,


In [10]:
# try_catch is presumably a helper from nb_helpers.py that prints the
# exception instead of letting it abort the notebook — TODO confirm.
with try_catch():
    # Positional selection counts the inner columns of each df-column, so
    # positions 2 and 3 both fall inside `y`; selecting `y` twice raises the
    # ValueError shown below.
    df >> unpack(c(2,3))

[ValueError] `y` has already been selected. Number of packed columns also counts when selecting using indexes.


In [11]:
# Positions 2 and 4 fall in different df-columns (`y` and `z`) when inner
# columns are counted, so both are unpacked, matching unpack(c(f.y, f.z)).
df >> unpack(c(2,4))

Unnamed: 0,x,a,b,X,Y,Z
,<int64>,<int64>,<int64>,<object>,<float64>,<float64>
0.0,1,1,3,a,0.012264,1.0
1.0,2,2,2,b,0.538775,0.0
2.0,3,3,1,c,0.511123,
