In [1]:
from datar.datasets import iris, fish_encounters, mtcars
from datar.all import *

# Execute the helper script; presumably this is what defines `nb_header`
# (it is not imported explicitly above) — TODO confirm what else it configures.
%run nb_helpers.py
# Show the API documentation for `nest` and `unnest` (rendered right below).
nb_header(nest, unnest)

### # nest  

##### Nesting creates a list-column of data frames

##### Args:
&emsp;&emsp;`_data`: A data frame  
&emsp;&emsp;`**cols`: Columns to nest  
&emsp;&emsp;`_names_sep`: If `None`, the default, the names will be left as is.  
&emsp;&emsp;&emsp;&emsp;Inner names will come from the former outer names.  
&emsp;&emsp;&emsp;&emsp;If a string, the inner and outer names will be used together.  
&emsp;&emsp;&emsp;&emsp;The new inner names will have the outer name and `_names_sep`  
&emsp;&emsp;&emsp;&emsp;automatically stripped.  

&emsp;&emsp;`_base0`: Whether `**cols` are 0-based  
&emsp;&emsp;&emsp;&emsp;if not provided, will use `datar.base.get_option('index.base.0')`  

##### Returns:
&emsp;&emsp;Nested data frame.  


### # unnest  

##### Flattens list-column of data frames back out into regular columns.

##### Args:
&emsp;&emsp;`data`: A data frame to flatten.  
&emsp;&emsp;`*cols`: Columns to unnest.  
&emsp;&emsp;`keep_empty`: By default, you get one row of output for each element  
&emsp;&emsp;&emsp;&emsp;of the list you're unchopping/unnesting.  
&emsp;&emsp;&emsp;&emsp;This means that if there's a size-0 element  
&emsp;&emsp;&emsp;&emsp;(like NULL or an empty data frame), that entire row will be  
&emsp;&emsp;&emsp;&emsp;dropped from the output.  
&emsp;&emsp;&emsp;&emsp;If you want to preserve all rows, use `keep_empty` = `True` to  
&emsp;&emsp;&emsp;&emsp;replace size-0 elements with a single row of missing values.  

&emsp;&emsp;`dtypes`: NOT `ptype`. Providing the dtypes for the output columns.  
&emsp;&emsp;&emsp;&emsp;Could be a single dtype, which will be applied to all columns, or  
&emsp;&emsp;&emsp;&emsp;a dictionary of dtypes with keys for the columns and values the  
&emsp;&emsp;&emsp;&emsp;dtypes.  

&emsp;&emsp;`names_sep`: If `None`, the default, the names will be left as is.  
&emsp;&emsp;&emsp;&emsp;The new outer names will come from the inner names.  
&emsp;&emsp;&emsp;&emsp;If a string, the inner and outer names will be used together.  
&emsp;&emsp;&emsp;&emsp;The names of the new outer columns will be formed by pasting  
&emsp;&emsp;&emsp;&emsp;together the outer and the inner column names, separated by  
&emsp;&emsp;&emsp;&emsp;`names_sep`.  

&emsp;&emsp;`names_repair`: treatment of problematic column names:  
&emsp;&emsp;&emsp;&emsp;- "minimal": No name repair or checks, beyond basic existence,

&emsp;&emsp;&emsp;&emsp;- "unique": Make sure names are unique and not empty,

&emsp;&emsp;&emsp;&emsp;- "check_unique": (default value), no name repair,
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;but check they are unique,  

&emsp;&emsp;&emsp;&emsp;- "universal": Make the names unique and syntactic

&emsp;&emsp;&emsp;&emsp;- a function: apply custom name repair

&emsp;&emsp;`_base0`: Whether `cols` are 0-based  
&emsp;&emsp;&emsp;&emsp;if not provided, will use `datar.base.get_option('index.base.0')`  

##### Returns:
&emsp;&emsp;Data frame with selected columns unnested.  


In [2]:
# Example frame: x is the key to nest by; per the output below, y runs 1..6
# and z runs 6..1.
df = tibble(x = c(1, 1, 1, 2, 2, 3), y = f[1:6], z = f[6:1])
# Pack y and z into a single list-column `data`, leaving one row per distinct x.
df >> nest(data=c(f.y, f.z))

Unnamed: 0,x,data
0,1,y z 0 1 6 1 2 5 2 3 4
1,2,y z 3 4 3 4 5 2
2,3,y z 5 6 1


In [3]:
# For comparison: `chop` keeps y and z as separate columns, each collapsed into
# one list per x, instead of packing them into a nested data frame.
df >> chop(c(f.y, f.z))

Unnamed: 0,x,y,z
0,1,"[1, 2, 3]","[6, 5, 4]"
1,2,"[4, 5]","[3, 2]"
2,3,[6],[1]


In [4]:
# Same nesting of y and z, with the columns selected through `any_of`; the
# output matches the plain `c(f.y, f.z)` selection.
df >> nest(data=any_of(c(f.y, f.z)))

Unnamed: 0,x,data
0,1,y z 0 1 6 1 2 5 2 3 4
1,2,y z 3 4 3 4 5 2
2,3,y z 5 6 1


In [5]:
# Nest every column except Species into `data`, one row per species.
# The result is bound to a name instead of being read back through IPython's
# `_` last-output variable, so the lookup below does not depend on
# display-hook state and keeps working if `_` is ever rebound.
iris_nested = iris >> nest(data=~f.Species)
iris_nested
# Inspect the first nested data frame (the first Species row, setosa).
iris_nested.data[0]

Unnamed: 0,Species,data
0,setosa,Sepal_Length Sepal_Width Petal_Length P...
1,versicolor,Sepal_Length Sepal_Width Petal_Length P...
2,virginica,Sepal_Length Sepal_Width Petal_Length ...


Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [6]:
# Take the first four column names of iris (per the output, everything except
# Species) and nest those columns, selecting them by name via `any_of`.
nest_vars = names(iris)[:4]
iris >> nest(data = any_of(nest_vars))

Unnamed: 0,Species,data
0,setosa,Sepal_Length Sepal_Width Petal_Length P...
1,versicolor,Sepal_Length Sepal_Width Petal_Length P...
2,virginica,Sepal_Length Sepal_Width Petal_Length ...


In [7]:
# Several list-columns can be created in one call: `petal` collects the columns
# starting with "Petal", `sepal` those starting with "Sepal".
iris >> nest(petal = starts_with("Petal"), sepal = starts_with("Sepal"))

Unnamed: 0,Species,petal,sepal
0,setosa,Petal_Length Petal_Width 0 1.4...,Sepal_Length Sepal_Width 0 5.1...
1,versicolor,Petal_Length Petal_Width 50 4.7...,Sepal_Length Sepal_Width 50 7.0...
2,virginica,Petal_Length Petal_Width 100 6...,Sepal_Length Sepal_Width 100 6...


In [8]:
# Same idea, but grouping the measurements the other way: `width` collects the
# columns containing "Width", `length` those containing "Length".
iris >> nest(width = contains("Width"), length = contains("Length"))

Unnamed: 0,Species,width,length
0,setosa,Sepal_Width Petal_Width 0 3.5 ...,Sepal_Length Petal_Length 0 5....
1,versicolor,Sepal_Width Petal_Width 50 3.2 ...,Sepal_Length Petal_Length 50 7....
2,virginica,Sepal_Width Petal_Width 100 3.3...,Sepal_Length Petal_Length 100 ...


In [9]:
# On a grouped frame `nest()` is called without a column spec: the grouping
# column `fish` is kept and the other columns (station, seen) end up in `data`.
fish_encounters >> group_by(f.fish) >> nest()

Unnamed: 0,fish,data
0,4842,station seen 0 Release 1 1 I80_...
1,4843,station seen 11 Release 1 12 I80_...
2,4844,station seen 22 Release 1 23 I80_...
3,4845,station seen 33 Release 1 34 I80_...
4,4847,station seen 38 Release 1 39 I80_...
5,4848,station seen 41 Release 1 42 I80_...
6,4849,station seen 45 Release 1 46 I80_...
7,4850,station seen 47 Release 1 48 I80_...
8,4851,station seen 53 Release 1 54 I80_...
9,4854,station seen 55 Release 1 56 I80_...


In [10]:
from pipda import register_func
@register_func(None)
def get_models(dfs):
    """Describe every nested data frame as a ``<df ROWSxCOLS>`` string.

    Placeholder for whatever per-group processing is actually needed.

    Args:
        dfs: Iterable of objects exposing ``.shape`` (in this notebook, the
            nested ``data`` list-column).

    Returns:
        A list with one string per element of ``dfs``, in the same order.
    """
    # do whatever with the dfs
    labels = []
    for df in dfs:
        labels.append(f"<df {df.shape[0]}x{df.shape[1]}>")
    return labels

# Nest mtcars per `cyl`, then add a `models` column computed by handing the
# nested `data` list-column to `get_models`.
mtcars >> group_by(f.cyl) >> nest() >> mutate(
    models=get_models(f.data)
)

Unnamed: 0,cyl,data,models
0,6,mpg disp hp drat wt qsec vs ...,<df 7x10>
1,4,mpg disp hp drat wt qsec vs ...,<df 11x10>
2,8,mpg disp hp drat wt qsec vs ...,<df 14x10>


In [11]:
# y is a list-column mixing NULL, a one-row tibble and a three-row tibble.
df = tibble(
  x = f[1:3],
  y = [
    NULL,
    tibble(a = 1, b = 2),
    tibble(a = f[1:3], b = f[3:1])
  ]
)
# Unnest y into columns a and b, requesting int dtype for the output columns.
# As the output shows, the row whose y is NULL (x=1) is dropped.
df >> unnest(f.y, dtypes=int)

Unnamed: 0,x,a,b
0,2,1,2
1,3,1,3
2,3,2,2
3,3,3,1


In [12]:
# `keep_empty=True` retains the x=1 row, with a and b shown as missing.
# NOTE(review): a and b display as floats here, presumably because of the
# missing values — confirm.
df >> unnest(f.y, keep_empty=True)

Unnamed: 0,x,a,b
0,1,,
1,2,1.0,2.0
2,3,1.0,3.0
3,3,2.0,2.0
4,3,3.0,1.0


In [13]:
# a and b are list-columns whose elements line up row by row (two items in the
# first row, one in the second); c is a plain column.
df = tibble(
 a = [c("a", "b"), "c"],
 b = [[1,2], 3],
 c = c(11, 22)
)
# Unnesting a and b in a single call expands them in parallel (3 rows).
df >> unnest(c(f.a, f.b))

Unnamed: 0,a,b,c
0,a,1,11
1,b,2,11
2,c,3,22


In [14]:
# Unnesting one column at a time instead yields every a/b combination within
# each original row (5 rows in the output rather than 3).
df >> unnest(f.a) >> unnest(f.b)

Unnamed: 0,a,b,c
0,a,1,11
1,a,2,11
2,b,1,11
3,b,2,11
4,c,3,22
