In [1]:
from datar.datasets import iris, fish_encounters
from datar.all import *

# Run the shared notebook helper script. `nb_header` (called below) and
# `try_catch` (used in a later cell) are not imported anywhere else in this
# notebook, so they presumably come from nb_helpers.py -- verify.
# NOTE: keep comments off the magic line itself; trailing text on a `%run`
# line is passed to the script as arguments.
%run nb_helpers.py
# Render the documentation of the two verbs demonstrated in this notebook.
nb_header(chop, unchop)



### # chop  

##### Makes data frame shorter by converting rows within each group
into list-columns.  

##### Args:
&emsp;&emsp;`data`: A data frame  
&emsp;&emsp;`cols`: Columns to chop  

##### Returns:
&emsp;&emsp;Data frame with selected columns chopped  


### # unchop  

##### Makes df longer by expanding list-columns so that each element
of the list-column gets its own row in the output.  

See https://tidyr.tidyverse.org/reference/chop.html  

Recycling size-1 elements might be different from `tidyr`  
&emsp;&emsp;>>> df = tibble(x=[1, [2,3]], y=[[2,3], 1])  
&emsp;&emsp;>>> df >> unchop([f.x, f.y])  
&emsp;&emsp;>>> # tibble(x=[1,2,3], y=[2,3,1])  
&emsp;&emsp;>>> # instead of following in tidyr  
&emsp;&emsp;>>> # tibble(x=[1,1,2,3], y=[2,3,1,1])  

##### Args:
&emsp;&emsp;`data`: A data frame.  
&emsp;&emsp;`cols`: Columns to unchop.  
&emsp;&emsp;`keep_empty`: By default, you get one row of output for each element  
&emsp;&emsp;&emsp;&emsp;of the list you're unchopping/unnesting.  
&emsp;&emsp;&emsp;&emsp;This means that if there's a size-0 element  
&emsp;&emsp;&emsp;&emsp;(like NULL or an empty data frame), that entire row will be  
&emsp;&emsp;&emsp;&emsp;dropped from the output.  
&emsp;&emsp;&emsp;&emsp;If you want to preserve all rows, use `keep_empty` = `True` to  
&emsp;&emsp;&emsp;&emsp;replace size-0 elements with a single row of missing values.  

&emsp;&emsp;`dtypes`: Providing the dtypes for the output columns.  
&emsp;&emsp;&emsp;&emsp;Could be a single dtype, which will be applied to all columns, or  
&emsp;&emsp;&emsp;&emsp;a dictionary of dtypes with keys for the columns and values the  
&emsp;&emsp;&emsp;&emsp;dtypes.  
&emsp;&emsp;&emsp;&emsp;For nested data frames, we need to specify `col$a` as key. If `col`  
&emsp;&emsp;&emsp;&emsp;is used as key, all columns of the nested data frames will be cast  
&emsp;&emsp;&emsp;&emsp;into that dtype.  

##### Returns:
&emsp;&emsp;A data frame with selected columns unchopped.  


In [2]:
# Example frame: `x` is a key with repeated values (1, 1, 1, 2, 2, 3);
# `y` and `z` are the per-row value columns that will be collapsed.
df = tibble(x = c(1, 1, 1, 2, 2, 3), y = f[1:6:1], z = f[6:1:-1])
# For comparison with chop(): nest() packs `y` and `z` together into a single
# data-frame cell (`data`) for each distinct `x`.
df >> nest(data = c(f.y, f.z))


Unnamed: 0,x,data
,<int64>,<object>
0.0,1,<DF 3x2>
1.0,2,<DF 2x2>
2.0,3,<DF 1x2>


In [3]:
# chop() keeps one row per distinct `x` and turns `y` and `z` into separate
# list-columns (rather than one nested data frame as with nest()).
df >> chop(c(f.y, f.z))

Unnamed: 0,x,y,z
,<int64>,<object>,<object>
0.0,1,"[1, 2, 3]","[6, 5, 4]"
1.0,2,"[4, 5]","[3, 2]"
2.0,3,[6],[1]


In [5]:
# Unchop: every element of the list-column `y` gets its own output row.
# The first row holds an empty list; by default such size-0 elements are
# dropped, so no row for that `x` appears in the result.
df = tibble(x = f[1:5], y = [[], [1], [1,2], [1,2,3]])
df >> unchop(f.y)

Unnamed: 0,x,y
,<int64>,<object>
0.0,2,1.0
1.0,3,1.0
2.0,3,2.0
3.0,4,1.0
4.0,4,2.0
5.0,4,3.0


In [7]:
# `keep_empty=True` is documented to replace size-0 elements with a row of
# missing values; `dtypes=int` requests an integer `y` column.
# NOTE(review): the recorded output for this cell has no x=1 row, i.e. the
# empty element still appears to be dropped. Confirm whether that is expected
# when `keep_empty=True` is combined with `dtypes=int`, or whether the output
# is stale / the example should use a dtype that can hold missing values.
df >> unchop(f.y, keep_empty=True, dtypes=int)

Unnamed: 0,x,y
,<int64>,<int64>
0.0,2,1
1.0,3,1
2.0,3,2
3.0,4,1
4.0,4,2
5.0,4,3


In [8]:
# Mixed element types: a scalar string next to a list of ints. Without
# `dtypes`, unchop() keeps both and the resulting `y` column is object-typed.
# NOTE(review): the recorded output shows x == 1 on every row, so `f[1:2]`
# appears to yield the single value 1, which is then recycled across both
# input rows. The other examples use an end bound one greater than the row
# count (`f[1:5]` for 4 rows, `f[1:4]` for 3 rows), so `f[1:3]` may have been
# intended here -- confirm.
df = tibble(x = f[1:2], y = ["a", [1,2,3]])
df >> unchop(f.y)

Unnamed: 0,x,y
,<int64>,<object>
0.0,1,a
1.0,1,1
2.0,1,2
3.0,1,3


In [9]:
# Forcing `y` to int cannot convert the string "a". The error is the point of
# this cell: `try_catch()` is used so it is reported as a one-line message
# instead of aborting the notebook with a traceback.
with try_catch():
    df >> unchop(f.y, dtypes=int)

[ValueError] invalid literal for int() with base 10: 'a'


In [11]:
# List-column holding a NULL and two data frames with different columns.
# Unchopping spreads the nested columns out as `y$x` and `y$y`, filling with
# missing values where a frame lacks the column; the NULL entry is dropped
# by default.
df = tibble(x = f[1:4], y = [NULL, tibble(x = 1), tibble(y = f[1:3])])
df >> unchop(f.y)

Unnamed: 0,x,y$x,y$y
,<int64>,<float64>,<float64>
0.0,2,1.0,
1.0,3,,1.0
2.0,3,,2.0


In [12]:
# With `keep_empty=True` the row whose `y` is NULL is retained as a single
# row of missing values instead of being dropped.
df >> unchop(f.y, keep_empty=True)

Unnamed: 0,x,y$x,y$y
,<int64>,<float64>,<float64>
0.0,1,,
1.0,2,1.0,
2.0,3,,1.0
3.0,3,,2.0
