In [1]:
# https://tidyr.tidyverse.org/reference/expand.html

from datar.all import *

# Run the helper script inside this notebook's namespace; `nb_header` is
# presumably defined there -- TODO confirm against nb_helpers.py.
%run nb_helpers.py
# The output that follows looks like rendered documentation (summary, Args,
# Returns) for each of the verbs passed in.
nb_header(expand, nesting, crossing)

### # expand  

##### Generates all combinations of variables found in a dataset.

##### Args:
&emsp;&emsp;`data`: A data frame  
&emsp;&emsp;`*args`: and,  
&emsp;&emsp;`**kwargs`: columns to expand. Columns can be atomic lists.  
&emsp;&emsp;&emsp;&emsp;- To find all unique combinations of x, y and z, including
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;those not present in the data, supply each variable as a  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;separate argument: `expand(df, x, y, z)`.  

&emsp;&emsp;&emsp;&emsp;- To find only the combinations that occur in the data, use
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;`nesting`: `expand(df, nesting(x, y, z))`.  

&emsp;&emsp;&emsp;&emsp;- You can combine the two forms. For example,
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;`expand(df, nesting(school_id, student_id), date)` would  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;produce a row for each present school-student combination  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;for all possible dates.  

&emsp;&emsp;`_name_repair`: treatment of problematic column names:  
&emsp;&emsp;&emsp;&emsp;- "minimal": No name repair or checks, beyond basic existence,

&emsp;&emsp;&emsp;&emsp;- "unique": Make sure names are unique and not empty,

&emsp;&emsp;&emsp;&emsp;- "check_unique": (default value), no name repair,
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;but check they are unique,  

&emsp;&emsp;&emsp;&emsp;- "universal": Make the names unique and syntactic

&emsp;&emsp;&emsp;&emsp;- a function: apply custom name repair

&emsp;&emsp;`_base0`: Whether the suffixes of repaired names should be 0-based.  
&emsp;&emsp;&emsp;&emsp;If not provided, will use `datar.base.get_option('index.base.0')`.  

##### Returns:
&emsp;&emsp;A data frame with all combinations of variables.  


### # nesting  

##### A helper that only finds combinations already present in the data.

##### Args:
&emsp;&emsp;`*args`: and,  
&emsp;&emsp;`**kwargs`: columns to expand. Columns can be atomic lists.  
&emsp;&emsp;&emsp;&emsp;- To find all unique combinations of x, y and z, including
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;those not present in the data, supply each variable as a  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;separate argument: `expand(df, x, y, z)`.  

&emsp;&emsp;&emsp;&emsp;- To find only the combinations that occur in the data, use
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;`nesting`: `expand(df, nesting(x, y, z))`.  

&emsp;&emsp;&emsp;&emsp;- You can combine the two forms. For example,
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;`expand(df, nesting(school_id, student_id), date)` would  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;produce a row for each present school-student combination  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;for all possible dates.  

&emsp;&emsp;`_name_repair`: treatment of problematic column names:  
&emsp;&emsp;&emsp;&emsp;- "minimal": No name repair or checks, beyond basic existence,

&emsp;&emsp;&emsp;&emsp;- "unique": Make sure names are unique and not empty,

&emsp;&emsp;&emsp;&emsp;- "check_unique": (default value), no name repair,
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;but check they are unique,  

&emsp;&emsp;&emsp;&emsp;- "universal": Make the names unique and syntactic

&emsp;&emsp;&emsp;&emsp;- a function: apply custom name repair

&emsp;&emsp;`_base0`: Whether the suffixes of repaired names should be 0-based.  
&emsp;&emsp;&emsp;&emsp;If not provided, will use `datar.base.get_option('index.base.0')`.  

##### Returns:
&emsp;&emsp;A data frame with all combinations in data.  


### # crossing  

##### A wrapper around `expand_grid()` that de-duplicates and sorts its inputs

When values are not specified by literal `list`, they will be sorted.  

##### Args:
&emsp;&emsp;`*args`: and,  
&emsp;&emsp;`**kwargs`: columns to expand. Columns can be atomic lists.  
&emsp;&emsp;&emsp;&emsp;- To find all unique combinations of x, y and z, including
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;those not present in the data, supply each variable as a  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;separate argument: `expand(df, x, y, z)`.  

&emsp;&emsp;&emsp;&emsp;- To find only the combinations that occur in the data, use
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;`nesting`: `expand(df, nesting(x, y, z))`.  

&emsp;&emsp;&emsp;&emsp;- You can combine the two forms. For example,
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;`expand(df, nesting(school_id, student_id), date)` would  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;produce a row for each present school-student combination  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;for all possible dates.  

&emsp;&emsp;`_name_repair`: treatment of problematic column names:  
&emsp;&emsp;&emsp;&emsp;- "minimal": No name repair or checks, beyond basic existence,

&emsp;&emsp;&emsp;&emsp;- "unique": Make sure names are unique and not empty,

&emsp;&emsp;&emsp;&emsp;- "check_unique": (default value), no name repair,
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;but check they are unique,  

&emsp;&emsp;&emsp;&emsp;- "universal": Make the names unique and syntactic

&emsp;&emsp;&emsp;&emsp;- a function: apply custom name repair

&emsp;&emsp;`_base0`: Whether the suffixes of repaired names should be 0-based.  
&emsp;&emsp;&emsp;&emsp;If not provided, will use `datar.base.get_option('index.base.0')`.  

##### Returns:
&emsp;&emsp;A data frame with values deduplicated and sorted.  


In [2]:

# Toy dataset used by every example in this notebook. `size` is a factor that
# declares the level "L" even though no row uses it, which is what lets the
# expand() examples show combinations that are absent from the data.
# NOTE(review): `weights` is drawn with rnorm(6) and no seed is set in the
# visible cells, so the values shown presumably differ on each run.
fruits = tibble(
    type=c("apple", "orange", "apple", "orange", "orange", "orange"),
    year=c(2010, 2010, 2012, 2010, 2010, 2012),
    size=factor(
        c("XS", "S", "M", "S", "S", "M"),
        levels=c("XS", "S", "M", "L"),
    ),
    weights=rnorm(6),
)
fruits

Unnamed: 0,type,year,size,weights
0,apple,2010,XS,0.705952
1,orange,2010,S,-1.150749
2,apple,2012,M,-1.558679
3,orange,2010,S,0.171499
4,orange,2010,S,-1.965301
5,orange,2012,M,0.21212


In [3]:
# Expanding a single column yields its distinct values, one row each.
fruits >> expand(f.type)

Unnamed: 0,type
0,apple
1,orange


In [4]:
# Cross two columns: every type x size pair is produced, including the factor
# level "L", which is declared in `levels` but never appears in a row.
fruits >> expand(f.type, f.size) 

Unnamed: 0,type,size
0,apple,XS
1,apple,S
2,apple,M
3,apple,L
4,orange,XS
5,orange,S
6,orange,M
7,orange,L


In [5]:
# Cross three columns: each type/size pair is combined with every year that
# occurs in the data (2010 and 2012).
fruits >> expand(f.type, f.size, f.year)

Unnamed: 0,type,size,year
0,apple,XS,2010
1,apple,XS,2012
2,apple,S,2010
3,apple,S,2012
4,apple,M,2010
5,apple,M,2012
6,apple,L,2010
7,apple,L,2012
8,orange,XS,2010
9,orange,XS,2012


In [6]:
# nesting() keeps only combinations that actually occur in the data; with a
# single column the result is simply its distinct values.
fruits >> expand(nesting(f.type))

Unnamed: 0,type
0,apple
1,orange


In [7]:
# Only the type/size pairs present in `fruits` are returned (four rows here),
# so the unused level "L" does not show up.
fruits >> expand(nesting(f.type, f.size))

Unnamed: 0,type,size
0,apple,XS
1,orange,S
2,apple,M
3,orange,M


In [8]:
# Same idea with three columns: only the observed type/size/year triples.
fruits >> expand(nesting(f.type, f.size, f.year))

Unnamed: 0,type,size,year
0,apple,XS,2010
1,orange,S,2010
2,apple,M,2012
3,orange,M,2012


In [9]:
# full_seq(f.year, 1) fills the gap between observed years, so 2011 appears
# even though it is not in the data. The expression is passed positionally
# without a name, and the resulting column is labelled `_Var2` in the output.
fruits >> expand(f.type, f.size, full_seq(f.year, 1))

Unnamed: 0,type,size,_Var2
0,apple,XS,2010
1,apple,XS,2011
2,apple,XS,2012
3,apple,S,2010
4,apple,S,2011
5,apple,S,2012
6,apple,M,2010
7,apple,M,2011
8,apple,M,2012
9,apple,L,2010


In [10]:
# An explicit sequence works the same way as full_seq() above; being unnamed,
# it also ends up in a column labelled `_Var2`.
fruits >> expand(f.type, f.size, seq(2010, 2012))

Unnamed: 0,type,size,_Var2
0,apple,XS,2010
1,apple,XS,2011
2,apple,XS,2012
3,apple,S,2010
4,apple,S,2011
5,apple,S,2012
6,apple,M,2010
7,apple,M,2011
8,apple,M,2012
9,apple,L,2010


In [11]:
# Passing the sequence as a keyword argument names the resulting column
# (`year`) instead of leaving it with the auto-generated `_Var2` label.
fruits >> expand(f.type, f.size, year=seq(2010, 2012))

Unnamed: 0,type,size,year
0,apple,XS,2010
1,apple,XS,2011
2,apple,XS,2012
3,apple,S,2010
4,apple,S,2011
5,apple,S,2012
6,apple,M,2010
7,apple,M,2011
8,apple,M,2012
9,apple,L,2010


In [12]:
# Keep the full type x size x year grid in a variable so it can be joined
# against `fruits` in the following cells.
# NOTE(review): the name `all` shadows Python's builtin `all()` for the rest of
# the session. Renaming it (e.g. to `all_combos`) would require updating every
# later cell that references it in the same change.
all = fruits >> expand(f.type, f.size, f.year)
all

Unnamed: 0,type,size,year
0,apple,XS,2010
1,apple,XS,2012
2,apple,S,2010
3,apple,S,2012
4,apple,M,2010
5,apple,M,2012
6,apple,L,2010
7,apple,L,2012
8,orange,XS,2010
9,orange,XS,2012


In [13]:
# Rows of the full grid with no match in `fruits`, i.e. the combinations that
# are missing from the data. No `by` is given, so the join presumably uses the
# columns the two frames share -- TODO confirm.
all >> anti_join(fruits)

Unnamed: 0,type,size,year
1,apple,XS,2012
2,apple,S,2010
3,apple,S,2012
4,apple,M,2010
6,apple,L,2010
7,apple,L,2012
8,orange,XS,2010
9,orange,XS,2012
13,orange,S,2012
14,orange,M,2010


In [14]:
# Keep every combination from the full grid and attach the `fruits` columns;
# `weights` is left empty for combinations that do not occur in the data.
fruits >> right_join(all)

Unnamed: 0,type,year,size,weights
0,apple,2010,XS,0.705952
1,apple,2012,XS,
2,apple,2010,S,
3,apple,2012,S,
4,apple,2010,M,
5,apple,2012,M,-1.558679
6,apple,2010,L,
7,apple,2012,L,
8,orange,2010,XS,
9,orange,2012,XS,
