In [1]:
# Upstream reference page for the verb demonstrated in this notebook:
# https://tidyr.tidyverse.org/reference/extract.html
# Execute the helper script in this kernel; presumably it is what defines
# `nb_header` used below — TODO confirm against nb_helpers.py.
%run nb_helpers.py

# Star import: the cells below use `tibble`, `c`, `NA`, `f` and `extract`
# without any other import, so they are presumably all provided here.
from datar.all import *

# Render the documentation header for `extract` (shown as this cell's output).
nb_header(extract)

### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ extract</div>

##### Given a regular expression with capturing groups, extract() turns each
group into a new column. If the groups don't match, or the input is NA,  
the output will be NA.  

See https://tidyr.tidyverse.org/reference/extract.html  

##### Args:
&emsp;&emsp;`data`: The dataframe  
&emsp;&emsp;`col`: Column name or position.  
&emsp;&emsp;`into`: Names of new variables to create as character vector.  
&emsp;&emsp;&emsp;&emsp;Use None to omit the variable in the output.  

&emsp;&emsp;`regex`: a regular expression used to extract the desired values.  
&emsp;&emsp;&emsp;&emsp;There should be one group (defined by ()) for each element of into.  

&emsp;&emsp;`remove`: If TRUE, remove input column from output data frame.  
&emsp;&emsp;`convert`: The universal type for the extracted columns or a dict for  
&emsp;&emsp;&emsp;&emsp;individual ones  

##### Returns:
&emsp;&emsp;Dataframe with extracted columns.  


In [2]:
# Sample frame: a single string column `x` whose first entry is missing.
df = tibble(
    x=c(NA, "a-b", "a-d", "b-c", "d-e"),
)
# No `regex` is passed, so extract() falls back to its default pattern; per the
# output it captures the leading token before the dash into column `A`.
df >> extract(f.x, into="A")

Unnamed: 0,A
,<object>
0.0,
1.0,a
2.0,a
3.0,b
4.0,d


In [3]:
# Two capturing groups -> two new columns, one for each side of the dash.
df >> extract(
    f.x,
    into=c("A", "B"),
    regex=r"(\w+)-(\w+)",
)

Unnamed: 0,A,B
,<object>,<object>
0.0,,
1.0,a,b
2.0,a,d
3.0,b,c
4.0,d,e


In [4]:
# Both groups only accept the letters a-d, so "d-e" fails to match and its
# row comes back missing in both columns (see the last output row).
df >> extract(
    f.x,
    into=c("A", "B"),
    regex=r"([a-d]+)-([a-d]+)",
)

Unnamed: 0,A,B
,<object>,<object>
0.0,,
1.0,a,b
2.0,a,d
3.0,b,c
4.0,,


In [5]:
# Combine multiple groups into one column: names repeated in `into` are
# merged, so groups 1 and 3 land in `a` and groups 2 and 4 in `b`
# (the output shows "ac" and "bd").
# NOTE: this rebinds `df`; re-run the cell that builds the dash-separated
# frame before re-running the earlier examples.
df = tibble(x="abcd")
df >> extract(
    f.x,
    into=["a", "b", "a", "b"],
    regex=r"(.)(.)(.)(.)",
)

Unnamed: 0,a,b
,<object>,<object>
0.0,ac,bd
