In [1]:
# https://tidyr.tidyverse.org/reference/separate.html

from datar.all import *

# Run the helper script with IPython's %run magic so the names it defines
# become available in this notebook.
%run nb_helpers.py
# NOTE(review): nb_header is presumably defined in nb_helpers.py -- confirm.
# It is called with the two verbs documented in this notebook.
nb_header(separate, separate_rows)



### # separate  

##### Given either a regular expression or a vector of character positions, turns a single character column into multiple columns.  

##### Args:
&emsp;&emsp;`data`: The dataframe  
&emsp;&emsp;`col`: Column name or position.  
&emsp;&emsp;`into`: Names of new variables to create as character vector.  
&emsp;&emsp;&emsp;&emsp;Use `None`/`NA`/`NULL` to omit the variable in the output.  

&emsp;&emsp;`sep`: Separator between columns.  
&emsp;&emsp;&emsp;&emsp;If str, `sep` is interpreted as a regular expression.  
&emsp;&emsp;&emsp;&emsp;The default value is a regular expression that matches  
&emsp;&emsp;&emsp;&emsp;any sequence of non-alphanumeric values.  
&emsp;&emsp;&emsp;&emsp;If int, `sep` is interpreted as character positions to split at.  

&emsp;&emsp;`remove`: If TRUE, remove input column from output data frame.  
&emsp;&emsp;`convert`: The universal type for the extracted columns or a dict for  
&emsp;&emsp;&emsp;&emsp;individual ones  
&emsp;&emsp;&emsp;&emsp;Note that when given `TRUE`, `DataFrame.convert_dtypes()` is called,  
&emsp;&emsp;&emsp;&emsp;but it will not convert `str` to other types  
&emsp;&emsp;&emsp;&emsp;(For example, `'1'` to `1`). You have to specify the dtype yourself.  

&emsp;&emsp;`extra`: If sep is a character vector, this controls what happens when  
&emsp;&emsp;&emsp;&emsp;there are too many pieces. There are three valid options:  

&emsp;&emsp;&emsp;&emsp;- "warn" (the default): emit a warning and drop extra values.

&emsp;&emsp;&emsp;&emsp;- "drop": drop any extra values without a warning.

&emsp;&emsp;&emsp;&emsp;- "merge": only splits at most length(into) times

&emsp;&emsp;`fill`: If sep is a character vector, this controls what happens when  
&emsp;&emsp;&emsp;&emsp;there are not enough pieces. There are three valid options:  

&emsp;&emsp;&emsp;&emsp;- "warn" (the default): emit a warning and fill from the right

&emsp;&emsp;&emsp;&emsp;- "right": fill with missing values on the right

&emsp;&emsp;&emsp;&emsp;- "left": fill with missing values on the left

##### Returns:
&emsp;&emsp;Dataframe with separated columns.  


### # separate_rows  

##### Separates the values and places each one in its own row.

##### Args:
&emsp;&emsp;`data`: The dataframe  
&emsp;&emsp;`*columns`: The columns to separate on  
&emsp;&emsp;`sep`: Separator delimiting the collapsed values within each cell.  
&emsp;&emsp;`convert`: The universal type for the extracted columns or a dict for  
&emsp;&emsp;&emsp;&emsp;individual ones  

##### Returns:
&emsp;&emsp;Dataframe with rows separated and repeated.  


In [2]:
# Default separator: any run of non-alphanumeric characters, so "x.y" is split
# into A="x" and B="y". The NA row stays missing in both new columns.
df = tibble(x=c(NA, "x.y", "x.z", "y.z"))
df >> separate(f.x, into=c("A", "B"))

Unnamed: 0,A,B
,<object>,<object>
0.0,,
1.0,x,y
2.0,x,z
3.0,y,z


In [3]:
# An NA entry in `into` omits that piece from the output, so only B is kept.
df >> separate(f.x, into=c(NA, "B"))

Unnamed: 0,B
,<object>
0.0,
1.0,y
2.0,z
3.0,z


In [4]:
# Rows with a varying number of pieces. With the default extra="warn" and
# fill="warn", the surplus "z" is dropped and short rows are filled from the
# right with missing values.
df = tibble(x=c("x", "x y", "x y z", NA))
df >> separate(f.x, into=c("a", "b"))



Unnamed: 0,a,b
,<object>,<object>
0.0,x,
1.0,x,y
2.0,x,y
3.0,,


In [5]:
# extra="drop" discards surplus pieces without a warning; fill="right" pads
# short rows with missing values on the right.
df >> separate(f.x, into=c("a", "b"), extra="drop", fill="right")

Unnamed: 0,a,b
,<object>,<object>
0.0,x,
1.0,x,y
2.0,x,y
3.0,,


In [6]:
# extra="merge" keeps the remainder in the last column ("x y z" -> a="x",
# b="y z"); fill="left" pads short rows on the left ("x" -> a missing, b="x").
df >> separate(f.x, into=c("a", "b"), extra="merge", fill="left")

Unnamed: 0,a,b
,<object>,<object>
0.0,,x
1.0,x,y
2.0,x,y z
3.0,,


In [7]:
# Asking for three columns keeps every piece of "x y z"; rows with fewer
# pieces are filled with missing values on the right.
df >> separate(f.x, into=c("a", "b", "c"))



Unnamed: 0,a,b,c
,<object>,<object>,<object>
0.0,x,,
1.0,x,y,
2.0,x,y,z
3.0,,,


In [8]:
# Split on the first ": " only: extra="merge" leaves later separators inside
# the value, so "y: error: 7" becomes key="y", value="error: 7".
df = tibble(x=c("x: 123", "y: error: 7"))
df >> separate(f.x, into=c("key", "value"), sep=": ", extra="merge")

Unnamed: 0,key,value
,<object>,<object>
0.0,x,123
1.0,y,error: 7


In [9]:
# A str `sep` is interpreted as a regular expression: the character class
# below splits on any one of ".", "?" or ":".
df = tibble(x=c(NA, "x?y", "x.z", "y:z"))
df >> separate(f.x, into=c("A", "B"), sep=r"[.?:]")

Unnamed: 0,A,B
,<object>,<object>
0.0,,
1.0,x,y
2.0,x,z
3.0,y,z


In [10]:
# Key/value pairs split on ":". The "z" row has no separator, so its value is
# left missing; the NA row is missing in both columns.
df = tibble(x=c("x:1", "x:2", "y:4", "z", NA))
df >> separate(f.x, into=c("key", "value"), sep=":")



Unnamed: 0,key,value
,<object>,<object>
0.0,x,1
1.0,x,2
2.0,y,4
3.0,z,
4.0,,


In [11]:
# Convert the extracted `value` column to float via a per-column `convert` dict.
# The result is bound to a name instead of being read back through IPython's
# `_` output cache: `_` only refers to this frame when every expression in the
# cell is displayed (a non-default interactivity setting); otherwise it is the
# previous cell's output and the dtypes shown would be for the wrong frame.
separated = df >> separate(f.x, c("key","value"), ":", convert={'value': float})
display(separated)  # show the frame explicitly, whatever the interactivity mode
separated.dtypes



Unnamed: 0,key,value
,<object>,<float64>
0.0,x,1.0
1.0,x,2.0
2.0,y,4.0
3.0,z,
4.0,,


key       object
value    float64
dtype: object

In [12]:
# Sample frame for separate_rows: y and z hold comma-delimited values, with the
# same number of pieces in both columns on every row.
df = tibble(
    x=[1, 2, 3],
    y=c("a", "d,e,f", "g,h"),
    z=c("1", "2,3,4", "5,6"),
)

In [13]:
# Put each value of y and z in its own row (x is repeated for the new rows)
# and cast the separated z values to int.
df >> separate_rows(f.y, f.z, convert={"z": int})

Unnamed: 0,x,y,z
,<int64>,<object>,<int64>
0.0,1,a,1
1.0,2,d,2
2.0,2,e,3
3.0,2,f,4
4.0,3,g,5
5.0,3,h,6
