In [1]:
# https://tidyr.tidyverse.org/reference/separate.html

from datar.all import *

from nb_helpers import nb_header
# Show the documentation of `separate` (rendered below) as the notebook header.
nb_header(separate)

### # separate  

##### Given either a regular expression or a vector of character positions,
turns a single character column into multiple columns.  

##### Args:
&emsp;&emsp;`_data`: The dataframe  
&emsp;&emsp;`col`: Column name or position.  
&emsp;&emsp;`into`: Names of new variables to create as character vector.  
&emsp;&emsp;&emsp;&emsp;Use None to omit the variable in the output.  

&emsp;&emsp;`sep`: Separator between columns.  
&emsp;&emsp;&emsp;&emsp;`TODO`: support index split (sep is an integer)  

&emsp;&emsp;`remove`: If TRUE, remove input column from output data frame.  
&emsp;&emsp;`convert`: The universal type for the extracted columns or a dict for  
&emsp;&emsp;&emsp;&emsp;individual ones  

&emsp;&emsp;`extra`: If sep is a character vector, this controls what happens when  
&emsp;&emsp;&emsp;&emsp;there are too many pieces. There are three valid options:  

&emsp;&emsp;&emsp;&emsp;- "warn" (the default): emit a warning and drop extra values.  

&emsp;&emsp;&emsp;&emsp;- "drop": drop any extra values without a warning.  

&emsp;&emsp;&emsp;&emsp;- "merge": only splits at most length(into) times  

&emsp;&emsp;`fill`: If sep is a character vector, this controls what happens when  
&emsp;&emsp;&emsp;&emsp;there are not enough pieces. There are three valid options:  

&emsp;&emsp;&emsp;&emsp;- "warn" (the default): emit a warning and fill from the right  

&emsp;&emsp;&emsp;&emsp;- "right": fill with missing values on the right  

&emsp;&emsp;&emsp;&emsp;- "left": fill with missing values on the left  

##### Returns:
&emsp;&emsp;Dataframe with separated columns.  


In [2]:
# Basic usage: split `x` with the default separator into columns A and B.
# The NA row stays missing in both new columns.
df = tibble(x=c(NA, "x.y", "x.z", "y.z"))
df >> separate(f.x, into=c("A", "B"))

Unnamed: 0,A,B
0,,
1,x,y
2,x,z
3,y,z


In [3]:
# An NA entry in `into` omits that piece from the output: only column B is kept.
df >> separate(f.x, into=c(NA, "B"))

Unnamed: 0,B
0,
1,y
2,z
3,z


In [4]:
# Rows with too few or too many pieces. With the defaults (extra="warn",
# fill="warn") surplus pieces are dropped and short rows are filled on the right.
df = tibble(x=c("x", "x y", "x y z", NA))
df >> separate(f.x, into=c("a", "b"))



Unnamed: 0,a,b
0,x,
1,x,y
2,x,y
3,,


In [5]:
# extra="drop": discard surplus pieces without a warning;
# fill="right": pad short rows with missing values on the right.
df >> separate(f.x, into=c("a", "b"), extra="drop", fill="right")

Unnamed: 0,a,b
0,x,
1,x,y
2,x,y
3,,


In [6]:
# extra="merge": the remainder stays in the last column ("y z");
# fill="left": short rows are padded with missing values on the left.
df >> separate(f.x, into=c("a", "b"), extra="merge", fill="left")

Unnamed: 0,a,b
0,,x
1,x,y
2,x,y z
3,,


In [7]:
# Three target columns: only "x y z" fills all of them; shorter rows are
# padded with missing values.
df >> separate(f.x, into=c("a", "b", "c"))



Unnamed: 0,a,b,c
0,x,,
1,x,y,
2,x,y,z
3,,,


In [8]:
# Split on ": " and merge the surplus, so only the first ": " separates the
# key from the value ("error: 7" stays intact).
df = tibble(x=c("x: 123", "y: error: 7"))
df >> separate(f.x, into=c("key", "value"), sep=": ", extra="merge")

Unnamed: 0,key,value
0,x,123
1,y,error: 7


In [9]:
# A regular-expression separator: split on any one of ".", "?" or ":".
df = tibble(x=c(NA, "x?y", "x.z", "y:z"))
df >> separate(f.x, into=c("A", "B"), sep=r"[.?:]")

Unnamed: 0,A,B
0,,
1,x,y
2,x,z
3,y,z


In [10]:
# Split "key:value" strings; a row without ":" or an NA row leaves `value` missing.
df = tibble(x=c("x:1", "x:2", "y:4", "z", NA))
df >> separate(f.x, into=c("key", "value"), sep=":")



Unnamed: 0,key,value
0,x,1.0
1,x,2.0
2,y,4.0
3,z,
4,,


In [11]:
# Per-column conversion: a `convert` dict casts the extracted `value` column to float.
df >> separate(f.x, into=c("key", "value"), sep=":", convert={"value": float})



Unnamed: 0,key,value
0,x,1.0
1,x,2.0
2,y,4.0
3,z,
4,,


In [12]:
# Check the dtype of the converted `value` column. The pipeline is re-evaluated
# explicitly instead of reading IPython's `_` (last output), which silently
# points at something else if cells are run out of order.
(df >> separate(f.x, c("key", "value"), ":", convert={"value": float})).value.dtype

dtype('float64')

In [13]:
# Input for `separate_rows`: `y` and `z` hold comma-delimited values that
# line up piece by piece within each row.
df = tibble(
    x=[1, 2, 3],
    y=c("a", "d,e,f", "g,h"),
    z=c("1", "2,3,4", "5,6"),
)

In [14]:
# Expand the delimited values in `y` and `z` into one row per piece (the other
# column `x` is repeated, see output) and convert `z` to int.
df >> separate_rows(f.y, f.z, convert={'z': int})

Unnamed: 0,x,y,z
0,1,a,1
1,2,d,2
1,2,e,3
1,2,f,4
2,3,g,5
2,3,h,6


In [15]:
# Check the dtype of the converted `z` column. The pipeline is re-evaluated
# explicitly instead of reading IPython's `_` (last output), which silently
# points at something else if cells are run out of order.
(df >> separate_rows(f.y, f.z, convert={"z": int})).z.dtype

dtype('int64')