In [45]:
import numpy
from datar.all import *
from datar.datasets import gss_cat

# Execute the helper script in this notebook's namespace. `nb_header` (used
# right below) and `try_catch` (used later) are not defined in the visible
# cells, so they presumably come from here — confirm in nb_helpers.py.
%run nb_helpers.py

# Emit the header for this notebook; the output that follows shows one
# reference section per function passed here, in the same order.
# NOTE(review): the `book` value reads like the name of a different notebook
# ("lvl_addrm") than the functions listed — confirm it is intended.
nb_header(
    as_factor,
    fct_count,
    fct_match,
    fct_unique,
    lvls_reorder,
    lvls_revalue,
    lvls_expand,
    lvls_union,
    book="forcat_lvl_addrm",
)


### # as_factor  

##### Convert an iterable into a pandas.Categorical object

##### Args:
&emsp;&emsp;`x`: The iterable  

##### Returns:
&emsp;&emsp;The converted categorical object  


### # fct_count  

##### Count entries in a factor

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`sort`: If True, sort the result so that the most common values float to  
&emsp;&emsp;&emsp;&emsp;the top  

&emsp;&emsp;`prop`: If True, compute the fraction of marginal table.  

##### Returns:
&emsp;&emsp;A data frame with columns `f` and `n`, plus `p` when `prop` is True  


### # fct_match  

##### Test for presence of levels in a factor

Do any of `lvls` occur in `_f`?  

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`lvls`: A vector specifying levels to look for.  

##### Returns:
&emsp;&emsp;A logical (boolean) vector  


### # fct_unique  

##### Unique values of a factor

##### Args:
&emsp;&emsp;`_f`: A factor  

##### Returns:
&emsp;&emsp;The factor with the unique values in `_f`  


### # lvls_reorder  

##### Leaves values of a factor as they are, but changes the order by given indices  

##### Args:
&emsp;&emsp;`f`: A factor (or character vector).  
&emsp;&emsp;`idx`: An integer index, with one integer for each existing level.  
&emsp;&emsp;`new_levels`: A character vector of new levels.  
&emsp;&emsp;`ordered`: A logical which determines the "ordered" status of the  
&emsp;&emsp;&emsp;&emsp;output factor. `None` preserves the existing status of the factor.  

##### Returns:
&emsp;&emsp;The factor with levels reordered  


### # lvls_revalue  

##### Changes the values of existing levels; there must be one new level for each old level  

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`new_levels`: A character vector of new levels.  

##### Returns:
&emsp;&emsp;The factor with the new levels  


### # lvls_expand  

##### Expands the set of levels; the new levels must include the old levels.  

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`new_levels`: The new levels. Must include the old ones  

##### Returns:
&emsp;&emsp;The factor with the new levels  


### # lvls_union  

##### Find all levels in a list of factors

##### Args:
&emsp;&emsp;`fs`: A list of factors  

##### Returns:
&emsp;&emsp;A list of all levels  


## as_factor

In [46]:
# Turn a short vector of strings into a factor (categorical).
x = c("a", "z", "g")
as_factor(x)

['a', 'z', 'g']
Categories (3, object): ['a', 'g', 'z']

In [47]:
# Numeric-looking strings: the recorded output keeps them as object (string)
# levels, listed in string order rather than numeric order.
y = c("1.1", "11", "2.2", "22")
as_factor(y)

['1.1', '11', '2.2', '22']
Categories (4, object): ['1.1', '11', '2.2', '22']

In [48]:
# Convert the strings from the previous cell to numbers first; the recorded
# output then shows float64 levels in numeric order.
z = as_numeric(y)
as_factor(z)

  for val, m in zip(values.ravel(), mask.ravel())


[1.1, 11.0, 2.2, 22.0]
Categories (4, float64): [1.1, 2.2, 11.0, 22.0]

## fct_count

In [49]:
# Build a random factor by indexing a shuffled `letters` with the 1000 values
# returned by rpois(1000, 10), then tabulate how often each level occurs.
# `fct` is reused by the fct_count cells that follow.
# NOTE(review): no random seed is set in the visible cells, so the counts
# shown in the outputs will differ on re-run — confirm whether nb_helpers.py
# seeds the generator.
fct = factor(sample(letters)[rpois(1000, 10)])
table(fct)

Unnamed: 0,a,b,d,e,f,g,h,i,j,k,n,p,r,s,t,u,v,x,z
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
count,108,88,9,6,18,128,49,13,2,2,2,16,72,89,49,27,115,70,137


In [50]:
# Count the entries per level of `fct`; the result has columns `f` and `n`.
fct_count(fct)

Unnamed: 0,f,n
,<category>,<int64>
0.0,a,108
1.0,b,88
2.0,d,9
3.0,e,6
4.0,f,18
5.0,g,128
6.0,h,49
7.0,i,13
8.0,j,2


In [51]:
# Same counts, ordered so the most frequent levels come first.
fct_count(fct, sort=True)

Unnamed: 0,f,n
,<category>,<int64>
0.0,z,137
1.0,g,128
2.0,v,115
3.0,a,108
4.0,s,89
5.0,b,88
6.0,r,72
7.0,x,70
8.0,h,49


In [52]:
# Sorted counts plus a `p` column holding each level's share of the total.
fct_count(fct, sort=True, prop=True)

Unnamed: 0,f,n,p
,<category>,<int64>,<float64>
0.0,z,137,0.137
1.0,g,128,0.128
2.0,v,115,0.115
3.0,a,108,0.108
4.0,s,89,0.089
5.0,b,88,0.088
6.0,r,72,0.072
7.0,x,70,0.070
8.0,h,49,0.049


## fct_match

In [53]:
# Flag the entries of gss_cat.marital that equal one of the given levels,
# then tabulate the resulting False/True flags.
table(fct_match(gss_cat.marital, c("Married", "Divorced")))

Unnamed: 0,False,True
,<int64>,<int64>
count,7983,13500


In [54]:
# The level names here are misspelled ("Maried", "Davorced"): numpy.isin does
# not complain and, in the recorded output, simply reports no match at all.
table(numpy.isin(gss_cat.marital, c("Maried", "Davorced")))

Unnamed: 0,False
,<int64>
count,21483


In [55]:
# Same misspelled levels passed to fct_match(): the recorded output shows an
# error being reported instead of a silent all-False result.
# try_catch() is not defined in the visible cells — presumably provided by
# nb_helpers.py to display the exception; confirm.
with try_catch():
    table(fct_match(gss_cat.marital, c("Maried", "Davorced")))

[TypeError] only integer scalar arrays can be converted to a scalar index


## fct_unique

In [56]:
# Rebind `fct` to a new random factor: `letters` indexed by the 100 values of
# rpois(100, 10), each shifted down by one.
# NOTE(review): unseeded in the visible cells, so the output varies per run.
fct = factor(letters[rpois(100, 10)-1])

# Plain unique(): the recorded output lists the values in an unsorted order
# (presumably order of first appearance — confirm).
unique(fct)

['k', 'g', 'j', 'd', 'h', ..., 'a', 'i', 'n', 'p', 'c']
Length: 16
Categories (16, object): ['a', 'c', 'd', 'e', ..., 'n', 'o', 'p', 's']

In [57]:
# fct_unique(): in the recorded output the unique values follow the order of
# the factor's levels, unlike unique() in the previous cell.
fct_unique(fct)

['a', 'c', 'd', 'e', 'f', ..., 'm', 'n', 'o', 'p', 's']
Length: 16
Categories (16, object): ['a', 'c', 'd', 'e', ..., 'n', 'o', 'p', 's']

## lvls_reorder, lvls_revalue and lvls_expand

In [58]:
# Three-level factor reused by the lvls_revalue / lvls_expand cells below.
fct = factor(c("a", "b", "c"))
# Reverse the level order; the recorded output shows the values unchanged.
lvls_reorder(fct, [3, 2, 1])

['a', 'b', 'c']
Categories (3, object): ['c', 'b', 'a']

In [59]:
# Rename the levels, supplying one new name per existing level.
lvls_revalue(fct, c("apple", "banana", "carrot"))

['apple', 'banana', 'carrot']
Categories (3, object): ['apple', 'banana', 'carrot']

In [60]:
# Extend the level set with "d", which does not occur in the data; the
# existing levels are all included in the new set.
lvls_expand(fct, c("a", "b", "c", "d"))

['a', 'b', 'c']
Categories (4, object): ['a', 'b', 'c', 'd']

## lvls_union

In [62]:
# Collect the levels that appear across a list of factors.
fs = [factor("a"), factor("b"), factor(c("a", "b"))]
lvls_union(fs)

['a', 'b']