In [1]:
%run nb_helpers.py

import numpy
from datar.all import *
from datar.datasets import gss_cat


# Render the API documentation for the helpers demonstrated in this notebook.
# nb_header is not defined in this file; presumably it is provided by the
# `%run nb_helpers.py` line at the top of this cell.
# NOTE(review): the rendered docs below start with `fct_inorder` rather than
# `as_factor`, and the `book` value mentions "lvl_addrm" although the functions
# listed here are different helpers — confirm both are intended.
nb_header(
    as_factor,
    fct_count,
    fct_match,
    fct_unique,
    lvls_reorder,
    lvls_revalue,
    lvls_expand,
    lvls_union,
    book="forcat_lvl_addrm",
)


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ fct_inorder</div>

##### Reorder factor levels by first appearance

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`ordered`: A logical which determines the "ordered" status of the  
&emsp;&emsp;&emsp;&emsp;output factor.  

##### Returns:
&emsp;&emsp;The factor with levels reordered  


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ fct_count</div>

##### Count entries in a factor

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`sort`: If True, sort the result so that the most common values float to  
&emsp;&emsp;&emsp;&emsp;the top  

&emsp;&emsp;`prop`: If True, also compute each level's proportion of the total count.  

##### Returns:
&emsp;&emsp;A data frame with columns `f` and `n`, plus `p` if `prop` is True  


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ fct_match</div>

##### Test for presence of levels in a factor

Do any of `lvls` occur in `_f`?  

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`lvls`: A vector specifying levels to look for.  

##### Returns:
&emsp;&emsp;A logical vector  


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ fct_unique</div>

##### Unique values of a factor

##### Args:
&emsp;&emsp;`_f`: A factor  

##### Returns:
&emsp;&emsp;The factor with the unique values in `_f`  


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ lvls_reorder</div>

##### Leaves values of a factor as they are, but changes the order by given indices  

##### Args:
&emsp;&emsp;`f`: A factor (or character vector).  
&emsp;&emsp;`idx`: An integer index, with one integer for each existing level.  
&emsp;&emsp;`new_levels`: A character vector of new levels.  
&emsp;&emsp;`ordered`: A logical which determines the "ordered" status of the  
&emsp;&emsp;&emsp;&emsp;output factor. `None` preserves the existing status of the factor.  

##### Returns:
&emsp;&emsp;The factor with levels reordered  


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ lvls_revalue</div>

##### Changes the values of existing levels; there must be one new level for each old level  

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`new_levels`: A character vector of new levels.  

##### Returns:
&emsp;&emsp;The factor with the new levels  


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ lvls_expand</div>

##### Expands the set of levels; the new levels must include the old levels.  

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`new_levels`: The new levels. Must include the old ones  

##### Returns:
&emsp;&emsp;The factor with the new levels  


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ lvls_union</div>

##### Find all levels in a list of factors

##### Args:
&emsp;&emsp;`fs`: A list of factors  

##### Returns:
&emsp;&emsp;An array of all levels found in `fs`  


## as_factor

In [2]:
# Build a factor from a character vector. In the output below the categories
# keep the order of first appearance (a, z, g) rather than being sorted.
x = c("a", "z", "g")
as_factor(x)

['a', 'z', 'g']
Categories (3, object): ['a', 'z', 'g']

In [3]:
# Numeric-looking strings stay strings: the resulting categories have object
# dtype. `y` is reused by the next cell.
y = c("1.1", "11", "2.2", "22")
as_factor(y)

['1.1', '11', '2.2', '22']
Categories (4, object): ['1.1', '11', '2.2', '22']

In [4]:
# Convert the strings in `y` (defined in the previous cell) to numbers before
# building the factor; the resulting categories are float64.
z = as_numeric(y)
as_factor(z)

  output = repr(obj)


[1.1, 11.0, 2.2, 22.0]
Categories (4, float64): [1.1, 11.0, 2.2, 22.0]

## fct_count

In [5]:
# Build a factor of letters with uneven frequencies by indexing a (presumably
# shuffled) alphabet with the values returned by rpois(1000, 10), then tabulate
# how often each letter occurs.
# NOTE(review): no seed is set in the visible cells, so the counts shown below
# will likely differ on re-run — confirm whether nb_helpers.py seeds the RNG.
fct = factor(sample(letters)[rpois(1000, 10)])
table(fct)

Unnamed: 0,a,b,c,d,e,f,h,j,k,l,m,n,o,p,r,s,t,u,x,z
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
count,113,110,19,37,76,38,11,98,14,2,101,92,6,10,84,11,49,2,126,1


In [6]:
# Count the entries of each level; rows come back in level order.
fct_count(fct)

Unnamed: 0,f,n
,<category>,<int64>
0.0,a,113
1.0,b,110
2.0,c,19
3.0,d,37
4.0,e,76
5.0,f,38
6.0,h,11
7.0,j,98
8.0,k,14


In [7]:
# Same counts, ordered so the most frequent levels come first.
fct_count(fct, sort=True)

Unnamed: 0,f,n
,<category>,<int64>
18.0,x,126
0.0,a,113
1.0,b,110
10.0,m,101
7.0,j,98
11.0,n,92
14.0,r,84
4.0,e,76
16.0,t,49


In [8]:
# Sorted counts plus a `p` column holding each level's share of the total.
fct_count(fct, sort=True, prop=True)

Unnamed: 0,f,n,p
,<category>,<int64>,<float64>
18.0,x,126,0.126
0.0,a,113,0.113
1.0,b,110,0.110
10.0,m,101,0.101
7.0,j,98,0.098
11.0,n,92,0.092
14.0,r,84,0.084
4.0,e,76,0.076
16.0,t,49,0.049


## fct_match

In [9]:
# Tabulate how many entries of gss_cat.marital are "Married" or "Divorced".
table(fct_match(gss_cat.marital, c("Married", "Divorced")))

Unnamed: 0,False,True
,<int64>,<int64>
count,7983,13500


In [10]:
# With misspelled levels, a plain membership test silently matches nothing:
# every entry is tabulated as False.
table(numpy.isin(gss_cat.marital, c("Maried", "Davorced")))

Unnamed: 0,False
,<int64>
count,21483


In [11]:
# fct_match, by contrast, rejects levels that are not present in the factor:
# the misspelled names raise a ValueError. try_catch is not defined in this
# file (presumably it comes from nb_helpers.py); here it displays the error
# instead of aborting the notebook.
with try_catch():
    table(fct_match(gss_cat.marital, c("Maried", "Davorced")))

[ValueError] Levels not present in factor: ['Maried' 'Davorced'].


## fct_unique

In [12]:
# Rebind `fct` to a new factor of letters picked by the values of
# rpois(100, 10), shifted down by one before indexing.
# NOTE(review): a drawn value of 0 would become index -1 after the shift —
# confirm that is intended. As in the earlier random cell, no seed is visible.
fct = factor(letters[rpois(100, 10)-1])

# unique() returns the distinct values; in the output below they are not in
# level order.
unique(fct)

['l', 'm', 'h', 'u', 'k', ..., 'n', 'd', 'q', 'p', 'e']
Length: 15
Categories (15, object): ['d', 'e', 'f', 'g', ..., 'o', 'p', 'q', 'u']

In [13]:
# fct_unique() returns the same distinct values, but ordered by level.
fct_unique(fct)

['d', 'e', 'f', 'g', 'h', ..., 'n', 'o', 'p', 'q', 'u']
Length: 15
Categories (15, object): ['d', 'e', 'f', 'g', ..., 'o', 'p', 'q', 'u']

## lvls_reorder, lvls_revalue and lvls_expand

In [14]:
# Reverse the level order using 0-based positions; the values themselves stay
# a, b, c. `fct` is reused by the next two cells.
fct = factor(c("a", "b", "c"))
lvls_reorder(fct, [2,1,0])

['a', 'b', 'c']
Categories (3, object): ['c', 'b', 'a']

In [15]:
# Rename every level positionally: a -> apple, b -> banana, c -> carrot.
lvls_revalue(fct, c("apple", "banana", "carrot"))

['apple', 'banana', 'carrot']
Categories (3, object): ['apple', 'banana', 'carrot']

In [16]:
# Add an unused level "d"; the values are unchanged.
lvls_expand(fct, c("a", "b", "c", "d"))

['a', 'b', 'c']
Categories (4, object): ['a', 'b', 'c', 'd']

## lvls_union

In [17]:
# Collect the distinct levels found across a list of factors.
fs = [factor("a"), factor("b"), factor(c("a", "b"))]
lvls_union(fs)

array(['a', 'b'], dtype=object)