In [1]:
from datar.all import *
from datar.datasets import gss_cat

# Rebind `gss_cat` with its `rincome` column converted to a factor, so the
# fct_* examples further down can operate on it.
gss_cat >>= mutate(rincome=as_factor(f.rincome))

# Run the notebook helper script; it is expected to provide `nb_header`.
%run nb_helpers.py

# NOTE(review): presumably renders the API docs of the listed functions
# (the Args/Returns sections shown below) — confirm in nb_helpers.py.
nb_header(
    fct_anon,
    fct_collapse,
    fct_lump,
    fct_lump_min,
    fct_lump_prop,
    fct_lump_n,
    fct_lump_lowfreq,
    fct_other,
    fct_recode,
    fct_relabel,
    book="forcat_lvl_value",
)




### # fct_anon  

##### Anonymise factor levels

##### Args:
&emsp;&emsp;`f`: A factor.  
&emsp;&emsp;`prefix`: A character prefix to insert in front of the random labels.  

##### Returns:
&emsp;&emsp;The factor with levels anonymised  


### # fct_collapse  

##### Collapse factor levels into manually defined groups

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`**kwargs`: The levels to collapse.  
&emsp;&emsp;&emsp;&emsp;Like `name=[old_level, old_level1, ...]`. The old levels will  
&emsp;&emsp;&emsp;&emsp;be replaced with `name`  

&emsp;&emsp;`other_level`: If given, replace all levels not named in `kwargs`  
&emsp;&emsp;&emsp;&emsp;with this level. If not given, don't collapse them.  

##### Returns:
&emsp;&emsp;The factor with levels collapsed.  


### # fct_lump  

##### Lump together factor levels into "other"

##### Args:
&emsp;&emsp;`f`: A factor  
&emsp;&emsp;`n`: Positive `n` preserves the most common `n` values.  
&emsp;&emsp;&emsp;&emsp;Negative `n` preserves the least common `-n` values.  
&emsp;&emsp;&emsp;&emsp;If there are ties, you will get at least `abs(n)` values.  

&emsp;&emsp;`prop`: Positive `prop` lumps values which do not appear at least  
&emsp;&emsp;&emsp;&emsp;`prop` of the time. Negative `prop` lumps values that  
&emsp;&emsp;&emsp;&emsp;do not appear at most `-prop` of the time.  

&emsp;&emsp;`w`: An optional numeric vector giving weights for frequency of  
&emsp;&emsp;&emsp;&emsp;each value (not level) in f.  

&emsp;&emsp;`other_level`: Value of level used for "other" values. Always  
&emsp;&emsp;&emsp;&emsp;placed at end of levels.  

&emsp;&emsp;`ties_method`: A character string specifying how ties are treated.  
&emsp;&emsp;&emsp;&emsp;One of: `average`, `first`, `dense`, `max`, and `min`.  

##### Returns:
&emsp;&emsp;The factor with levels lumped.  


### # fct_lump_min  

##### Lumps levels that appear fewer than `min` times.

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`min`: Preserve levels that appear at least `min` number of times.  
&emsp;&emsp;`w`: An optional numeric vector giving weights for frequency of  
&emsp;&emsp;&emsp;&emsp;each value (not level) in f.  

&emsp;&emsp;`other_level`: Value of level used for "other" values. Always  
&emsp;&emsp;&emsp;&emsp;placed at end of levels.  

##### Returns:
&emsp;&emsp;The factor with levels lumped.  


### # fct_lump_prop  

##### Lumps levels that appear fewer than `prop * n` times.

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`prop`: Positive `prop` lumps values which do not appear at least  
&emsp;&emsp;&emsp;&emsp;`prop` of the time. Negative `prop` lumps values that  
&emsp;&emsp;&emsp;&emsp;do not appear at most `-prop` of the time.  

&emsp;&emsp;`w`: An optional numeric vector giving weights for frequency of  
&emsp;&emsp;&emsp;&emsp;each value (not level) in f.  

&emsp;&emsp;`other_level`: Value of level used for "other" values. Always  
&emsp;&emsp;&emsp;&emsp;placed at end of levels.  

##### Returns:
&emsp;&emsp;The factor with levels lumped.  


### # fct_lump_n  

##### Lumps all levels except for the `n` most frequent.

##### Args:
&emsp;&emsp;`f`: A factor  
&emsp;&emsp;`n`: Positive `n` preserves the most common `n` values.  
&emsp;&emsp;&emsp;&emsp;Negative `n` preserves the least common `-n` values.  
&emsp;&emsp;&emsp;&emsp;If there are ties, you will get at least `abs(n)` values.  

&emsp;&emsp;`w`: An optional numeric vector giving weights for frequency of  
&emsp;&emsp;&emsp;&emsp;each value (not level) in f.  

&emsp;&emsp;`other_level`: Value of level used for "other" values. Always  
&emsp;&emsp;&emsp;&emsp;placed at end of levels.  

&emsp;&emsp;`ties_method`: A character string specifying how ties are treated.  
&emsp;&emsp;&emsp;&emsp;One of: `average`, `first`, `dense`, `max`, and `min`.  

##### Returns:
&emsp;&emsp;The factor with levels lumped.  


### # fct_lump_lowfreq  

##### Lumps together the least frequent levels, ensuring
that "other" is still the smallest level.  

##### Args:
&emsp;&emsp;`f`: A factor  
&emsp;&emsp;`other_level`: Value of level used for "other" values. Always  
&emsp;&emsp;&emsp;&emsp;placed at end of levels.  

##### Returns:
&emsp;&emsp;The factor with levels lumped.  


### # fct_other  

##### Replace levels with "other"

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`keep`: and  
&emsp;&emsp;`drop`: Pick one of `keep` and `drop`:  
&emsp;&emsp;&emsp;&emsp;- `keep` will preserve listed levels, replacing all others with
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;`other_level`.  

&emsp;&emsp;&emsp;&emsp;- `drop` will replace listed levels with `other_level`, keeping all
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;others as is.  

&emsp;&emsp;`other_level`: Value of level used for "other" values. Always  
&emsp;&emsp;&emsp;&emsp;placed at end of levels.  

##### Returns:
&emsp;&emsp;The factor with levels replaced.  


### # fct_recode  

##### Change factor levels by hand

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`*args`: and  
&emsp;&emsp;`**kwargs`: A sequence of named character vectors where the name  
&emsp;&emsp;&emsp;&emsp;gives the new level, and the value gives the old level.  
&emsp;&emsp;&emsp;&emsp;Levels not otherwise mentioned will be left as is. Levels can  
&emsp;&emsp;&emsp;&emsp;be removed by naming them `NULL`.  
&emsp;&emsp;&emsp;&emsp;As `NULL/None` cannot be a name of keyword arguments, replacement  
&emsp;&emsp;&emsp;&emsp;has to be specified as a dict  
&emsp;&emsp;&emsp;&emsp;(i.e. `fct_recode(x, {NULL: "apple"})`)  
&emsp;&emsp;&emsp;&emsp;If you want to replace multiple values with the same old value,  
&emsp;&emsp;&emsp;&emsp;use a `set`/`list`/`numpy.ndarray`  
&emsp;&emsp;&emsp;&emsp;(i.e. `fct_recode(x, fruit=["apple", "banana"])`).  
&emsp;&emsp;&emsp;&emsp;This is a safe way, since `set`/`list`/`numpy.ndarray` is  
&emsp;&emsp;&emsp;&emsp;not hashable to be a level of a factor.  
&emsp;&emsp;&emsp;&emsp;Do NOT use a `tuple`, as it's hashable!  

&emsp;&emsp;&emsp;&emsp;Note that the order of the name-value is in the reverse way as  
&emsp;&emsp;&emsp;&emsp;`dplyr.recode()` and `dplyr.recode_factor()`  

##### Returns:
&emsp;&emsp;The factor recoded with given recodings  


### # fct_relabel  

##### Automatically relabel factor levels, collapse as necessary

##### Args:
&emsp;&emsp;`_f`: A factor  
&emsp;&emsp;`_fun`: A function to be applied to each level. Must accept the old  
&emsp;&emsp;&emsp;&emsp;levels and return a character vector of the same length  
&emsp;&emsp;&emsp;&emsp;as its input.  

&emsp;&emsp;`*args`: and  
&emsp;&emsp;`**kwargs`: Additional arguments to `_fun`  

##### Returns:
&emsp;&emsp;The factor with levels relabeled  


## fct_anon

In [2]:
# Baseline: number of observations per level of `relig`, before anonymising.
gss_cat.relig >> fct_count()

Unnamed: 0,f,n
,<category>,<int64>
0.0,Buddhism,147
1.0,Catholic,5124
2.0,Christian,689
3.0,Don't know,15
4.0,Hinduism,71
5.0,Inter-nondenominational,109
6.0,Jewish,388
7.0,Moslem/islam,104
8.0,Native american,23


In [3]:
# Anonymise the level labels of `relig` (the docs describe the new labels as
# random), then count the observations per anonymised level.
gss_cat.relig >> fct_anon() >> fct_count()

Unnamed: 0,f,n
,<category>,<int64>
0.0,00,95
1.0,01,15
2.0,02,689
3.0,03,388
4.0,04,5124
5.0,05,32
6.0,06,147
7.0,07,23
8.0,08,109


In [4]:
# Same as above, with the prefix "X" inserted in front of each anonymised label.
gss_cat.relig >> fct_anon("X") >> fct_count()

Unnamed: 0,f,n
,<category>,<int64>
0.0,X00,388
1.0,X01,5124
2.0,X02,32
3.0,X03,93
4.0,X04,104
5.0,X05,71
6.0,X06,23
7.0,X07,147
8.0,X08,10846


## fct_collapse

In [5]:
# Baseline: number of observations per level of `partyid`, before collapsing.
fct_count(gss_cat.partyid)

Unnamed: 0,f,n
,<category>,<int64>
0.0,Don't know,1
1.0,"Ind,near dem",2499
2.0,"Ind,near rep",1791
3.0,Independent,4119
4.0,No answer,154
5.0,Not str democrat,3690
6.0,Not str republican,3032
7.0,Other party,393
8.0,Strong democrat,3490


In [6]:
# Collapse the party levels into five manually defined groups: each keyword
# is the new level name and its value lists the old level(s) it replaces.
partyid2 = fct_collapse(
    gss_cat.partyid,
    missing = c("No answer", "Don't know"),
    other = "Other party",
    rep = c("Strong republican", "Not str republican"),
    ind = c("Ind,near rep", "Independent", "Ind,near dem"),
    dem = c("Not str democrat", "Strong democrat")
)
# Count the observations in each collapsed group.
fct_count(partyid2)

Unnamed: 0,f,n
,<category>,<int64>
0.0,missing,155
1.0,ind,8409
2.0,dem,7180
3.0,rep,5346
4.0,other,393


## fct_recode

In [7]:
# Small example factor; "apple" and "banana" are both recoded to the single
# new level "fruit" by passing the old levels as a list.
x = factor(c("apple", "bear", "banana", "dear"))
fct_recode(x, fruit=["apple", "banana"])

['fruit', 'bear', 'fruit', 'dear']
Categories (3, object): ['fruit', 'bear', 'dear']

In [8]:
# If you make a mistake you'll get a warning: "bananana" is not a level of
# `x`, so "banana" is left as is and only "apple" is recoded.
fct_recode(x, fruit=["apple", "bananana"])



['fruit', 'bear', 'banana', 'dear']
Categories (4, object): ['fruit', 'banana', 'bear', 'dear']

In [9]:
# Naming the new level NULL removes the old level. NULL cannot be used as a
# keyword-argument name, so that replacement is passed as a dict.
fct_recode(x, {NULL: "apple"}, fruit = "banana")

[NaN, 'fruit', 'bear', 'dear']
Categories (3, object): ['fruit', 'bear', 'dear']

In [10]:
# New level names that cannot be written as keyword arguments (e.g. names
# containing spaces) must be passed in a dict instead.
fct_recode(x, {"an apple": "apple", "a bear": "bear"})

['an apple', 'a bear', 'banana', 'dear']
Categories (4, object): ['an apple', 'banana', 'a bear', 'dear']

## fct_lump, fct_lump_min, fct_lump_prop, fct_lump_n, and fct_lump_lowfreq

In [12]:
# Example factor for the lumping demos: the first nine letters, each repeated
# the number of times given in `times` (very uneven frequencies).
x = factor(rep(LETTERS[:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
# Frequency of each level.
table(x)

Unnamed: 0,A,B,C,D,E,F,G,H,I
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
count,40,10,5,27,1,1,1,1,1


In [14]:
# Keep the 3 most common levels and lump all the others into "Other".
# The result is bound to a name rather than read back through IPython's `_`
# (the last displayed output), which silently picks up a different value when
# cells are re-ordered or only the last expression of a cell is displayed.
x_top3 = x >> fct_lump_n(3)
x_top3
table(x_top3)

['A', 'A', 'A', 'A', 'A', ..., 'Other', 'Other', 'Other', 'Other', 'Other']
Length: 87
Categories (4, object): ['A', 'B', 'D', 'Other']

Unnamed: 0,A,B,D,Other
,<int64>,<int64>,<int64>,<int64>
count,40,10,27,10


In [15]:
# Lump the levels that do not appear at least 10% of the time into "Other".
# The result is bound to a name rather than read back through IPython's `_`
# (the last displayed output), which silently picks up a different value when
# cells are re-ordered or only the last expression of a cell is displayed.
x_prop10 = x >> fct_lump_prop(0.10)
x_prop10
table(x_prop10)

['A', 'A', 'A', 'A', 'A', ..., 'Other', 'Other', 'Other', 'Other', 'Other']
Length: 87
Categories (4, object): ['A', 'B', 'D', 'Other']

Unnamed: 0,A,B,D,Other
,<int64>,<int64>,<int64>,<int64>
count,40,10,27,10


In [16]:
# Preserve the levels that appear at least 5 times; lump the rest into "Other".
# The result is bound to a name rather than read back through IPython's `_`
# (the last displayed output), which silently picks up a different value when
# cells are re-ordered or only the last expression of a cell is displayed.
x_min5 = x >> fct_lump_min(5)
x_min5
table(x_min5)

['A', 'A', 'A', 'A', 'A', ..., 'Other', 'Other', 'Other', 'Other', 'Other']
Length: 87
Categories (5, object): ['A', 'B', 'C', 'D', 'Other']

Unnamed: 0,A,B,C,D,Other
,<int64>,<int64>,<int64>,<int64>,<int64>
count,40,10,5,27,5


In [17]:
# Lump together the least frequent levels while keeping "Other" the smallest
# level.
# The result is bound to a name rather than read back through IPython's `_`
# (the last displayed output), which silently picks up a different value when
# cells are re-ordered or only the last expression of a cell is displayed.
x_lowfreq = x >> fct_lump_lowfreq()
x_lowfreq
table(x_lowfreq)

['A', 'A', 'A', 'A', 'A', ..., 'Other', 'Other', 'Other', 'Other', 'Other']
Length: 87
Categories (3, object): ['A', 'D', 'Other']

Unnamed: 0,A,D,Other
,<int64>,<int64>,<int64>
count,40,27,20


In [18]:
# Second example factor: letters picked by indexing LETTERS with the values
# returned by rpois(100, 5).
# NOTE(review): no random seed is set before this call, so `x` — and every
# output derived from it below — presumably changes on each run; consider
# seeding the RNG to make the recorded outputs reproducible.
x = factor(LETTERS[rpois(100, 5)])
x

['H', 'I', 'I', 'D', 'G', ..., 'E', 'H', 'I', 'F', 'E']
Length: 100
Categories (12, object): ['A', 'B', 'C', 'D', ..., 'I', 'J', 'K', 'L']

In [19]:
# Frequency of each level of the random factor.
table(x)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
count,2,1,9,14,14,18,12,12,12,3,1,2


In [20]:
# Lump the least frequent levels while keeping "Other" the smallest level.
# In the recorded output no level is lumped: the table equals table(x).
table(fct_lump_lowfreq(x))

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
count,2,1,9,14,14,18,12,12,12,3,1,2


In [21]:
# Preserve the 3 most common levels; every other value becomes "Other".
fct_lump_n(x, n = 3)

['Other', 'Other', 'Other', 'D', 'Other', ..., 'E', 'Other', 'Other', 'F', 'E']
Length: 100
Categories (4, object): ['D', 'E', 'F', 'Other']

In [22]:
# Lump the levels that do not appear at least 10% of the time.
fct_lump_prop(x, prop = 0.1)

['H', 'I', 'I', 'D', 'G', ..., 'E', 'H', 'I', 'F', 'E']
Length: 100
Categories (7, object): ['D', 'E', 'F', 'G', 'H', 'I', 'Other']

In [23]:
# Use negative values to collapse the most common levels instead:
# n = -3 preserves the least common levels (ties can yield more than 3).
fct_lump_n(x, n = -3)

['Other', 'Other', 'Other', 'Other', 'Other', ..., 'Other', 'Other', 'Other', 'Other', 'Other']
Length: 100
Categories (5, object): ['A', 'B', 'K', 'L', 'Other']

In [24]:
# Negative `prop`: lump the values that do not appear at most 10% of the
# time, i.e. the rarer levels are the ones preserved.
fct_lump_prop(x, prop = -0.1)

['Other', 'Other', 'Other', 'Other', 'Other', ..., 'Other', 'Other', 'Other', 'Other', 'Other']
Length: 100
Categories (7, object): ['A', 'B', 'C', 'J', 'K', 'L', 'Other']

In [25]:
# Per-value weights: the first 50 values get weight 2, the last 50 weight 1.
w = c(rep(2, 50), rep(1, 50))
# Preserve the 5 most common levels, with frequencies weighted by `w`.
fct_lump_n(x, n = 5, w = w)

['Other', 'I', 'I', 'D', 'G', ..., 'E', 'Other', 'I', 'F', 'E']
Length: 100
Categories (6, object): ['D', 'E', 'F', 'G', 'I', 'Other']

In [26]:
# Preserve the 6 most common levels, using the default tie handling.
fct_lump_n(x, n = 6)

['H', 'I', 'I', 'D', 'G', ..., 'E', 'H', 'I', 'F', 'E']
Length: 100
Categories (7, object): ['D', 'E', 'F', 'G', 'H', 'I', 'Other']

In [27]:
# Same call with an explicit tie-handling method; in the recorded output the
# result matches the default call above.
fct_lump_n(x, n = 6, ties_method = "max")

['H', 'I', 'I', 'D', 'G', ..., 'E', 'H', 'I', 'F', 'E']
Length: 100
Categories (7, object): ['D', 'E', 'F', 'G', 'H', 'I', 'Other']

In [28]:
# Use fct_lump_min() to lump together all levels that appear fewer than
# `min` times (here: fewer than 10).
table(fct_lump_min(x, min = 10))


Unnamed: 0,D,E,F,G,H,I,Other
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
count,14,14,18,12,12,12,18


In [29]:
# A higher threshold: only levels appearing at least 15 times are preserved.
table(fct_lump_min(x, min = 15))

Unnamed: 0,F,Other
,<int64>,<int64>
count,18,82


## fct_other

In [30]:
# `keep` preserves the listed levels and replaces all others with "Other".
fct_other(x, keep = c("A", "B"))

['Other', 'Other', 'Other', 'Other', 'Other', ..., 'Other', 'Other', 'Other', 'Other', 'Other']
Length: 100
Categories (3, object): ['A', 'B', 'Other']

In [31]:
# `drop` replaces the listed levels with "Other" and keeps all others as is.
fct_other(x, drop = c("A", "B"))

['I', 'J', 'J', 'E', 'H', ..., 'F', 'I', 'J', 'G', 'F']
Length: 100
Categories (11, object): ['C', 'D', 'E', 'F', ..., 'J', 'K', 'L', 'Other']

## fct_recode

In [32]:
# Rebuild the small example factor (`x` was reused for the lumping demos),
# then recode both "apple" and "banana" to the single new level "fruit".
x = factor(c("apple", "bear", "banana", "dear"))
fct_recode(x, fruit = ["apple", "banana"])

['fruit', 'bear', 'fruit', 'dear']
Categories (3, object): ['fruit', 'bear', 'dear']

In [33]:
# If you make a mistake you'll get a warning: "bananana" is not a level of
# `x`, so "banana" is left as is and only "apple" is recoded.
fct_recode(x, fruit = ["apple", "bananana"])



['fruit', 'bear', 'banana', 'dear']
Categories (4, object): ['fruit', 'banana', 'bear', 'dear']

In [34]:
# If you name the level NULL it will be removed. NULL cannot be used as a
# keyword-argument name, so that replacement is passed as a dict.
fct_recode(x, {NULL: "apple"}, fruit = "banana")

[NaN, 'fruit', 'bear', 'dear']
Categories (3, object): ['fruit', 'bear', 'dear']

In [35]:
# New level names that cannot be written as keyword arguments (e.g. names
# containing spaces) must be passed in a dict instead.
fct_recode(x, {"an apple": "apple", "a bear": "bear"})

['an apple', 'a bear', 'banana', 'dear']
Categories (4, object): ['an apple', 'banana', 'a bear', 'dear']

## fct_relabel

In [36]:
# Baseline: level counts of `partyid` before relabelling.
gss_cat.partyid >> fct_count()

Unnamed: 0,f,n
,<category>,<int64>
0.0,Don't know,1
1.0,"Ind,near dem",2499
2.0,"Ind,near rep",1791
3.0,Independent,4119
4.0,No answer,154
5.0,Not str democrat,3690
6.0,Not str republican,3032
7.0,Other party,393
8.0,Strong democrat,3490


In [37]:
# Relabel every level by replacing "," with ", " (adds the missing space
# after the comma), then count the observations per level.
gss_cat.partyid >> fct_relabel(lambda old: gsub(",", ", ", old)) >> fct_count()

Unnamed: 0,f,n
,<category>,<int64>
0.0,Don't know,1
1.0,"Ind, near dem",2499
2.0,"Ind, near rep",1791
3.0,Independent,4119
4.0,No answer,154
5.0,Not str democrat,3690
6.0,Not str republican,3032
7.0,Other party,393
8.0,Strong democrat,3490


In [38]:
# Baseline: level counts of `rincome` before relabelling.
fct_count(gss_cat.rincome)

Unnamed: 0,f,n
,<category>,<int64>
0.0,$8000 to 9999,340
1.0,Not applicable,7043
2.0,$20000 - 24999,1283
3.0,$25000 or more,7363
4.0,$7000 to 7999,188
5.0,$10000 - 14999,1168
6.0,Refused,975
7.0,$15000 - 19999,1048
8.0,$3000 to 3999,276


In [39]:
def convert_income(income):
    """Relabel dollar-amount income levels by their 5000-wide bracket.

    Levels that start with a dollar amount (optionally prefixed with "Lt ")
    have that amount truncated down to a multiple of 5000 and are relabelled
    "Gt $<amount>" (the amount is numeric, so it prints like "5000.0").
    Levels that do not match are returned unchanged.

    Args:
        income: Array of level labels supporting boolean-mask indexing and
            `.copy()`. It is not modified.

    Returns:
        A relabelled copy of `income`, of the same length.
    """
    # Work on a copy: the masked assignment below would otherwise overwrite
    # the caller's array of levels in place.
    income = income.copy()
    regex = r"^(?:Lt |)[$]([0-9]+).*$"
    is_range = grepl(regex, income)
    # Keep only the first run of digits after the "$" and make it numeric.
    num_income = as_numeric(gsub(regex, r"\1", income[is_range]))
    # Truncate down to the nearest multiple of 5000.
    num_income = trunc(num_income / 5000) * 5000
    income[is_range] = paste0("Gt $", num_income)
    return income

# Preview the relabelling on the levels of `rincome`.
convert_income(levels(gss_cat.rincome))

array(['Gt $5000.0', 'Not applicable', 'Gt $20000.0', 'Gt $25000.0',
       'Gt $5000.0', 'Gt $10000.0', 'Refused', 'Gt $15000.0', 'Gt $0.0',
       'Gt $5000.0', "Don't know", 'Gt $0.0', 'Gt $0.0', 'No answer',
       'Gt $5000.0', 'Gt $0.0'], dtype=object)

In [40]:
# Relabel the levels of `rincome` with convert_income; levels that end up
# with the same label are collapsed into one.
rincome2 = fct_relabel(gss_cat.rincome, convert_income)
# Count the observations per relabelled level.
fct_count(rincome2)

Unnamed: 0,f,n
,<category>,<int64>
0.0,Gt $5000.0,970
1.0,Not applicable,7043
2.0,Gt $20000.0,1283
3.0,Gt $25000.0,7363
4.0,Gt $10000.0,1168
5.0,Refused,975
6.0,Gt $15000.0,1048
7.0,Gt $0.0,1183
8.0,Don't know,267
