Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
760 lines (671 sloc)
27.8 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# @author Scott Dobbins | |
# @version 0.9.9.6 | |
# @date 2017-11-19 01:00 | |
### Statistics -------------------------------------------------------------- | |
tabulate_factor <- function(fact) { | |
return (tabulate(fact, nbins = nlevels(fact))) | |
} | |
mode_factor <- function(fact) { | |
return (levels(fact)[which.max(tabulate_factor(fact))]) | |
} | |
mode_and_others_factor <- function(fact) { | |
all_levels <- levels(fact) | |
tabs <- tabulate_factor(fact) | |
present_levels <- all_levels[tabs > 0L] | |
mode_level <- all_levels[which.max(tabs)] | |
non_mode_levels <- present_levels %d% mode_level | |
return (list("mode" = mode_level, "others" = non_mode_levels)) | |
} | |
### Information ------------------------------------------------------------- | |
level_proportions <- function(column, na.rm = FALSE) { | |
assert_that(is.factor(column), | |
msg = "You supplied a non-factor vector/column to a factor-only method") | |
tab_counts <- tabulate_factor(column) | |
if (na.rm && (sum(tab_counts == 0L) > 0L)) { | |
tab_counts <- tab_counts %[!=]% 0 | |
} | |
tab_total <- sum(tab_counts) | |
return (tab_counts / tab_total) | |
} | |
missing_levels <- function(fact) { | |
return (levels(fact)[tabulate_factor(fact) == 0L]) | |
} | |
level_index <- function(fact, lev) { | |
if (is.na(lev)) { | |
return (which(is.na(levels(fact)))) | |
} else { | |
return (which(levels(fact) == lev)) | |
} | |
} | |
# duplicate of level_index() with better name and default argument for finding NAs | |
NA_index <- function(fact, NA_level = NA_character_) { | |
if (is.na(NA_level)) { | |
return (which(is.na(levels(fact)))) | |
} else { | |
return (which(levels(fact) == NA_level)) | |
} | |
} | |
### Helpers ----------------------------------------------------------------- | |
# best versions of the factor functions set levels (and names) by reference using data.table::setattr | |
if (is_package_installed("data.table")) { | |
# function call is faster without :: reference | |
if (is_package_loaded("data.table")) { | |
re_name <- function(vec, new_names) { | |
setattr(vec, "names", new_names) | |
} | |
re_level <- function(vec, new_levels, NA_level = NA_character_) { | |
setattr(vec, "levels", fix_NA_levels(new_levels, NA_level = NA_level)) | |
} | |
} else { | |
re_name <- function(vec, new_names) { | |
data.table::setattr(vec, "names", new_names) | |
} | |
re_level <- function(vec, new_levels, NA_level = NA_character_) { | |
data.table::setattr(vec, "levels", fix_NA_levels(new_levels, NA_level = NA_level)) | |
} | |
} | |
} else { | |
re_name <- function(vec, new_names) { | |
global_vec_name <- deparse(substitute(vec)) | |
names(vec) <- new_names | |
assign(global_vec_name, vec, envir = parent.frame()) | |
invisible(vec) | |
} | |
re_level <- function(vec, new_levels) { | |
global_vec_name <- deparse(substitute(vec)) | |
levels(vec) <- new_levels | |
assign(global_vec_name, vec, envir = parent.frame()) | |
invisible(vec) | |
} | |
} | |
fix_NA_levels <- function(levs, NA_level = NA_character_) { | |
if (!is.na(NA_level) && anyNA(levs)) { | |
warning("NA levels set to the given (character) NA_level") | |
levs[is.na(levs)] <- NA_level | |
} | |
return (levs) | |
} | |
### Ordering Function ------------------------------------------------------- | |
ordered_empty_at_end <- function(column, empty_string) { | |
ordered_levels <- sort(levels(column)) | |
if ("" %c% ordered_levels) { | |
ordered_levels <- c(ordered_levels %[!=]% "", empty_string) | |
return (ordered(replace_level(column, from = "", to = empty_string), levels = ordered_levels)) | |
} else { | |
if (empty_string %c% ordered_levels) { | |
ordered_levels <- c(ordered_levels %[!=]% empty_string, empty_string) | |
} | |
return (ordered(column, levels = ordered_levels)) | |
} | |
} | |
refactor_and_order <- function(column, empty_string, drop_to = "") { | |
if (drop_to %c% missing_levels(column) || drop_to %!c% levels(column)) { | |
return (ordered_empty_at_end(column = droplevels(column), empty_string = empty_string)) | |
} else { | |
return (ordered_empty_at_end(column = drop_missing_levels(column, to = drop_to), empty_string = empty_string)) | |
} | |
} | |
### Level Functions --------------------------------------------------------- | |
### Format | |
format_levels <- function(fact, func, ...) { | |
re_level(fact, func(levels(fact), ...)) | |
} | |
format_similar_levels <- function(fact, pairings, exact = FALSE, ...) { | |
new_levels <- levels(fact) | |
pairings_names <- get_names(pairings) | |
if (exact) { | |
for (i in seq_along(pairings)) { | |
new_levels[new_levels %exactlylike% pairings_names[[i]]] <- (pairings[[i]])(new_levels %whichexactlylike% pairings_names[[i]], ...) | |
} | |
} else { | |
for (i in seq_along(pairings)) { | |
new_levels[new_levels %like% pairings_names[[i]]] <- (pairings[[i]])(new_levels %whichlike% pairings_names[[i]], ...) | |
} | |
} | |
re_level(fact, new_levels) | |
} | |
### Replace | |
replace_level <- function(fact, from, to) { | |
assert_that(length(from) == 1L && length(to) == 1L, | |
msg = "either (or both) 'from' or 'to' are of length > 1L (you may have intended to use replace_levels, not replace_level") | |
new_levels <- levels(fact) | |
new_levels[new_levels == from] <- to | |
re_level(fact, new_levels) | |
} | |
replace_levels <- function(fact, from, to) { | |
assert_that(length(from) == length(to) || length(to) == 1L, | |
msg = "Lengths of 'from' and 'to' don't match") | |
new_levels <- levels(fact) | |
if (length(to) == 1L) { | |
for (i in seq_along(from)) { | |
new_levels[new_levels == from[[i]]] <- to | |
} | |
} else { | |
for (i in seq_along(from)) { | |
new_levels[new_levels == from[[i]]] <- to[[i]] | |
} | |
} | |
re_level(fact, new_levels) | |
} | |
### Rename | |
rename_levels <- function(fact, changes) { | |
new_levels <- levels(fact) | |
re_name(new_levels, new_levels) | |
changes <- changes %whichin% new_levels | |
if (!is_empty(changes)) { | |
change_empty_level <- "" %c% changes | |
if (change_empty_level) { | |
empty_change <- get_names(changes %[==]% "") | |
changes <- changes %[!=]% "" | |
} | |
changes_names <- get_names(changes) | |
for (i in seq_along(changes)) { | |
new_levels[[changes[[i]]]] <- changes_names[[i]] | |
} | |
if (change_empty_level) { | |
new_levels[new_levels == ""] <- empty_change | |
} | |
re_level(fact, unname(new_levels)) | |
} else { | |
invisible(fact) | |
} | |
} | |
rename_similar_levels <- function(fact, changes, exact = FALSE) { | |
new_levels <- levels(fact) | |
changes_names <- get_names(changes) | |
for (i in seq_along(changes)) { | |
new_levels <- gsub(pattern = changes[[i]], replacement = changes_names[[i]], new_levels, fixed = exact) | |
} | |
re_level(fact, new_levels) | |
} | |
### Add | |
add_levels <- function(fact, add) { | |
new_levels <- levels(fact) | |
add <- add %which!in% new_levels | |
if (!is_empty(add)) { | |
re_level(fact, c(new_levels, add)) | |
} else { | |
invisible(fact) | |
} | |
} | |
### Drop | |
drop_levels <- function(fact, drop, to = "") { | |
new_levels <- levels(fact) | |
drop <- drop %whichin% new_levels | |
if (!is_empty(drop)) { | |
re_name(new_levels, new_levels) | |
drop_empty_level <- "" %c% drop | |
if (drop_empty_level) { | |
drop <- drop %[!=]% "" | |
} | |
for (i in seq_along(drop)) { | |
new_levels[[drop[[i]]]] <- to | |
} | |
if (drop_empty_level) { | |
new_levels[new_levels == ""] <- to | |
} | |
re_level(fact, unname(new_levels)) | |
} else { | |
invisible(fact) | |
} | |
} | |
drop_similar_levels <- function(fact, drop, to = "", exact = FALSE) { | |
new_levels <- levels(fact) | |
for (i in seq_along(drop)) { | |
new_levels[grepl(pattern = drop[[i]], fixed = exact, new_levels)] <- to | |
} | |
re_level(fact, new_levels) | |
} | |
drop_missing_levels <- function(fact, to = "") { | |
drop_levels(fact, drop = missing_levels(fact), to) | |
} | |
drop_levels_formula <- function(fact, expr, to = "") { | |
drop_levels <- eval(parse(text = paste0("levels(fact) %[]% ", deparse(substitute(expr))))) | |
drop_levels(fact, drop = drop_levels, to) | |
} | |
keep_levels <- function(fact, keep, to = "") { | |
new_levels <- levels(fact) | |
new_levels[new_levels %!in% keep] <- to | |
re_level(fact, new_levels) | |
} | |
keep_similar_levels <- function(fact, keep, to = "", exact = FALSE) { | |
levels_to_drop <- levels(fact) | |
for (i in seq_along(keep)) { | |
levels_to_drop[grepl(pattern = keep[[i]], fixed = exact, levels_to_drop)] <- to | |
} | |
levels_to_drop <- levels_to_drop[levels_to_drop != to] | |
drop_levels(fact, drop = levels_to_drop, to = to) | |
} | |
### Simplify | |
reduce_levels <- function(fact, rules, other = "other", exact = FALSE) { | |
replacements <- get_names(rules) | |
patterns <- unname(rules) | |
old_levels <- levels(fact) | |
new_levels <- rep(other, length(old_levels)) | |
for (i in seq_along(rules)) { | |
slicer <- grepl(pattern = patterns[[i]], fixed = exact, old_levels) | |
new_levels[slicer] <- replacements[[i]] | |
old_levels[slicer] <- "" | |
} | |
re_level(fact, new_levels) | |
} | |
otherize_levels_rank <- function(fact, cutoff, other = "other", otherize_empty_levels = TRUE, include_ties = TRUE) { | |
fact_levels <- levels(fact) | |
if (otherize_empty_levels) { | |
otherize_empty_levels <- "" %c% fact_levels | |
} | |
lookup_table <- data.table(levels = fact_levels, count = tabulate_factor(fact)) | |
print(lookup_table) | |
if (other %c% fact_levels) { | |
cutoff <- cutoff + 1L | |
} | |
if (otherize_empty_levels) { | |
lookup_table <- lookup_table[levels != ""] | |
} | |
setkey(lookup_table, count) | |
if (include_ties) { | |
count_at_cutoff <- lookup_table[.N-cutoff, count] | |
dropped_levels <- lookup_table[count < count_at_cutoff, levels] | |
} else { | |
dropped_levels <- lookup_table[1:(.N-cutoff), levels] | |
} | |
if (otherize_empty_levels) { | |
dropped_levels <- append(dropped_levels, "") | |
} | |
drop_levels(fact, drop = dropped_levels, to = other) | |
} | |
otherize_levels_prop <- function(fact, cutoff, other = "other", otherize_empty_levels = TRUE) { | |
fact_levels <- levels(fact) | |
if (otherize_empty_levels) { | |
otherize_empty_levels <- "" %c% fact_levels | |
} | |
lookup_table <- data.table(levels = fact_levels, prop = level_proportions(fact)) | |
if (otherize_empty_levels) { | |
lookup_table <- lookup_table[levels != ""] | |
} | |
dropped_levels <- lookup_table[prop < cutoff, levels] | |
if (otherize_empty_levels) { | |
dropped_levels <- append(dropped_levels, "") | |
} | |
drop_levels(fact, drop = dropped_levels, to = other) | |
} | |
otherize_levels_count <- function(fact, cutoff, other = "other", otherize_empty_levels = TRUE) { | |
fact_levels <- levels(fact) | |
if (otherize_empty_levels) { | |
otherize_empty_levels <- "" %c% fact_levels | |
} | |
lookup_table <- data.table(levels = fact_levels, count = tabulate_factor(fact)) | |
if (otherize_empty_levels) { | |
lookup_table <- lookup_table[levels != ""] | |
} | |
dropped_levels <- lookup_table[count < cutoff, levels] | |
if (otherize_empty_levels) { | |
dropped_levels <- append(dropped_levels, "") | |
} | |
drop_levels(fact, drop = dropped_levels, to = other) | |
} | |
otherize_groups_count <- function(dt, cutoff, group_cols = colnames(dt), cols_to_otherize = colnames(dt), other = "other") { | |
count_table <- dt[, .N, by = group_cols] | |
for_removal <- count_table[N < cutoff, group_cols, with = FALSE] | |
otherize_groups_base(dt, for_removal, group_cols, cols_to_otherize, other) | |
} | |
otherize_groups_prop <- function(dt, cutoff, group_cols = colnames(dt), cols_to_otherize = colnames(dt), other = "other") { | |
dt_nrows <- nrow(dt) | |
prop_table <- dt[, .(prop = .N / dt_nrows), by = group_cols] | |
for_removal <- prop_table[prop < cutoff, group_cols, with = FALSE] | |
otherize_groups_base(dt, for_removal, group_cols, cols_to_otherize, other) | |
} | |
otherize_groups_rank <- function(dt, cutoff, group_cols = colnames(dt), cols_to_otherize = colnames(dt), other = "other", include_ties = TRUE) { | |
count_table <- dt[, .N, by = group_cols] | |
setkey(count_table, N) | |
if (include_ties) { | |
N_at_cutoff <- count_table[.N-cutoff, N] | |
for_removal <- count_table[N < N_at_cutoff, group_cols, with = FALSE] | |
} else { | |
for_removal <- count_table[1:(.N-cutoff), group_cols, with = FALSE] | |
} | |
otherize_groups_base(dt, for_removal, group_cols, cols_to_otherize, other) | |
} | |
otherize_groups_base <- function(dt, for_removal, group_cols, cols_to_otherize, other) { | |
otherize_ncols <- length(cols_to_otherize) | |
other <- recycle_arguments(other, otherize_ncols) | |
removal_nrows <- nrow(for_removal) | |
for (c in seq_len(removal_nrows)) { | |
rows <- dt[for_removal[c], on = group_cols, which = TRUE] | |
for (d in seq_len(otherize_ncols)) { | |
set(dt, i = rows, j = cols_to_otherize[[d]], value = other[[d]]) | |
} | |
} | |
} | |
### Validate | |
if (is_package_loaded("data.table")) { | |
fill_matching_values_ <- function(data, code_col, value_col, ...) { | |
code_col <- deparse(substitute(code_col)) | |
value_col <- deparse(substitute(value_col)) | |
fill_matching_values(data, code_col, value_col, ...) | |
} | |
# currently requires both codes and values to be factors before calling the function | |
# currently this function only exists through data.table (if data.table isn't laoded, this function isn't supported) | |
# need to also ensure that different types of missing/incomplete values are handled (what if "" and NA are both present?) | |
fill_matching_values <- function(data, code_col, value_col, backfill = FALSE, drop.codes = FALSE, drop.values = FALSE, drop.missing.levels = FALSE, NA_code = NA_of_same_type(data[[code_col]]), NA_value = NA_of_same_type(data[[value_col]]), assume.exclusive = FALSE, code_exceptions = NULL, value_exceptions = NULL) { | |
NA_index_value <- NA_index(data[[value_col]], NA_value) | |
if (is_empty(NA_index_value)) { | |
code_to_values <- eval(parse(text = paste0("data[, .('codes' = ", code_col, ", 'values' = ", value_col, ")]"))) | |
} else { | |
code_to_values <- eval(parse(text = paste0("data[as.integer(", value_col, ") != ", as.character(NA_index_value), "L, .('codes' = ", code_col, ", 'values' = ", value_col, ")]"))) | |
} | |
NA_index_code <- NA_index(code_to_values[["codes"]], NA_code) | |
if (is_empty(NA_index_code)) { | |
code_to_values <- unique(code_to_values) | |
} else { | |
code_to_values <- unique(code_to_values[as.integer(codes) != NA_index_code, ]) | |
} | |
if (!is.null(code_exceptions)) { | |
code_to_values <- code_to_values[codes %!in% code_exceptions, ] | |
} | |
code_to_values <- code_to_values[, .(values, 'GRPN' = .N), keyby = codes] | |
indeterminate_codes <- code_to_values[GRPN > 1L, as.character(unique(codes))] | |
if (!is_empty(indeterminate_codes)) { | |
if (assume.exclusive) { | |
if (is_empty(NA_index_value)) { | |
for (a_code in indeterminate_codes) { | |
present_levels <- mode_and_others_factor(eval(parse(text = paste0("data[", code_col, " == a_code, ", value_col, "]")))) | |
replace_levels(code_to_values[["values"]], from = present_levels[["others"]], to = present_levels[["mode"]]) | |
} | |
} else { | |
for (a_code in indeterminate_codes) { | |
present_levels <- mode_and_others_factor(eval(parse(text = paste0("data[", code_col, " == a_code & as.integer(", value_col, ") != ", as.character(NA_index_value), "L, ", value_col, "]")))) | |
replace_levels(code_to_values[["values"]], from = present_levels[["others"]], to = present_levels[["mode"]]) | |
} | |
} | |
} else { | |
if (is_empty(NA_index_value)) { | |
for (a_code in indeterminate_codes) { | |
present_levels <- mode_and_others_factor(eval(parse(text = paste0("data[", code_col, " == a_code, ", value_col, "]")))) | |
set(code_to_values, i = which(code_to_values[["codes"]] == a_code), j = "values", present_levels[["mode"]]) | |
} | |
} else { | |
for (a_code in indeterminate_codes) { | |
present_levels <- mode_and_others_factor(eval(parse(text = paste0("data[", code_col, " == a_code & as.integer(", value_col, ") != ", as.character(NA_index_value), "L, ", value_col, "]")))) | |
set(code_to_values, i = which(code_to_values[["codes"]] == a_code), j = "values", present_levels[["mode"]]) | |
} | |
} | |
} | |
code_to_values <- unique(code_to_values[, .(codes, values)]) | |
} else { | |
code_to_values <- code_to_values[, .(codes, values)] | |
} | |
verified_codes <- as.character(code_to_values[["codes"]]) | |
NA_index_code <- NA_index(data[[code_col]], NA_code) | |
if (is_empty(NA_index_code)) { | |
all_codes <- levels(data[[code_col]]) | |
} else { | |
if (is.na(NA_code)) { | |
all_codes <- as.character(unique(data[[code_col]] %[]% (as.integer(.) != NA_index_code))) | |
} else { | |
all_codes <- levels(data[[code_col]]) %[!=]% "" | |
} | |
} | |
if (!is.null(code_exceptions)) { | |
all_codes <- all_codes %which!in% code_exceptions | |
} | |
unverified_codes <- all_codes %d% verified_codes | |
if (is_empty(unverified_codes)) { | |
code_to_values_for_join <- code_to_values | |
} else { | |
if (drop.codes) { | |
drop_levels(data[[code_col]], drop = unverified_codes, to = NA_code) | |
code_to_values_for_join <- rbindlist(list(code_to_values, data.table(codes = NA_code, values = NA_value))) | |
} else { | |
code_to_values_for_join <- rbindlist(list(code_to_values, data.table(codes = unverified_codes, values = NA_value))) | |
} | |
} | |
joined_data <- eval(parse(text = paste0("code_to_values_for_join[data, .(values), on = c('codes' = '", code_col, "')]"))) | |
NA_index_code <- NA_index(data[[code_col]], NA_code) | |
if (is_empty(NA_index_code)) { | |
if (is.null(code_exceptions)) { | |
set(data, j = value_col, value = joined_data[["values"]]) | |
} else { | |
slicer <- data[[code_col]] %!in% code_exceptions | |
set(data, i = which(slicer), j = value_col, value = joined_data[["values"]][slicer]) | |
} | |
} else { | |
if (is.null(code_exceptions)) { | |
slicer <- as.integer(data[[code_col]]) != NA_index_code | |
} else { | |
slicer <- as.integer(data[[code_col]]) != NA_index_code & data[[code_col]] %!in% code_exceptions | |
} | |
set(data, i = which(slicer), j = value_col, value = joined_data[["values"]][slicer]) | |
} | |
if (drop.missing.levels) { | |
drop_missing_levels(data[[value_col]], to = NA_value) | |
} | |
NA_index_code <- NA_index(data[[code_col]], NA_code) | |
if (is_empty(NA_index_code)) { | |
value_to_codes <- eval(parse(text = paste0("data[, .('values' = ", value_col, ", 'codes' = ", code_col, ")]"))) | |
} else { | |
value_to_codes <- eval(parse(text = paste0("data[as.integer(", code_col, ") != ", as.character(NA_index_code), "L, .('values' = ", value_col, ", 'codes' = ", code_col, ")]"))) | |
} | |
NA_index_value <- NA_index(value_to_codes[["values"]], NA_value) | |
if (is_empty(NA_index_value)) { | |
value_to_codes <- unique(value_to_codes) | |
} else { | |
value_to_codes <- unique(value_to_codes[as.integer(values) != NA_index_value, ]) | |
} | |
if (!is.null(value_exceptions)) { | |
value_to_codes <- value_to_codes[values %!in% value_exceptions, ] | |
} | |
value_to_codes <- value_to_codes[, .(codes, 'GRPN' = .N), keyby = values] | |
indeterminate_values <- value_to_codes[GRPN > 1L, as.character(unique(values))] | |
if (!is_empty(indeterminate_values)) { | |
if (assume.exclusive) { | |
if (is_empty(NA_index_code)) { | |
for (a_value in indeterminate_values) { | |
present_levels <- mode_and_others_factor(eval(parse(text = paste0("data[", value_col, " == a_value, ", code_col, "]")))) | |
replace_levels(value_to_codes[["codes"]], from = present_levels[["others"]], to = present_levels[["mode"]]) | |
} | |
} else { | |
for (a_value in indeterminate_values) { | |
present_levels <- mode_and_others_factor(eval(parse(text = paste0("data[", value_col, " == a_value & as.integer(", code_col, ") != ", as.character(NA_index_code), "L, ", code_col, "]")))) | |
replace_levels(value_to_codes[["codes"]], from = present_levels[["others"]], to = present_levels[["mode"]]) | |
} | |
} | |
} else { | |
if (is_empty(NA_index_code)) { | |
for (a_value in indeterminate_values) { | |
present_levels <- mode_and_others_factor(eval(parse(text = paste0("data[", value_col, " == a_value, ", code_col, "]")))) | |
set(value_to_codes, i = which(value_to_codes[["values"]] == a_value), j = "codes", present_levels[["mode"]]) | |
} | |
} else { | |
for (a_value in indeterminate_values) { | |
present_levels <- mode_and_others_factor(eval(parse(text = paste0("data[", value_col, " == a_value & as.integer(", code_col, ") != ", as.character(NA_index_code), "L, ", code_col, "]")))) | |
set(value_to_codes, i = which(value_to_codes[["values"]] == a_value), j = "codes", present_levels[["mode"]]) | |
} | |
} | |
} | |
value_to_codes <- unique(value_to_codes[, .(values, codes)]) | |
} else { | |
value_to_codes <- value_to_codes[, .(values, codes)] | |
} | |
verified_values <- as.character(value_to_codes[["values"]]) | |
NA_index_value <- NA_index(data[[value_col]], NA_value) | |
if (is_empty(NA_index_value)) { | |
all_values <- levels(data[[value_col]]) | |
} else { | |
if (is.na(NA_value)) { | |
all_values <- as.character(unique(data[[value_col]] %[]% (as.integer(.) != NA_index_value))) | |
} else { | |
all_values <- levels(data[[value_col]]) %[!=]% "" | |
} | |
} | |
if (!is.null(value_exceptions)) { | |
all_values <- all_values %which!in% value_exceptions | |
} | |
unverified_values <- all_values %d% verified_values | |
if (is_empty(unverified_values)) { | |
value_to_codes_for_join <- value_to_codes | |
} else { | |
if (drop.values) { | |
drop_levels(data[[value_col]], drop = unverified_values, to = NA_value) | |
value_to_codes_for_join <- rbindlist(list(value_to_codes, data.table(values = NA_value, codes = NA_code))) | |
} else { | |
value_to_codes_for_join <- rbindlist(list(value_to_codes, data.table(values = unverified_values, codes = NA_code))) | |
} | |
} | |
if (backfill) { | |
joined_data <- eval(parse(text = paste0("value_to_codes_for_join[data, .(codes), on = c('values' = '", value_col, "')]"))) | |
NA_index_value <- NA_index(data[[value_col]], NA_value) | |
if (is_empty(NA_index_value)) { | |
if (is.null(value_exceptions)) { | |
set(data, j = code_col, value = joined_data[["codes"]]) | |
} else { | |
slicer <- data[[value_col]] %!in% value_exceptions | |
} | |
} else { | |
if (is.null(value_exceptions)) { | |
slicer <- as.integer(data[[value_col]]) != NA_index_value | |
} else { | |
slicer <- as.integer(data[[value_col]]) != NA_index_value & data[[value_col]] %!in% value_exceptions | |
} | |
set(data, i = which(slicer), j = code_col, value = joined_data[["codes"]][slicer]) | |
} | |
if (drop.missing.levels) { | |
drop_missing_levels(data[[code_col]], to = NA_code) | |
} | |
} | |
invisible(data) | |
} | |
# correct_mistakes <- function() {#*** haven't implemented this yet as I'm not sure I'll need it | |
# # do fill_matching_values()-like stuff but correct based on other columns (useful for me here as I'll be correcting City and Country based on latitude and longitude) | |
# # get code_to_values (here observation 1 and observation 2 or obseration and corollary) | |
# # n-dimensional (easy case 2-dimensional like I have) corresponding values that can be used (distance deviation from median-wise) to determine whether two corollaries of an observation are different and should be different or not *and/or* whether two observations are labeled differently but should be labeled the same | |
# } | |
} | |
### Repeat by column | |
by_col <- function(data, col_func, cols = colnames(keep(data, is.factor)), ...) { | |
for (col in cols) { | |
col_func(data[[col]], ...) | |
} | |
invisible(data) | |
} | |
fix_NAs_by_col <- function(data, NA_level = "<NA>", cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
if (anyNA(data[[col]])) { | |
data[[col]] %>% add_levels(NA_level) | |
set(data, i = which(is.na(data[[col]])), j = col, NA_level) | |
} | |
} | |
invisible(data) | |
} | |
format_levels_by_col <- function(data, func, cols = colnames(keep(data, is.factor)), ...) { | |
for (col in cols) { | |
format_levels(data[[col]], func, ...) | |
} | |
invisible(data) | |
} | |
format_similar_levels_by_col <- function(data, pairings, exact = FALSE, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
format_similar_levels(data[[col]], pairings) | |
} | |
invisible(data) | |
} | |
replace_level_by_col <- function(data, from, to, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
replace_level(data[[col]], from, to) | |
} | |
invisible(data) | |
} | |
replace_levels_by_col <- function(data, from, to, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
replace_levels(data[[col]], from, to) | |
} | |
invisible(data) | |
} | |
rename_levels_by_col <- function(data, changes, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
rename_levels(data[[col]], changes) | |
} | |
invisible(data) | |
} | |
rename_similar_levels_by_col <- function(data, changes, exact = FALSE, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
rename_similar_levels(data[[col]], changes, exact) | |
} | |
invisible(data) | |
} | |
add_levels_by_col <- function(data, add, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
add_levels(data[[col]], add) | |
} | |
invisible(data) | |
} | |
drop_levels_by_col <- function(data, drop, to = "", cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
drop_levels(data[[col]], drop, to) | |
} | |
invisible(data) | |
} | |
drop_similar_levels_by_col <- function(data, drop, to = "", exact = FALSE, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
drop_similar_levels(data[[col]], drop, to, exact) | |
} | |
invisible(data) | |
} | |
drop_missing_levels_by_col <- function(data, to = "", cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
drop_missing_levels(data[[col]], to) | |
} | |
invisible(data) | |
} | |
drop_levels_formula_by_col <- function(data, expr, to = "", cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
drop_levels_formula(data[[col]], expr, to) | |
} | |
invisible(data) | |
} | |
keep_levels_by_col <- function(data, keep, to = "", cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
keep_levels(data[[col]], keep, to) | |
} | |
invisible(data) | |
} | |
keep_similar_levels_by_col <- function(data, keep, to = "", exact = FALSE, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
keep_similar_levels(data[[col]], keep, to, exact) | |
} | |
invisible(data) | |
} | |
reduce_levels_by_col <- function(data, rules, other = "other", exact = FALSE, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
reduce_levels(data[[col]], rules, other, exact) | |
} | |
invisible(data) | |
} | |
otherize_levels_rank_by_col <- function(data, rank, other = "other", exact = FALSE, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
otherize_levels_rank(data[[col]], rank, other, exact) | |
} | |
invisible(data) | |
} | |
otherize_levels_prop_by_col <- function(data, cutoff, other = "other", exact = FALSE, cols = colnames(keep(data, is.factor))) { | |
for (col in cols) { | |
otherize_levels_prop(data[[col]], cutoff, other, exact) | |
} | |
invisible(data) | |
} | |
fill_matching_values_by_col <- function(data, code_cols, value_cols, backfill = FALSE, drop.codes = FALSE, drop.values = FALSE, drop.missing.levels = FALSE, NA_code = NA_of_same_type_by_col(select(data, code_cols)), NA_value = NA_of_same_type_by_col(select(data, value_cols)), assume.exclusive = FALSE) { | |
assert_that(length(value_cols) == length(code_cols), | |
msg = "There is a different number of value and code columns") | |
num_cols <- length(value_cols) | |
drop.codes <- recycle_arguments(drop.codes, num_cols) | |
drop.values <- recycle_arguments(drop.values, num_cols) | |
drop.missing.levels <- recycle_arguments(drop.missing.levels, num_cols) | |
backfill <- recycle_arguments(backfill, num_cols) | |
NA_value <- recycle_arguments(NA_value, num_cols) | |
NA_code <- recycle_arguments(NA_code, num_cols) | |
assume.exclusive <- recycle_arguments(assume.exclusive, num_cols) | |
for (i in seq_along(value_cols)) { | |
fill_matching_values(data, code_col = code_cols[[i]], value_col = value_cols[[i]], backfill = backfill[[i]], drop.codes = drop.codes[[i]], drop.values = drop.values[[i]], drop.missing.levels = drop.missing.levels[[i]], NA_value = NA_value[[i]], NA_code = NA_code[[i]], assume.exclusive = assume.exclusive[[i]]) | |
} | |
invisible(data) | |
} |