Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve validation step info to string for hashing #511

Merged
merged 19 commits into from
Dec 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

* When no columns are returned from a `{tidyselect}` expression in `columns`, the agent's report now displays the originally supplied *expression* instead of simply blank (e.g., in `create_agent(small_table) %>% col_vals_null(matches("z"))`).

* Fixes issue with the hashing implementation to improve performance and alignment of validation steps in the multiagent.

# pointblank 0.11.4

* Fixes issue with gt `0.9.0` compatibility.
Expand Down
57 changes: 51 additions & 6 deletions R/create_multiagent.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,18 +160,16 @@ create_multiagent <- function(
) {

agent_list <- list(...)

if (!all(sapply(agent_list, is_ptblank_agent))) {
rlang::abort("All components of `...` must be an agent")
}
agent_list <- rehash_agent_list(agent_list)
agent_list <-
lapply(
agent_list,
FUN = function(agent) {

# TODO: Ensure that each `agent` in `agent_list` is
# actually an agent with `is_ptblank_agent()`

class(agent) <-
c(setdiff(class(agent), "ptblank_agent"), "ptblank_agent_i")

agent
}
)
Expand All @@ -185,3 +183,50 @@ create_multiagent <- function(
class(agent_series) <- "ptblank_multiagent"
agent_series
}

# Decide whether the agents in `agent_list` need their validation step hashes
# recomputed, and recompute them if so.
#
# Step hashes produced by `hash_validation_step()` have the form
# `<digest>-<version>`. A hash without a `-<version>` suffix is treated as
# having the version `""`.
#
# @param agent_list A list of agents; each is expected to carry a
#   `validation_set` table with a `sha1` column.
# @return `agent_list` with every agent passed through `rehash_agent()` when
#   any agent uses a hash version listed in `to_rehash` or when more than one
#   hash version is present; otherwise `agent_list` unchanged.
rehash_agent_list <- function(agent_list) {

  hash_versions <- lapply(agent_list, function(x) {
    # Strip the digest and its `-` separator, leaving only the version suffix
    # (or "" when there is none). The earlier pattern `^.*(-|$)` could not be
    # used here: greedy `.*` followed by `$` matches the entire string, so
    # every hash was reported as unversioned.
    sub("^[^-]*-?", "", x$validation_set$sha1)
  })
  hash_versions <- unique(unlist(hash_versions))

  # agents using any of these hash versions are rehashed
  to_rehash <- c("")

  if (any(to_rehash %in% hash_versions) || length(hash_versions) > 1) {
    lapply(agent_list, rehash_agent)
  } else {
    agent_list
  }

}

# Recompute the step hashes of a single agent that were not produced by the
# current hash implementation.
#
# A step is considered stale when its `sha1` value is missing or when the
# version suffix after the `-` separator differs from `get_hash_version()`.
# Stale steps are rehashed with `hash_validation_step()`; all other hashes are
# kept as they are.
#
# @param agent An agent with a `validation_set` table holding a `sha1` column
#   and the step fields passed to `hash_validation_step()`.
# @return `agent` with `validation_set$sha1` updated.
rehash_agent <- function(agent) {

  cur_hash_version <- get_hash_version()
  vs <- agent$validation_set
  hashes <- vs$sha1

  # Version suffix of each hash ("" when there is none). The earlier pattern
  # `^.*(-|$)` always matched the whole string, which made every step look
  # stale and forced a full rehash on every call.
  hash_versions <- sub("^[^-]*-?", "", hashes)
  stale <- which(is.na(hashes) | hash_versions != cur_hash_version)

  # Updating `hashes` in place (rather than collecting with `sapply()`) keeps
  # the column a character vector even when there are no steps at all
  for (i in stale) {
    step <- vs[i, ]
    # Rehash from validation set, extracting from list-column where necessary
    hashes[[i]] <- hash_validation_step(
      assertion_type = step$assertion_type,
      column = step$column[[1]],
      values = step$values[[1]],
      na_pass = step$na_pass,
      preconditions = step$preconditions[[1]],
      seg_col = step$seg_col,
      seg_val = step$seg_val
    )
  }

  agent$validation_set$sha1 <- hashes
  agent

}
78 changes: 57 additions & 21 deletions R/steps_and_briefs.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,27 +43,15 @@ create_validation_step <- function(
i <- get_next_validation_set_row(agent)

# Calculate the SHA1 hash for the validation step
sha1 <-
digest::sha1(
list(
assertion_type = assertion_type,
column = ifelse(is.null(column), list(NULL), list(column)),
values = ifelse(
is.null(values) || is_a_table_object(values),
list(NULL), list(values)
),
na_pass = ifelse(is.null(na_pass), NA, as.logical(na_pass)),
preconditions = ifelse(
is.null(preconditions), list(NULL), list(preconditions)
),
seg_col = ifelse(
is.null(seg_col), NA_character_, as.character(seg_col)
),
seg_val = ifelse(
is.null(seg_val), NA_character_, as.character(seg_val)
)
)
)
sha1 <- hash_validation_step(
assertion_type = assertion_type,
column = column,
values = values,
na_pass = na_pass,
preconditions = preconditions,
seg_col = seg_col,
seg_val = seg_val
)

# Create a validation step as a single-row `tbl_df` object
validation_step_df <-
Expand Down Expand Up @@ -129,6 +117,54 @@ create_validation_step <- function(
agent
}

# Version tag of the current validation step hash implementation.
#
# @return A length-one character vector.
get_hash_version <- function() {
  hash_version <- "v0.12"
  hash_version
}

# Compute the versioned hash that identifies a validation step.
#
# Every field of the step is first resolved to character form, the resulting
# named character vector is hashed with `digest::sha1()`, and the value of
# `get_hash_version()` is appended after a `-` separator.
#
# @param assertion_type The assertion type of the step.
# @param column,na_pass,seg_col,seg_val Optional step fields; `NULL` is
#   recorded as a missing string, anything else is passed through
#   `as.character()`.
# @param values Optional step values. `NULL` and table objects are recorded
#   as a missing string, lists are deparsed element by element and joined
#   with `toString()`, and anything else is deparsed as a whole.
# @param preconditions Optional preconditions. For an object inheriting from
#   "fseq", the `_function_list` entry of its environment is deparsed in
#   place of the object itself.
# @return A single string of the form `<sha1>-<hash version>`.
hash_validation_step <- function(assertion_type,
                                 column = NULL,
                                 values = NULL,
                                 na_pass = NULL,
                                 preconditions = NULL,
                                 seg_col = NULL,
                                 seg_val = NULL) {

  # `NULL` becomes a missing string; everything else is coerced to character
  chr_or_na <- function(x) {
    as.character(x %||% NA_character_)
  }

  values_chr <-
    if (is.null(values) || is_a_table_object(values)) {
      NA_character_
    } else if (is.list(values)) {
      # Resolve `vars()` to scalar string
      parts <- vapply(values, deparse_expr, character(1))
      toString(parts)
    } else {
      deparse_expr(values)
    }

  # Spell out components of magrittr anonymous function
  precond_src <- preconditions
  if (inherits(preconditions, "fseq")) {
    precond_src <- environment(preconditions)[["_function_list"]]
  }
  preconditions_chr <- deparse_expr(precond_src)

  step_chr <- c(
    assertion_type = assertion_type,
    column = chr_or_na(column),
    values = values_chr,
    na_pass = chr_or_na(na_pass),
    preconditions = preconditions_chr,
    seg_col = chr_or_na(seg_col),
    seg_val = chr_or_na(seg_val)
  )

  # Tag the digest with the version of the hash implementation that made it
  paste(digest::sha1(step_chr), get_hash_version(), sep = "-")

}

apply_preconditions_to_tbl <- function(agent, idx, tbl) {

preconditions <- agent$validation_set$preconditions[[idx]]
Expand Down
9 changes: 9 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -1715,3 +1715,12 @@ pb_get_image_tag <- function(file, dir = "images") {
"style=\"width:100\\%;\">"
)
}

# Turn a value or expression into a single string.
#
# @param expr The object to convert.
# @param collapse Separator used to join the lines returned by `deparse()`.
# @param ... Further arguments forwarded to `deparse()`.
# @return For a scalar atomic `expr`, the result of `as.character(expr)`;
#   otherwise the deparsed code joined by `collapse` and prefixed with
#   "<expr> ".
deparse_expr <- function(expr, collapse = " ", ...) {
  # Scalar atomic values are converted directly, without the "<expr>" marker
  if (rlang::is_scalar_atomic(expr)) {
    return(as.character(expr))
  }

  code_lines <- deparse(expr, ...)
  paste("<expr>", paste(code_lines, collapse = collapse))
}
7 changes: 0 additions & 7 deletions tests/testthat/_snaps/draft_validation.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,6 @@
Output
[1] "library(pointblank)\n\nagent <-\n create_agent(\n tbl = ~ tbl,\n actions = action_levels(\n warn_at = 0.05,\n stop_at = 0.10\n ),\n tbl_name = \"tbl\",\n label = \"Validation plan generated by `draft_validation()`.\"\n ) %>%\n # Expect that column `name` is of type: character\n col_is_character(\n columns = c(\"name\")\n ) %>%\n # Expect that column `height` is of type: integer\n col_is_integer(\n columns = c(\"height\")\n ) %>%\n # Expect that values in `height` should be between `66` and `264`\n col_vals_between(\n columns = c(\"height\"),\n left = 66,\n right = 264,\n na_pass = TRUE\n ) %>%\n # Expect that column `mass` is of type: numeric\n col_is_numeric(\n columns = c(\"mass\")\n ) %>%\n # Expect that values in `mass` should be between `15` and `1358`\n col_vals_between(\n columns = c(\"mass\"),\n left = 15,\n right = 1358,\n na_pass = TRUE\n ) %>%\n # Expect that column `hair_color` is of type: character\n col_is_character(\n columns = c(\"hair_color\")\n ) %>%\n # Expect that column `skin_color` is of type: character\n col_is_character(\n columns = c(\"skin_color\")\n ) %>%\n # Expect that column `eye_color` is of type: character\n col_is_character(\n columns = c(\"eye_color\")\n ) %>%\n # Expect that column `birth_year` is of type: numeric\n col_is_numeric(\n columns = c(\"birth_year\")\n ) %>%\n # Expect that values in `birth_year` should be between `8` and `896`\n col_vals_between(\n columns = c(\"birth_year\"),\n left = 8,\n right = 896,\n na_pass = TRUE\n ) %>%\n # Expect that column `sex` is of type: character\n col_is_character(\n columns = c(\"sex\")\n ) %>%\n # Expect that column `gender` is of type: character\n col_is_character(\n columns = c(\"gender\")\n ) %>%\n # Expect that column `homeworld` is of type: character\n col_is_character(\n columns = c(\"homeworld\")\n ) %>%\n # Expect that column `species` is of type: character\n col_is_character(\n columns = c(\"species\")\n ) %>%\n # Expect entirely distinct rows across `name, height, mass, 
hair_color, skin_color, eye_color, birth_year, sex, gender, homeworld, species, films, vehicles, starships`\n rows_distinct(\n columns = c(\"name, height, mass, hair_color, skin_color, eye_color, birth_year, sex, gender, homeworld, species, films, vehicles, starships\")\n ) %>%\n # Expect that column schemas match\n col_schema_match(\n schema = col_schema(\n name = \"character\",\n height = \"integer\",\n mass = \"numeric\",\n hair_color = \"character\",\n skin_color = \"character\",\n eye_color = \"character\",\n birth_year = \"numeric\",\n sex = \"character\",\n gender = \"character\",\n homeworld = \"character\",\n species = \"character\",\n films = \"list\",\n vehicles = \"list\",\n starships = \"list\"\n )\n ) %>%\n interrogate()\n\nagent"

---

Code
readLines(con = path) %>% paste0(collapse = "\n")
Output
[1] "library(pointblank)\n\nagent <-\n create_agent(\n tbl = ~ tbl,\n actions = action_levels(\n warn_at = 0.05,\n stop_at = 0.10\n ),\n tbl_name = \"tbl\",\n label = \"Validation plan generated by `draft_validation()`.\"\n ) %>%\n # Expect that column `name` is of type: character\n col_is_character(\n columns = c(\"name\")\n ) %>%\n # Expect that column `year` is of type: numeric\n col_is_numeric(\n columns = c(\"year\")\n ) %>%\n # Expect that values in `year` should be between `1975` and `2021`\n col_vals_between(\n columns = c(\"year\"),\n left = 1975,\n right = 2021\n ) %>%\n # Expect that column `month` is of type: numeric\n col_is_numeric(\n columns = c(\"month\")\n ) %>%\n # Expect that values in `month` should be between `1` and `12`\n col_vals_between(\n columns = c(\"month\"),\n left = 1,\n right = 12\n ) %>%\n # Expect that column `day` is of type: integer\n col_is_integer(\n columns = c(\"day\")\n ) %>%\n # Expect that values in `day` should be between `1` and `31`\n col_vals_between(\n columns = c(\"day\"),\n left = 1,\n right = 31\n ) %>%\n # Expect that column `hour` is of type: numeric\n col_is_numeric(\n columns = c(\"hour\")\n ) %>%\n # Expect that values in `hour` should be between `0` and `23`\n col_vals_between(\n columns = c(\"hour\"),\n left = 0,\n right = 23\n ) %>%\n # Expect that column `lat` is of type: numeric\n col_is_numeric(\n columns = c(\"lat\")\n ) %>%\n # Expect that values in `lat` should be between `-90` and `90`\n col_vals_between(\n columns = c(\"lat\"),\n left = -90,\n right = 90\n ) %>%\n # Expect that column `long` is of type: numeric\n col_is_numeric(\n columns = c(\"long\")\n ) %>%\n # Expect that values in `long` should be between `-180` and `180`\n col_vals_between(\n columns = c(\"long\"),\n left = -180,\n right = 180\n ) %>%\n # Expect that column `status` is of type: factor\n col_is_factor(\n columns = c(\"status\")\n ) %>%\n # Expect that column `category` is of type: numeric\n col_is_numeric(\n columns = 
c(\"category\")\n ) %>%\n # Expect that values in `category` should be between `1` and `5`\n col_vals_between(\n columns = c(\"category\"),\n left = 1,\n right = 5,\n na_pass = TRUE\n ) %>%\n # Expect that column `wind` is of type: integer\n col_is_integer(\n columns = c(\"wind\")\n ) %>%\n # Expect that values in `wind` should be between `10` and `165`\n col_vals_between(\n columns = c(\"wind\"),\n left = 10,\n right = 165\n ) %>%\n # Expect that column `pressure` is of type: integer\n col_is_integer(\n columns = c(\"pressure\")\n ) %>%\n # Expect that values in `pressure` should be between `882` and `1024`\n col_vals_between(\n columns = c(\"pressure\"),\n left = 882,\n right = 1024\n ) %>%\n # Expect that column `tropicalstorm_force_diameter` is of type: integer\n col_is_integer(\n columns = c(\"tropicalstorm_force_diameter\")\n ) %>%\n # Expect that values in `tropicalstorm_force_diameter` should be between `0` and `1440`\n col_vals_between(\n columns = c(\"tropicalstorm_force_diameter\"),\n left = 0,\n right = 1440,\n na_pass = TRUE\n ) %>%\n # Expect that column `hurricane_force_diameter` is of type: integer\n col_is_integer(\n columns = c(\"hurricane_force_diameter\")\n ) %>%\n # Expect that values in `hurricane_force_diameter` should be between `0` and `300`\n col_vals_between(\n columns = c(\"hurricane_force_diameter\"),\n left = 0,\n right = 300,\n na_pass = TRUE\n ) %>%\n # Expect that column schemas match\n col_schema_match(\n schema = col_schema(\n name = \"character\",\n year = \"numeric\",\n month = \"numeric\",\n day = \"integer\",\n hour = \"numeric\",\n lat = \"numeric\",\n long = \"numeric\",\n status = \"factor\",\n category = \"numeric\",\n wind = \"integer\",\n pressure = \"integer\",\n tropicalstorm_force_diameter = \"integer\",\n hurricane_force_diameter = \"integer\"\n )\n ) %>%\n interrogate()\n\nagent"

---

Code
Expand Down
Loading
Loading