Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect alignment #537

Merged
merged 17 commits into from Aug 9, 2019

second round: alignment detection with FIXME.

  • Loading branch information...
lorenzwalthert committed Aug 4, 2019
commit 8857bc8d7121322fa50bf0be6cedd7e30a42bc05
@@ -1,63 +1,134 @@
#' Check if tokens are aligned
#'
#' @details
#' A line is called aligned if the following conditions hold:
#'
#' * argument name and = have the same line1 (i.e. the start of a token) for
#' multiple lines for the first column.
#' * spacing around comma is correct (none before, at least one after).
#' * and at least one space around =.
#' @importFrom purrr map compact reduce map_lgl
#' * lag spaces of column 1 must agree.
#' * spacing around comma (0 before, > 1 after) and spacing around `=` (at least
#' one around).
#' * all positions of commas of col > 2 must agree (needs recursive creation of
#' `text`).
#'
#' Because of the last requirement, this function is very expensive to run. For
#' this reason, the following approach is taken:
#'
#' * Only invoke the function when certain that allignment is possible.
#' * Check the cheap conditions first.
#' * For the recursive creation of text, greedily check column by column to make
#' sure we can stop as soon as we found that columns are not aligned.
#'
#' @importFrom purrr map compact reduce map_lgl map_int
#' @importFrom rlang seq2
#' @importFrom magrittr not
token_is_on_alligned_line <- function(pd_flat, op_before) {

# works if there are no lag newlies hidden in children!
line_idx <- 1 + cumsum(pd_flat$lag_newlines)
pd_flat$.lag_spaces <- lag(pd_flat$spaces)
pd_by_line <- split(pd_flat, line_idx)
lag_spaces_col_1 <- map_int(pd_by_line, ~ .x$.lag_spaces[1])
relevant_idx <- seq2(2, length(lag_spaces_col_1) - 1)
relevant_lag_spaces_col_1 <- lag_spaces_col_1[relevant_idx]
col1_is_aligned <- length(unique(relevant_lag_spaces_col_1)) == 1
if (!col1_is_aligned) {
return(FALSE)
}
has_correct_spacing_around_comma <- purrr::map_lgl(
pd_by_line, has_correct_spacing_around_comma
)
if (!all(has_correct_spacing_around_comma)) {
return(FALSE)
}

has_correct_spacing_around_eq_sub <- purrr::map_lgl(
pd_by_line, has_correct_spacing_around_eq_sub
)

line1_first_arg_name_and_eq_assign <- pd_by_line %>%
map(line1_arg_name_and_eq_sub)
eq_sub_line1 <- line1_first_arg_name_and_eq_assign %>%
compact()
if (!all(has_correct_spacing_around_eq_sub)) {
return(FALSE)
}
starting_with_comma <- purrr::map_lgl(pd_by_line, ~ .x$token[1] == "','")
if (any(starting_with_comma)) {
return(FALSE)
}
# most expensive check in the end.
loop_upper <- max(purrr::map_int(pd_by_line, ~ sum(.x$token == "','")))
col1_all_named <- purrr::map_lgl(pd_by_line[relevant_idx],
~ .x$token[c(1, 3)] == c("SYMBOL_SUB", "expr") &&
.x$token[2] %in% c("EQ_SUB", "SPECIAL-IN", "LT", "GT", "EQ", "NE")
) %>%
all()
for (column in seq2(ifelse(col1_all_named, 1, 2), loop_upper)) {
# check column by column since it is very expensive
is_last <- column == loop_upper
char_len <- purrr::map(pd_by_line[relevant_idx],
serialize_lines, column = column,
is_last_idx = is_last
) %>%
purrr::compact() %>%
unlist() %>%
trimws(which = "right") %>%
nchar()
# TODO consistently use last()
if (is_last && last(last(pd_by_line[relevant_idx])$token) != "','") {
# last column does often not have a comma.
char_len[length(char_len)] <- char_len[length(char_len)] +1
}
is_aligned <- length(unique(char_len)) == 1

if (length(eq_sub_line1) < 2) {
# need at least two lines with eq_sub
return(rep(FALSE, length = nrow(pd_flat)))
if (!is_aligned) {
return(FALSE)
}
# for column = 2, this includes text up to the second column and space after it,
# so it should be the same length.
# potentially remove trainling space and compare length.
}
TRUE
}

arg_name_and_eq_sub_are_aligned <- line1_first_arg_name_and_eq_assign %>%
map_lgl(~identical(.x, eq_sub_line1[[1]]))
serialize_lines <- function(pd,is_last_idx, column) {
# better also add lover bound for column. If you already checked up to comma 2,
# you don't need to re-construct text again, just check if text between comma 2
# and 3 has the same length.

if (sum(arg_name_and_eq_sub_are_aligned) < 2) {
# first is obviously aligned with itself, so it needs one more.
return(rep(FALSE, length = nrow(pd_flat)))
if (is_last_idx) {
relevant_comma <- nrow(pd)
} else {
comma_idx <- which(pd$token == "','")
relevant_comma <- comma_idx[column]
if (column > length(comma_idx)) {
return(NULL)
}
}
# Not sure if it's quicker to simply construct text from the nested pd and
# then nchar(.text) to validate instead of doing complicated checking.
# instead of also check

has_correct_spacing_around_comma <- pd_by_line %>%
map_lgl(has_correct_spacing_around_comma)
pd <- pd[seq2(1, nrow(pd)) < 1L + relevant_comma,]
serialize(pd)
}

has_correct_spacing_around_eq_sub <- pd_by_line %>%
map_lgl(has_correct_spacing_around_eq_sub)
# No new lines considered
serialize <- function(pd) {

pd_flat$.lag_spaces <- NULL
out <- Map(function(terminal, text, child, spaces) {

pd_by_line_aligned <- arg_name_and_eq_sub_are_aligned &
has_correct_spacing_around_comma &
has_correct_spacing_around_eq_sub
map2(pd_by_line, pd_by_line_aligned, ~ rep(.y, length = nrow(.x))) %>%
unlist()
if (terminal) {
return(paste0(text, rep_char(" ", spaces)))
} else {
return(paste0(serialize(child), rep_char(" ", spaces)))
}
}, pd$terminal, pd$text, pd$child, pd$spaces)
paste0(out, collapse = "")
}


#' At least one space after comma, none before, for all but the last comma on
#' the line
#' @param pd_sub The subset of a parse table corresponding to one line.
#' @importFrom rlang seq2
#' @keywords internal
has_correct_spacing_around_comma <- function(pd_sub) {
comma_tokens <- which(pd_sub$token == "','")
if (length(comma_tokens) == 0) return(TRUE)
if (length(comma_tokens) == 0) {
return(TRUE)
}
relevant_comma_token <- comma_tokens[seq2(1, length(comma_tokens) - 1L)]
correct_spaces_before <- pd_sub$.lag_spaces[relevant_comma_token] == 0
correct_spaces_after <- pd_sub$spaces[relevant_comma_token] > 0
@@ -68,21 +139,10 @@ has_correct_spacing_around_comma <- function(pd_sub) {
#' @keywords internal
#' @importFrom rlang seq2
has_correct_spacing_around_eq_sub <- function(pd_sub) {
eq_sub_token <- which(pd_sub$token == "EQ_SUB")
if (length(eq_sub_token) == 0) return(TRUE)
relevant_eq_sub_token <- eq_sub_token[seq2(1, length(eq_sub_token) - 1L)]
relevant_eq_sub_token <- which(pd_sub$token == "EQ_SUB")
if (length(relevant_eq_sub_token) == 0) return(TRUE)

correct_spaces_before <- pd_sub$.lag_spaces[relevant_eq_sub_token] >= 1
correct_spaces_after <- pd_sub$spaces[relevant_eq_sub_token] >= 1
all(correct_spaces_before) && all(correct_spaces_after)
}

#' Computes the `line1` attribute for the first two tokens of the of a sub
#' parse table if it contains an `EQ_SUB`.
line1_arg_name_and_eq_sub <- function(pd_sub) {
if (nrow(pd_sub) >= 2 && pd_sub$token[2] == "EQ_SUB") {
cumsum(lag(nchar(pd_sub$text[1:2]), default = 0)) +
cumsum(pd_sub$.lag_spaces[1:2])
} else {
NULL
}
}
@@ -1,33 +1,42 @@
#' @include token-define.R
#' @keywords internal
add_space_around_op <- function(pd_flat) {
op_after <- pd_flat$token %in% op_token
op_before <- lead(op_after, default = FALSE)
idx_before <- op_before & (pd_flat$newlines == 0L)
pd_flat$spaces[idx_before] <- pmax(pd_flat$spaces[idx_before], 1L)
idx_after <- op_after & (pd_flat$newlines == 0L)
pd_flat$spaces[idx_after] <- pmax(pd_flat$spaces[idx_after], 1L)
pd_flat
}

#' Set spaces around operators
#'
#' Alignement is kept, if detected.
#' @include token-define.R
#' @keywords internal
set_space_around_op <- function(pd_flat) {
#' @include token-define.R
set_space_around_op <- function(pd_flat, strict) {
# spacing and operator in same function because alternative is
# calling token_is_on_alligned_line() twice because comma and operator spacing
# depends on it.
pd_flat <- add_space_after_comma(pd_flat)
op_after <- pd_flat$token %in% op_token
op_before <- lead(op_after, default = FALSE)
# include comma, but only for after
op_after <- op_after | pd_flat$token == "','"
if (!any(op_after)) {
return(pd_flat)
}
op_before <- lead(op_after, default = FALSE)
if (any(pd_flat$token == "EQ_SUB")) {
if (sum(pd_flat$lag_newlines) > 2 &&
is_function_call(pd_flat) &&
any(pd_flat$token %in% c("EQ_SUB", "','"))
) {
is_on_alligned_line <- token_is_on_alligned_line(pd_flat, op_before)
} else {
is_on_alligned_line <- FALSE
}
pd_flat$spaces[op_before & (pd_flat$newlines == 0L) & !is_on_alligned_line] <- 1L
pd_flat$spaces[op_after & (pd_flat$newlines == 0L) & !is_on_alligned_line] <- 1L
# operator
must_have_space_before <- op_before & (pd_flat$newlines == 0L) & !is_on_alligned_line
pd_flat$spaces[must_have_space_before] <- if (strict) {
1L
} else {
pmax(pd_flat$spaces[must_have_space_before], 1L)
}
must_have_space_after <- op_after & (pd_flat$newlines == 0L) & !is_on_alligned_line
pd_flat$spaces[must_have_space_after] <- if (strict) {
1L
} else {
pmax(pd_flat$spaces[must_have_space_after], 1L)
}
pd_flat
}

@@ -85,16 +85,9 @@ tidyverse_style <- function(scope = "tokens",
style_space_around_tilde,
strict = strict
),
spacing_around_op = if (strict) {
set_space_around_op
} else {
add_space_around_op
},
spacing_around_comma = if (strict) {
set_space_after_comma
} else {
add_space_after_comma
},
spacing_around_op = purrr::partial(set_space_around_op,
strict = strict
),
remove_space_after_opening_paren,
remove_space_after_excl,
set_space_after_bang_bang,
@@ -4,6 +4,12 @@ call(
xy = 2, n = 33,
)

# without trailing comma
call(
x = 1, kdd = 2,
xy = 2, n = 33
)

# algorithm: aligned. human: aligned.
call(
x = 1, kdd = 2,
@@ -22,29 +28,19 @@ call(
xy =2, n = 33,
)

# algorithm: aligned (spacing around coma cannot be detected). human: aligned
# (fix: spacing around comma).
# TODO should detect redundant space before comma (not possible currently
# because this space is hidden in child nest. Would be possible when we would
# attempt to get position of all tokens on the line right away)
call(
x = 1, kdd = 2,
xy = 2 , n = 33,
)

# algorithm: aligned. human: not aligned.
# algorithm: not aligned. human: not aligned.
call(
x = 1, kdd = 2,
xy = 2, n = 33,
)

# algorithm: aligned. human: aligned.
# algorithm: not aligned. human: not aligned.
call(
x = 1, kdd = 2,
xy = 22, n = 33,
)

# algorithm: aligned. human: not aligned.
# algorithm: not aligned. human: not aligned.
call(
x = 1, d = 2,
xy = 22, n = 33,
@@ -57,13 +53,6 @@ call(
xy = 2, n = 33, z = "333"
)

# algorithm: aligned. human: aligned (fix spacing around =).
# FIXME need to check also text of argument value.
call(
x = 122, kdd = 2, k = "abc",
xy = 2, n = 33, z = "333"
)


# algorithm: aligned. human: aligned.
call(
@@ -72,9 +61,31 @@ call(
)

# algorithm: aligned. human: aligned.
# FIXME
call(
x = 1, n = 33, z = "333",

xy = 2,
)

# aligned. when spaces are spread accross different nests
call(
k = ff("pk"), k = 3,
b = f(-g), 22 + 1,
44, 323
)

# if all col1 arguments are named, col1 must also be aligned
# not aligned
fell(
x = 1,
y = 23,
zz = NULL
)

# aligned
fell(
x = 1,
y = 23,
zz = NULL
)
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.