/
dupree_classes.R
227 lines (194 loc) · 6.54 KB
/
dupree_classes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
###############################################################################
# Classes for `dupree`
###############################################################################
# Class definition: `EnumeratedCodeTable`
#' S4 container holding code blocks encoded as sequences of integers
#'
#' @name EnumeratedCodeTable-class
#' @slot blocks A tbl_df with columns `file`, `block`, `start_line` and
#'   `enumerated_code`
#'
methods::setClass(
  Class = "EnumeratedCodeTable",
  slots = c(blocks = "tbl_df")
)
###############################################################################
#' Validity check for an `EnumeratedCodeTable`
#'
#' Checks that the `blocks` slot of `object` has the columns `file`, `block`,
#' `start_line` and `enumerated_code`.
#'
#' @param object An object with a `blocks` slot that has column names.
#'
#' @return `TRUE` when every required column is present; otherwise a
#'   character vector with one message for each missing column.
#'
#' @noRd
#'
.is_enumerated_code_table <- function(object) {
  expected <- c("file", "block", "start_line", "enumerated_code")
  absent <- setdiff(expected, colnames(object@blocks))

  # Report every absent column, one message per column
  if (length(absent) > 0) {
    return(paste("Column", absent, "should be in object@blocks"))
  }

  TRUE
}
# Register `.is_enumerated_code_table` as the validity method for
# `EnumeratedCodeTable` objects
methods::setValidity(
  Class = "EnumeratedCodeTable",
  method = function(object) {
    .is_enumerated_code_table(object)
  }
)
###############################################################################
#' Construct an `EnumeratedCodeTable`
#'
#' The `blocks` slot holds one row per block of R code, with the columns
#' `file`, `block`, `start_line` and `enumerated_code`.
#'
#' When `blocks` is `NULL`, the slot is set to a zero-row table with those
#' four columns. Otherwise the supplied table is stored after sorting its rows
#' by `file` and then by `block`. The object is validated before it is
#' returned.
#'
#' @importFrom methods callNextMethod setMethod validObject
#' @importFrom tibble tibble
#'
#' @noRd
#'
methods::setMethod(
  "initialize",
  "EnumeratedCodeTable",
  function(.Object, blocks = NULL, ...) {
    .Object <- methods::callNextMethod(...)

    if (!is.null(blocks)) {
      # store the rows sorted by file, then by block within each file
      .Object@blocks <- dplyr::arrange(
        blocks, .data[["file"]], .data[["block"]]
      )
    } else {
      # no table supplied: fall back to an empty table with the right columns
      .Object@blocks <- tibble::tibble(
        file = character(0),
        block = integer(0),
        start_line = integer(0),
        enumerated_code = list()
      )
    }

    methods::validObject(.Object)
    .Object
  }
)
###############################################################################
# `find_best_matches`
###############################################################################
# By default we use `lcs` as the sequence-similarity measure
# - for two integer vectors, the lcs-distance is the minimum number of entries
# that need to be removed from both vectors before identity is reached
# - then the lcs-similarity score is 1 - distance / max_length; where
# max_length is the sum of the lengths of the two input vectors
# - d((1, 2, 3, 4), (1, 4, 5, 6)) = 4; s(..., ...) = 1 - 4 / 8
# - we use lcs because it's simple to explain
#' `find_best_matches` between code blocks
#'
#' S4 generic; the method that runs is selected by `standardGeneric`.
#'
#' @param x Object to search for matching code blocks.
#' @param ... Further arguments made available to the selected method.
#'
#' @noRd
#'
# The `nocov` markers presumably exclude the generic's definition from
# coverage reports -- confirm against the coverage tool in use.
# nocov start
methods::setGeneric("find_best_matches", function(x, ...) {
methods::standardGeneric("find_best_matches")
})
# nocov end
#' `find_best_matches` between code blocks in an `EnumeratedCodeTable`
#'
#' Looks up the best-matching pairs of code blocks via
#' `find_indexes_of_best_matches` and reports, for each pair, the file, block
#' id and start line of both blocks together with their similarity score.
#'
#' The code blocks are assumed to be ordered within the
#' `EnumeratedCodeTable`. When two code blocks are mutually-best-matches the
#' result contains a single row for that pair, and for such a row
#' `file_a` <= `file_b` and `block_a` <= `block_b`.
#'
#' @param x An `EnumeratedCodeTable`.
#' @param ... Passed on to `find_indexes_of_best_matches`.
#'
#' @return A tibble with columns `file_a`, `file_b`, `block_a`, `block_b`,
#'   `line_a`, `line_b` and `score`.
#'
#' @noRd
#'
methods::setMethod(
  "find_best_matches",
  methods::signature("EnumeratedCodeTable"),
  function(x, ...) {
    code_table <- x@blocks
    matches <- find_indexes_of_best_matches(code_table$enumerated_code, ...)

    # rows of the table describing the first / second member of each pair
    first <- code_table[matches$index_a, ]
    second <- code_table[matches$index_b, ]

    tibble::tibble(
      file_a = first$file,
      file_b = second$file,
      block_a = first$block,
      block_b = second$block,
      line_a = first$start_line,
      line_b = second$start_line,
      score = matches$score
    )
  }
)
###############################################################################
# Related Functions
###############################################################################
#' One against all search
#'
#' Scores one code block against every code block in `enum_codes` and picks
#' the highest-scoring block other than itself.
#'
#' @param subject_index Position in `enum_codes` of the block to compare.
#' @param enum_codes List of enumerated code blocks.
#' @param sim_func Function called as `sim_func(subject, enum_codes)`, where
#'   `subject` is the one-element sub-list of `enum_codes` at
#'   `subject_index`; it should return one score per element of `enum_codes`.
#'
#' @return A list with entries `index_a` (the subject), `index_b` (its best
#'   match) and `score` (the highest score).
#'
#' @noRd
#'
.one_against_all <- function(subject_index, enum_codes, sim_func) {
  similarity <- sim_func(enum_codes[subject_index], enum_codes)

  # a block must never be reported as its own best match
  similarity[subject_index] <- -1

  best_match <- which.max(similarity)
  best_score <- max(similarity)

  list(index_a = subject_index, index_b = best_match, score = best_score)
}
#' All against all search
#'
#' For every code block, finds the other code block it is most similar to,
#' and returns the resulting pairs of indexes with their similarity scores.
#'
#' @param enum_codes List of vectors of integers. Each `int` is an
#'   enumerated code for some code-symbol (like a conversion of the
#'   code-symbols into a factor).
#' @param method Alignment method for use in
#'   `stringdist::seq_sim`.
#' @param ... Further parameters for passing to
#'   `stringdist::seq_sim`.
#'
#' @return A tibble with columns `index_a`, `index_b` and `score`, sorted by
#'   decreasing `score`, then by `index_a` and `index_b`. It has zero rows
#'   when `enum_codes` has fewer than two entries.
#'
#' @importFrom dplyr arrange desc mutate select
#' @importFrom purrr map_df
#' @importFrom stringdist seq_sim
#' @importFrom tibble tibble
#' @importFrom rlang .data
#'
#' @noRd
#'
find_indexes_of_best_matches <- function(enum_codes, method = "lcs", ...) {
  # With fewer than two code blocks there is nothing to pair up
  if (length(enum_codes) <= 1) {
    return(
      tibble::tibble(
        index_a = integer(0), index_b = integer(0), score = numeric(0)
      )
    )
  }

  # Similarity of the sequences in `a` to the sequences in `b`
  sim_func <- function(a, b) {
    stringdist::seq_sim(a, b, method = method, ...)
  }

  # For each code-block, identify its closest matching code-block; each call
  # to .one_against_all contributes one (index_a, index_b, score) row
  best_per_block <- purrr::map_df(
    seq_along(enum_codes), .one_against_all, enum_codes, sim_func
  )

  # Put the smaller index first in every pair: when C-A is a pair but the
  # index of C is greater than that of A, the pair is stored as A-C
  canonical_pairs <- dplyr::select(
    dplyr::mutate(
      best_per_block,
      temp = pmax(.data[["index_a"]], .data[["index_b"]]),
      index_a = pmin(.data[["index_a"]], .data[["index_b"]]),
      index_b = .data[["temp"]]
    ),
    -"temp"
  )

  # Keep each code-block pair once (A-B and B-A are now the same row), and
  # order the pairs by decreasing score
  dplyr::arrange(
    unique(canonical_pairs),
    dplyr::desc(.data[["score"]]), .data[["index_a"]], .data[["index_b"]]
  )
}
###############################################################################