/
adist_authors.R
89 lines (80 loc) · 3.38 KB
/
adist_authors.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#' @keywords internal
outer_list <- function(X, Y, FUN, ...) {
# https://rdrr.io/cran/rmngb/src/R/listOperations.R
nX <- length(X)
nY <- length(Y)
x <- rep(X, nY)
y <- rep(Y, each = nX)
res <- mapply(FUN, x, y, ...)
matrix(res, nrow = nX, dimnames = list(names(X), names(Y)))
}
# Proporción de veces que el primero no está en el segundo
#' @keywords internal
dist_names <- function(X, Y) {
nX <- length(X)
nY <- length(Y)
x <- rep(X, nY)
y <- rep(Y, each = nX)
res <- mapply(function(a, t) mean(a %in% t), x, y)
res <- matrix(1 - res, nrow = nX, dimnames = list(X, Y))
return(res)
}
# No tiene en cuenta family name, first name
#' @keywords internal
adist_names <- function(names, table) {
snames <- lapply(str_split(str_remove(names, ','), ' |-'), toupper)
stable <- lapply(str_split(str_remove(table, ','), ' |-'), toupper)
res <- dist_names(snames, stable)
dimnames(res) <- list(names, table)
return(res)
}
#' @keywords internal
authors.short <- function(authors) {
sauthors <- str_split(authors, ', ', simplify = TRUE)
family <- lapply(str_split(sauthors[, 1], ' |-'), function(x) toupper(x))
name <- if (ncol(sauthors) == 1) as.list(rep("", nrow(sauthors))) else
lapply(str_split(sauthors[, 2], ' |-'), function(x) substr(x, 1, 1))
return(list(family = family, name = name))
}
#' Approximate Authors' Names Distances
#'
#' Compute an approximate string distance between character vectors containing authors' names
#' (of the form \code{"family name, first name"}).
#' The distance is a weighted proportion of the differences between (capitalized)
#' family names and between first name initial letters.
#' @param authors author names to be approximately matched.
#' @param table lookup table for matching.
#' @param weight weights associated with differences in family names and in first names.
#' @seealso \code{\link{match_authors}}.
#' @export
adist_authors <- function(authors, table, weight = c(family = 0.9, name = 0.1)) {
sauthors <- authors.short(authors)
stable <- authors.short(table)
family.match <- dist_names(sauthors$family, stable$family)
name.match <- dist_names(sauthors$name, stable$name)
res <- weight[1]*family.match + weight[2]*name.match
dimnames(res) <- list(authors, table)
return(res)
}
#' Approximate Authors' Names Matching
#'
#' Approximate string matching between character vectors containing authors' names
#' (equivalents of R's native \code{\link{match}}).
#' @inheritParams adist_authors
#' @param max.dist maximum distance between strings to be matched.
#' @param attr.dist logical; indicating whether to optionally return the distances
#' (weighted proporpion of names coincidences) as the \code{"dist"} attribute of the return value.
#' @return Returns a vector of the positions of the closest matches of its first argument in its second.
#' When multiple matches with the same smallest distance exist, the first one is returned.
#' When no match is found \code{NA_integer_} is returned.
#' @seealso \code{\link{adist_authors}}.
#' @export
match_authors <- function(authors, table, weight = c(family = 0.9, name = 0.1),
max.dist = 0.2, attr.dist = FALSE) {
adist <- adist_authors(authors, table, weight = weight)
match <- drop(apply(adist, 1, which.min))
dist <- adist[cbind(seq_len(nrow(adist)), match)]
is.na(match) <- dist > max.dist
if (attr.dist) attr(match, 'dist') <- dist
return(match)
}