-
Notifications
You must be signed in to change notification settings - Fork 186
/
kwic.R
206 lines (188 loc) · 6.89 KB
/
kwic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#' Locate keywords-in-context
#'
#' For a text or a collection of texts (in a quanteda corpus object), return a
#' list of a keyword supplied by the user in its immediate context, identifying
#' the source text and the word index number within the source text. (Not the
#' line number, since the text may or may not be segmented using end-of-line
#' delimiters.)
#' @param x a character, [corpus], or [tokens] object
#' @inheritParams pattern
#' @param window the number of context words to be displayed around the keyword
#' @inheritParams valuetype
#' @param separator a character to separate words in the output
#' @param index an [index] object to specify keywords
#' @param ... unused
#' @return A `kwic` classed data.frame, with the document name
#' (`docname`) and the token index positions (`from` and `to`,
#' which will be the same for single-word patterns, or a sequence equal in
#' length to the number of elements for multi-word phrases).
#'
#' @note `pattern` will be a keyword pattern or phrase, possibly multiple
#' patterns, that may include punctuation. If a pattern contains whitespace,
#' it is best to wrap it in [phrase()] to make this explicit. However if
#' `pattern` is a `collocations` (see \pkg{quanteda.textstats} or
#' [dictionary] object, then the collocations or multi-word dictionary keys
#' will automatically be considered phrases where each whitespace-separated
#' element matches a token in sequence.
#' @export
#' @seealso [print-methods]
#' @examples
#' # single token matching
#' toks <- tokens(data_corpus_inaugural[1:8])
#' kwic(toks, pattern = "secure*", valuetype = "glob", window = 3)
#' kwic(toks, pattern = "secur", valuetype = "regex", window = 3)
#' kwic(toks, pattern = "security", valuetype = "fixed", window = 3)
#'
#' # phrase matching
#' kwic(toks, pattern = phrase("secur* against"), window = 2)
#' kwic(toks, pattern = phrase("war against"), valuetype = "regex", window = 2)
#'
#' # use index
#' idx <- index(toks, phrase("secur* against"))
#' kwic(toks, index = idx, window = 2)
kwic <- function(x, pattern, window = 5,
valuetype = c("glob", "regex", "fixed"),
separator = " ",
case_insensitive = TRUE,
index = NULL,
...) {
UseMethod("kwic")
}
#' @export
kwic.default <- function(x, ...) {
check_class(class(x), "kwic")
}
#' @export
kwic.character <- function(x, ...) {
lifecycle::deprecate_stop(
when = "3.0",
what = "kwic.character()",
details = 'Please apply `tokens()` to the character object first.'
)
}
#' @export
kwic.corpus <- function(x, ...) {
lifecycle::deprecate_stop(
when = "3.0",
what = "kwic.corpus()",
details = 'Please apply `tokens()` to the corpus object first.'
)
}
#' @export
kwic.tokens_xptr <- function(x, pattern = NULL, window = 5,
valuetype = c("glob", "regex", "fixed"),
separator = " ",
case_insensitive = TRUE,
index = NULL,
...) {
check_dots(..., "kwic")
window <- check_integer(window, 1, 1, 0)
valuetype <- match.arg(valuetype)
separator <- check_character(separator)
case_insensitive <- check_logical(case_insensitive)
if (is.null(pattern) && is.null(index))
stop("Either pattten or index must be provided\n", call. = FALSE)
if (!is.null(pattern)) {
index <- index(x, pattern = pattern, valuetype = valuetype,
case_insensitive = case_insensitive)
} else if (!is.null(index)) {
if (!is.index(index))
stop("Invalid index object\n", call. = FALSE)
}
n <- ntoken(x)
index$document <- match(index$docname, docnames(x))
index <- subset(index, !is.na(index$document))
result <- cbind(index, cpp_kwic(x, index$document, index$from, index$to,
window, separator, get_threads()))
# reorder columns to match pre-v3 order
result <- result[, c("docname", "from", "to", "pre", "keyword", "post", "pattern")]
attr(result, "ntoken") <- n[unique(index$docname)]
class(result) <- c("kwic", "data.frame")
rownames(result) <- NULL
return(result)
}
#' @export
kwic.tokens <- function(x, ...) {
kwic(as.tokens_xptr(x), ...)
}
#' @rdname kwic
#' @export
#' @examples
#' kw <- kwic(tokens(data_corpus_inaugural[1:20]), "provident*")
#' is.kwic(kw)
#' is.kwic("Not a kwic")
#' is.kwic(kw[, c("pre", "post")])
#'
is.kwic <- function(x) {
inherits(x, "kwic")
}
#' @rdname kwic
#' @method as.data.frame kwic
#' @examples
#' toks <- tokens(data_corpus_inaugural[1:8])
#' kw <- kwic(toks, pattern = "secure*", valuetype = "glob", window = 3)
#' as.data.frame(kw)
#'
#' @export
as.data.frame.kwic <- function(x, ...) {
attr(x, "ntoken") <- NULL
class(x) <- "data.frame"
return(x)
}
#' @rdname print-methods
#' @method print kwic
#' @param max_nrow max number of documents to print; default is from the
#' `print_kwic_max_nrow` setting of [quanteda_options()]
#' @importFrom stringi stri_c stri_c_list
#' @export
print.kwic <- function(x, max_nrow = quanteda_options("print_kwic_max_nrow"),
show_summary = quanteda_options("print_kwic_summary"), ...) {
max_nrow <- check_integer(max_nrow, min = -1)
if (max_nrow < 0)
max_nrow <- nrow(x)
nrem <- max(0, nrow(x) - max_nrow)
x <- head(x, max_nrow)
if (show_summary) {
cat(msg("Keyword-in-context with %d %s",
list(nrow(x) + nrem, c("match", "matches")),
list(1, nrow(x) + nrem != 1)), ".", sep = "")
}
if (nrow(x)) {
if (all(x$from == x$to)) {
labels <- stri_c("[", x$docname, ", ", x$from, "]")
} else {
labels <- stri_c("[", x$docname, ", ", x$from, ":", x$to, "]")
}
result <- data.frame(
label = labels,
pre = format(stri_replace_all_regex(x$pre, "(\\p{L}*) (\\p{Po})", "$1$2"), justify = "right"),
s1 = rep("|", nrow(x)),
keyword = format(x$keyword, justify = "centre"),
s2 = rep("|", nrow(x)),
post = format(stri_replace_all_regex(x$post, "(\\p{L}*) (\\p{Po})", "$1$2"), justify = "left")
)
colnames(result) <- NULL
print(result, row.names = FALSE)
if (nrem > 0) {
cat("[", sep = "")
if (nrem > 0) {
cat(" reached max_nrow ... ", format(nrem, big.mark = ","), " more match", sep = "")
if (nrem > 1) cat("es", sep = "")
}
cat(" ]\n", sep = "")
}
}
cat("\n", sep = "")
}
#' @method [ kwic
#' @export
#' @noRd
`[.kwic` <- function(x, i, j, ...) {
attrs <- attributes(x)
class(x) <- c("data.frame")
x <- x[i,]
attr(x, "ntoken") <- attrs$ntoken[i]
class(x) <- c("kwic", "data.frame")
rownames(x) <- NULL
return(x)
}