#' Parse a text using spaCy
#'
#' The `spacy_parse()` function calls spaCy to both tokenize and tag the
#' texts, and returns a data.frame of the results. The function provides
#' options for two types of tagsets, the coarse `"google"` (Universal
#' Dependencies) tagset and the `"detailed"` language-specific tagset, as well
#' as lemmatization (`lemma`). Dependency parsing and named entity recognition
#' are also available as options. Enabling all of these options returns the
#' most extensive parsing results that spaCy provides.
#'
#' @param x a character object, a \pkg{quanteda} corpus, or a TIF-compliant
#' corpus data.frame (see <https://github.com/ropenscilabs/tif>)
#' @param pos logical; whether to return universal dependency part-of-speech
#' tags (<https://universaldependencies.org/u/pos/>)
#' @param tag logical; whether to return detailed part-of-speech tags. For the
#' language model `en`, these use the OntoNotes 5 version of the Penn
#' Treebank tag set
#' (<https://spacy.io/docs/usage/pos-tagging#pos-schemes>). Annotation
#' specifications for other available languages are available on the spaCy
#' website (<https://spacy.io/api/annotation>).
#' @param lemma logical; include lemmatized tokens in the output (lemmatization
#' may not work properly for non-English models)
#' @param entity logical; if `TRUE`, report named entities
#' @param multithread logical; if `TRUE`, the processing is parallelized using
#' spaCy's architecture (<https://spacy.io/api>)
#' @param dependency logical; if `TRUE`, analyse and tag dependencies
#' @param nounphrase logical; if `TRUE`, analyse and tag noun phrases
#' @param additional_attributes a character vector; this option extracts
#' additional attributes of tokens from spaCy. When the names of attributes
#' are supplied, the output data.frame will contain additional variables
#' corresponding to the names of the attributes. For instance, when
#' `additional_attributes = c("is_punct")`, the output will include an
#' additional variable named `is_punct`, which is a Boolean (in R,
#' logical) variable indicating whether the token is punctuation. A full
#' list of available attributes is documented at
#' <https://spacy.io/api/token#attributes>.
#' @param ... not used directly
#' @return a `data.frame` of tokenized, parsed, and annotated tokens
#' @export
#' @examples
#' \dontrun{
#' spacy_initialize()
#' # See Chap 5.1 of the NLTK book, http://www.nltk.org/book/ch05.html
#' txt <- "And now for something completely different."
#' spacy_parse(txt)
#' spacy_parse(txt, pos = TRUE, tag = TRUE)
#' spacy_parse(txt, dependency = TRUE)
#'
#' txt2 <- c(doc1 = "The fast cat catches mice.\\nThe quick brown dog jumped.",
#' doc2 = "This is the second document.",
#' doc3 = "This is a \\\"quoted\\\" text." )
#' spacy_parse(txt2, entity = TRUE, dependency = TRUE)
#'
#' txt3 <- "We analyzed the Supreme Court with three natural language processing tools."
#' spacy_parse(txt3, entity = TRUE, nounphrase = TRUE)
#' spacy_parse(txt3, additional_attributes = c("like_num", "is_punct"))
#' }
spacy_parse <- function(x,
pos = TRUE,
tag = FALSE,
lemma = TRUE,
entity = TRUE,
dependency = FALSE,
nounphrase = FALSE,
multithread = TRUE,
additional_attributes = NULL,
...) {
UseMethod("spacy_parse")
}
#' @importFrom data.table data.table setDT setnames
#' @export
spacy_parse.character <- function(x,
pos = TRUE,
tag = FALSE,
lemma = TRUE,
entity = TRUE,
dependency = FALSE,
nounphrase = FALSE,
multithread = TRUE,
additional_attributes = NULL,
...) {
x <- structure(as.character(x), names = names(x))
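  # bind data.table's non-standard-evaluation symbols locally so that
  # R CMD check does not flag them as undefined globals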
`:=` <- `.` <- `.N` <- NULL
spacy_out <- process_document(x, multithread)
if (is.null(spacy_out$timestamps)) {
stop("Document parsing failed")
}
## check the omit_entity status
  if (entity && isFALSE(getOption("spacy_entity"))) {
    message("entity = TRUE is ignored because the spaCy model was initialized without the entity recognizer")
    message("To turn on entity recognition, run spacy_finalize(); spacy_initialize(entity = TRUE)")
entity <- FALSE
}
tokens <- get_tokens(spacy_out)
ntokens <- get_ntokens(spacy_out)
ntokens_by_sent <- get_ntokens_by_sent(spacy_out)
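  # assemble one row per token: doc_id repeats each document's name across its
  # tokens, sentence_id numbers sentences within a document, and token_id
  # restarts from 1 at each sentence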
dt <- data.table(doc_id = rep(spacy_out$docnames, ntokens),
sentence_id = unlist(lapply(ntokens_by_sent, function(x) rep(seq_along(x), x))),
token_id = unlist(lapply(unlist(ntokens_by_sent), function(x) seq(to = x))),
token = tokens)
if (lemma) {
model <- spacyr_pyget("model")
dt[, "lemma" := get_attrs(spacy_out, "lemma_", TRUE)]
    if (substr(model, 1, 2) != "en") {
warning("lemmatization may not work properly in model '", model, "'")
}
}
if (pos) {
dt[, "pos" := get_tags(spacy_out, "google")]
}
if (tag) {
dt[, "tag" := get_tags(spacy_out, "detailed")]
}
## add dependency data fields
if (dependency) {
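    # spaCy reports each token's head as a document-level index; subtract the
    # cumulative token counts of preceding sentences so that head_token_id is
    # sentence-relative, matching token_id above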
subtractor <- unlist(lapply(ntokens_by_sent, function(x) {
if (length(x) == 0) return(NULL)
csumx <- cumsum(c(0, x[-length(x)]))
return(rep(csumx, x))
}))
deps <- get_dependency(spacy_out)
dt[, c("head_token_id", "dep_rel") := list(deps$head_id - subtractor,
deps$dep_rel)]
}
## named entity fields
if (entity) {
    dt[, "entity" := get_named_entities(spacy_out)]
}
## noun phrases
if (nounphrase) {
doc_id <- start_id <- nounphrase <- w_id <- root_id <- whitespace <- NULL
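    # expand each noun phrase to one row per word (w_id), then mark tokens as
    # "beg", "mid", or "end" of the phrase, appending "_root" to the phrase's
    # syntactic root token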
dt_nounphrases <- data.table::data.table(get_noun_phrases(spacy_out))
if (nrow(dt_nounphrases) > 0) {
dt_nounphrases <- dt_nounphrases[rep(1:nrow(dt_nounphrases), times = length)]
dt_nounphrases[, w_id := seq(start_id[1], length.out = length[1]), by = .(doc_id, start_id)]
dt_nounphrases <- data.table::setorder(dt_nounphrases, w_id, -length)
dt_nounphrases <- unique(dt_nounphrases, by = c("doc_id", "w_id"))
dt_nounphrases[, nounphrase := ifelse(w_id == start_id, "beg",
ifelse(w_id == max(w_id), "end", "mid")), by = .(doc_id, start_id)]
dt_nounphrases[, nounphrase := ifelse(w_id == root_id, paste0(nounphrase, "_root"), nounphrase)]
dt[, w_id := seq_len(.N), by = doc_id]
dt <- merge(dt, dt_nounphrases, by = c("doc_id", "w_id"), all.x = TRUE)
# dt[ !is.na(start_id), start_token_id := token_id[w_id == start_id][1],
# by = .(doc_id, root_id)]
# dt[ !is.na(start_id), root_token_id := token_id[w_id == root_id][1],
# by = .(doc_id, root_id)]
dt[, c("w_id", "start_id", "root_id", "text", "root_text", "length") := NULL]
dt[, whitespace := ifelse(nchar(get_attrs(spacy_out, "whitespace_")), TRUE, FALSE)]
dt[, nounphrase := ifelse(is.na(nounphrase), "", nounphrase)]
} else {
message("No noun phrase found in documents.")
}
}
if (!is.null(additional_attributes)) {
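    # attach each requested spaCy token attribute as an extra column named
    # after the attribute itself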
for (att_name in additional_attributes){
dt[, (att_name) := get_attrs(spacy_out, att_name, deal_utf8 = TRUE)]
}
}
dt <- as.data.frame(dt)
class(dt) <- c("spacyr_parsed", class(dt))
return(dt)
}
#' @export
spacy_parse.data.frame <- function(x, ...) {
# insert compliance check here - replace with tif package
if (!all(c("doc_id", "text") %in% names(x)))
stop("input data.frame does not conform to the TIF standard")
txt <- x$text
names(txt) <- x$doc_id
spacy_parse(txt, ...)
}
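
## A minimal, hypothetical sketch of the data.frame method in use (kept
## commented out so nothing runs at package load): the input needs only
## "doc_id" and "text" columns to pass the TIF check above.
# tif_df <- data.frame(doc_id = c("doc1", "doc2"),
#                      text = c("The cat sat.", "The dog barked."),
#                      stringsAsFactors = FALSE)
# spacy_parse(tif_df, pos = TRUE, entity = FALSE)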
#' Tokenize text using spaCy
#'
#' Tokenize text using spaCy. The results of tokenization are stored as a
#' Python object. To obtain the tokenized results in R, use `get_tokens()`.
#' See <https://spacy.io>.
#' @param x input text
#' @param multithread logical; if `TRUE`, the processing is parallelized using
#' spaCy's architecture (<https://spacy.io/api>)
#' @param ... arguments passed to specific methods
#' @return a `spacy_out` object, containing the document names and the
#' timestamps that identify the parsed results stored on the Python side
#' @importFrom methods new
#' @examples
#' \dontrun{
#' spacy_initialize()
#' txt <- c(text1 = "This is the first sentence.\nHere is the second sentence.",
#' text2 = "This is the second document.")
#' results <- spacy_parse(txt)
#' }
#' @export
#' @keywords internal
process_document <- function(x, multithread, ...) {
# This function passes texts to python and spacy
# get or set document names
if (!is.null(names(x))) {
docnames <- names(x)
} else {
    docnames <- paste0("text", seq_along(x))
}
  if (any(duplicated(docnames))) {
    stop("Docnames are duplicated.")
  } else if (any(nchar(docnames) == 0L)) {
    stop("Some docnames are missing.")
  }
if (is.null(options()$spacy_initialized)) spacy_initialize()
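  # remove any stale Python-side spacyr object left over from a previous run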
spacyr_pyexec("try:\n del spobj\nexcept NameError:\n 1")
spacyr_pyexec("texts = []")
x <- gsub("\\\\n", "\\\n", x) # replace two quotes \\n with \n
x <- gsub("\\\\t", "\\\t", x) # replace two quotes \\t with \t
x <- gsub("\\\\", "", x) # delete unnecessary backslashes
x <- unname(x)
spacyr_pyassign("texts", x)
spacyr_pyassign("multithread", multithread)
spacyr_pyexec("spobj = spacyr()")
spacyr_pyexec("timestamps = spobj.parse(texts, multithread = multithread)")
timestamps <- as.character(spacyr_pyget("timestamps"))
output <- spacy_out$new(docnames = docnames,
timestamps = timestamps)
return(output)
}