/
spacy_extract_entity.R
147 lines (130 loc) · 5.32 KB
/
spacy_extract_entity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#' Extract named entities from texts using spaCy
#'
#' This function extracts named entities from texts, based on the `ent`
#' entity-tag attributes of document objects parsed by spaCy (see
#' <https://spacy.io/usage/linguistic-features#section-named-entities>).
#'
#' @param x a character object or a TIF-compliant
#' corpus data.frame (see <https://github.com/ropenscilabs/tif>)
#' @inheritParams spacy_parse
#' @param output type of returned object, either `"list"` or
#' `"data.frame"`.
#' @param type type of named entities, either `named`, `extended`, or
#' `all`. See
#' <https://spacy.io/docs/usage/entity-recognition#entity-types> for
#' details.
#' @param ... unused
#' @details When the option `output = "data.frame"` is selected, the
#' function returns a `data.frame` with the following fields.
#' \describe{\item{`doc_id`}{name of the document in which the entity was
#' found} \item{`text`}{contents of entity}
#' \item{`ent_type`}{type of entity (e.g. `ORG` for
#' organizations)} \item{`start_id`}{serial number ID of starting token.
#' This number corresponds to the token numbering in the `data.frame` returned
#' by `spacy_tokenize(x)` with default options.} \item{`length`}{number
#' of words (tokens) included in a named entity (e.g. for an entity, "New York
#' Stock Exchange", `length = 4`)}}
#'
#' @return either a `list` or `data.frame` of named entities
#' @export
#' @examples
#' \dontrun{
#' spacy_initialize()
#'
#' txt <- c(doc1 = "The Supreme Court is located in Washington D.C.",
#' doc2 = "Paul earned a postgraduate degree from MIT.")
#' spacy_extract_entity(txt)
#' spacy_extract_entity(txt, output = "list")
#' }
spacy_extract_entity <- function(x,
                                 output = c("data.frame", "list"),
                                 type = c("all", "named", "extended"),
                                 multithread = TRUE,
                                 ...) {
  # S3 generic: the work is done by the method matching the class of `x`.
  UseMethod("spacy_extract_entity", x)
}
#' @importFrom data.table data.table
#' @export
spacy_extract_entity.character <- function(x,
                                           output = c("data.frame", "list"),
                                           type = c("all", "named", "extended"),
                                           multithread = TRUE, ...) {
  # Character method: sends the texts to the Python-side `spacyr` object and
  # returns the entities it reports.
  #   x           character vector of texts; its names (if any) become the
  #               document names, otherwise "text1", "text2", ... are used.
  #   output      "data.frame" (default) or "list".
  #   type        "all", "named" or "extended"; for data.frame output the
  #               filter is applied in R, for list output it is passed to
  #               Python as `ent_type_category`.
  #   multithread forwarded to Python; forced to FALSE for a single text or
  #               when the Python version is not 3.
  # Returns the object fetched from Python for `output = "list"`; otherwise a
  # data.frame with `doc_id` first, or NULL (with a message) when no document
  # yielded an entity.
  # Stops when document names are duplicated or empty.

  type <- match.arg(type)
  output <- match.arg(output)

  # Placeholders for names used unquoted in the data.table calls below
  # (presumably to keep R CMD check from flagging undefined globals).
  `ent_type` <- `start_id` <- `:=` <- NULL

  docnames <- names(x)
  if (is.null(docnames)) {
    # seq_along() gives no names for empty input, whereas 1:length(x) would
    # invent "text1" and "text0".
    docnames <- paste0("text", seq_along(x))
  }

  if (length(x) == 1) {
    multithread <- FALSE
  }

  if (anyDuplicated(docnames) > 0L) {
    stop("Docnames are duplicated.")
  } else if (!all(nchar(docnames) > 0L)) {
    stop("Some docnames are missing.")
  }

  # getOption() matches the option name exactly; `options()$name` would
  # silently accept a partial match.
  if (is.null(getOption("spacy_initialized"))) spacy_initialize()

  # Drop any `spobj` left over in the Python session and reset `texts`.
  spacyr_pyexec("try:\n del spobj\nexcept NameError:\n 1")
  spacyr_pyexec("texts = []")

  if (spacyr_pyget("py_version") != 3) {
    message("multithreading for python 2 is not supported by ",
            "spacy_extract_entity()")
    multithread <- FALSE
  }

  x <- gsub("\\\\n", "\\\n", x) # literal backslash-n becomes a newline
  x <- gsub("\\\\t", "\\\t", x) # literal backslash-t becomes a tab
  x <- gsub("\\\\", "", x)      # remaining backslashes are removed
  x <- unname(x)

  ## send documents to python
  spacyr_pyassign("texts", x)
  spacyr_pyassign("docnames", docnames)
  spacyr_pyassign("multithread", multithread)
  spacyr_pyassign("ent_type_category", type)

  ## run entity extraction
  spacyr_pyexec("spobj = spacyr()")

  if (identical(output, "list")) {
    command_str <- paste("entities = spobj.extract_entity_list(texts = texts,",
                         "docnames = docnames,",
                         "multithread = multithread,\nent_type_category = ent_type_category)")
    spacyr_pyexec(command_str)
    return(spacyr_pyget("entities"))
  }

  command_str <- paste("entities = spobj.extract_entity_dataframe(texts = texts,",
                       "docnames = docnames,",
                       "multithread = multithread)")
  spacyr_pyexec(command_str)
  entities <- spacyr_pyget("entities")

  # One data.frame per document, tagged with its id; documents without any
  # entity contribute nothing.
  per_doc <- lapply(names(entities), function(id) {
    df <- as.data.frame(entities[[id]], stringsAsFactors = FALSE)
    if (nrow(df) == 0) return(NULL)
    df$doc_id <- id
    df
  })
  data_out <- data.table::rbindlist(per_doc)

  if (nrow(data_out) == 0) {
    message("No entity found in documents")
    return(NULL)
  }

  # NOTE(review): presumably converts Python's 0-based token index to R's
  # 1-based numbering - confirm against the Python side.
  data_out[, start_id := start_id + 1]

  extended_list <- c("DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL",
                     "CARDINAL")
  if (type == "extended") {
    data_out <- data_out[ent_type %in% extended_list]
  } else if (type == "named") {
    data_out <- data_out[!ent_type %in% extended_list]
  }

  data.table::setDF(data_out)
  # Move the fifth column to the front. Assumes Python returns four columns so
  # that the appended `doc_id` is the fifth - TODO confirm.
  data_out <- data_out[, c(5, 1:4)]
  data_out
}
#' @method spacy_extract_entity data.frame
#' @export
spacy_extract_entity.data.frame <- function(x, ...) {
  # Data-frame method: checks that `x` has the `doc_id` and `text` columns,
  # then re-dispatches on the texts named by their document ids.
  # (stand-in compliance check - to be replaced with the tif package)
  required_cols <- c("doc_id", "text")
  if (any(!required_cols %in% names(x))) {
    stop("input data.frame does not conform to the TIF standard")
  }

  named_texts <- x$text
  names(named_texts) <- x$doc_id
  spacy_extract_entity(named_texts, ...)
}