/
coercion.R
189 lines (156 loc) · 3.98 KB
/
coercion.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#' Coerce Between tif Object Specifications
#'
#' These functions convert between the various valid
#' formats for corpus and tokens objects. By using these
#' in other packages, maintainers need to only handle
#' whichever specific format they would like to work
#' with, but gain the freedom to output (or convert
#' into) the one most suited to their package's paradigm.
#'
#' @param corpus valid tif corpus object to coerce
#' @param tokens valid tif tokens object to coerce
#'
#' @details
#' No explicit checking is done on the input; the output
#' is guaranteed to be valid only if the input is a valid
#' format. In fact, we make an effort to not modify an
#' object that appears to be in the required format already
#' due to R's copy on modify semantics.
#'
#' @example inst/examples/tif_as.R
#' @name tif_as
NULL
#' @export
#' @rdname tif_as
tif_as_corpus_character <- function(corpus) {
UseMethod("tif_as_corpus_character")
}
#' @rdname tif_as
#' @export
tif_as_corpus_character.default <- function(corpus) {
nd <- length(dim(corpus))
if (nd <= 1L) {
out <- as.character(corpus)
} else if (nd == 2L) {
out <- as.data.frame(corpus)
} else {
stop(sprintf("Cannot convert object of class %s to tif corpus",
class(corpus)))
}
return(out)
}
#' @rdname tif_as
#' @export
tif_as_corpus_character.character <- function(corpus) {
return(corpus)
}
#' @rdname tif_as
#' @export
tif_as_corpus_character.data.frame <- function(corpus) {
out <- as.character(corpus$text)
names(out) <- corpus$doc_id
return(out)
}
#' @export
#' @rdname tif_as
tif_as_corpus_df <- function(corpus) {
UseMethod("tif_as_corpus_df")
}
#' @rdname tif_as
#' @export
tif_as_corpus_df.default <- function(corpus) {
nd <- length(dim(corpus))
if (nd <= 1L) {
out <- as.character(corpus)
tif_as_corpus_df(out)
} else if (nd == 2L) {
out <- as.data.frame(corpus)
} else {
stop(sprintf("Cannot convert object of class %s to tif corpus",
class(corpus)))
}
return(out)
}
#' @rdname tif_as
#' @export
tif_as_corpus_df.character <- function(corpus) {
# Need to convert from character
if (is.null(names(corpus))) {
doc_id <- sprintf("doc%d", seq_along(corpus))
} else {
doc_id <- names(corpus)
}
out <- data.frame(doc_id = doc_id, text = as.character(corpus),
stringsAsFactors = FALSE)
return(out)
}
#' @rdname tif_as
#' @export
tif_as_corpus_df.data.frame <- function(corpus) {
return(corpus)
}
#' @export
#' @rdname tif_as
tif_as_tokens_df <- function(tokens) {
UseMethod("tif_as_tokens_df")
}
#' @rdname tif_as
#' @export
tif_as_tokens_df.default <- function(tokens) {
nd <- length(dim(tokens))
if (nd == 2L) {
out <- as.data.frame(tokens)
tif_as_tokens_df(out)
} else {
stop("Cannot convert object of class ", class(tokens),
" to tif tokens")
}
return(out)
}
#' @rdname tif_as
#' @export
tif_as_tokens_df.list <- function(tokens) {
if (is.null(names(tokens))) {
doc_id <- sprintf("doc%d", seq_along(tokens))
} else {
doc_id <- names(tokens)
}
doc_id <- rep(doc_id, lengths(tokens))
out <- data.frame(doc_id = unlist(doc_id, use.names = FALSE),
token = unlist(tokens, use.names = FALSE),
stringsAsFactors = FALSE)
return(out)
}
#' @rdname tif_as
#' @export
tif_as_tokens_df.data.frame <- function(tokens) {
return(tokens)
}
#' @export
#' @rdname tif_as
tif_as_tokens_list <- function(tokens) {
UseMethod("tif_as_tokens_list")
}
#' @rdname tif_as
#' @export
tif_as_tokens_list.default <- function(tokens) {
nd <- length(dim(tokens))
if (nd == 2L) {
out <- as.data.frame(tokens)
} else {
stop("Cannot convert object of class ", class(tokens),
" to tif tokens")
}
return(out)
}
#' @rdname tif_as
#' @export
tif_as_tokens_list.list <- function(tokens) {
return(tokens)
}
#' @rdname tif_as
#' @export
tif_as_tokens_list.data.frame <- function(tokens) {
out <- split(tokens$token, tokens$doc_id)
return(out)
}