/
vos_import.R
195 lines (171 loc) · 5.7 KB
/
vos_import.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#' Bulk Import of RDF triples
#'
#' While triples data can be added one by one over SPARQL queries,
#' Virtuoso bulk import is by far the fastest way to import large
#' triplestores in the database.
#'
#' @param con an ODBC connection to Virtuoso, from [vos_connect()]
#' @param files paths to files to be imported
#' @param wd Alternatively, can specify directory and globbing pattern
#' to import. Note that in this case, wd must be in (or a subdir of)
#' the `DirsAllowed` list of the `virtuoso.ini` file created by
#' [vos_configure()]. By default, this includes the working directory
#' where you called [vos_start()] or [vos_configure()].
#' @param glob A wildcard aka globbing pattern (e.g. `"*.nq"`).
#' @param graph Name (technically URI) for a graph in the database.
#' Can leave as default. If a graph is already specified by the
#' import file (e.g. in nquads), that will be used instead.
#' @param n_cores specify the number of available cores for parallel loading.
#' Particularly useful when importing large numbers of bulk files.
#' @return (Invisibly) returns the status table of the bulk loader,
#' indicating file loading time or errors.
#' @details the bulk importer imports all files matching a pattern
#' in a given directory. If given a list of files, these are
#' temporarily symlinked (or copied on Windows machines) to
#' the Virtuoso app cache dir in a subdirectory, and the entire
#' subdirectory is loaded (filtered by the globbing pattern).
#' If files are not specified, load is called directly on the specified
#' directory and pattern. This is particularly useful for loading large
#' numbers of files.
#'
#' Note that Virtuoso recommends breaking large files into multiple smaller ones,
#' which can improve loading time (particularly if using multiple cores.)
#'
#' Virtuoso Bulk Importer recognizes the following file formats:
#' - `.grdf`
#' - `.nq`
#' - `.owl`
#' - `.nt`
#' - `.rdf`
#' - `.trig`
#' - `.ttl`
#' - `.xml`
#'
#' Any of these can optionally be gzipped (with a `.gz` extension).
#' @references <http://vos.openlinksw.com/owiki/wiki/VOS/VirtBulkRDFLoader>
#' @importFrom digest digest
#' @importFrom fs path_abs
#' @export
#' @examples
#'
#' vos_status()
#'
#' \donttest{
#' if(has_virtuoso()){
#' vos_start()
#' con <- vos_connect()
#'
#' example <- system.file("extdata", "person.nq", package = "virtuoso")
#' vos_import(con, example)
#' }
#' }
vos_import <- function(con,
                       files = NULL,
                       wd = ".",
                       glob = "*",
                       graph = "rdflib",
                       n_cores = 1L) {
  cache <- vos_cache()

  ## Fail early, naming the files whose extension is not recognized.
  ## With `files = NULL` there is nothing to check.
  recognized <- assert_extensions(files)
  if (!all(recognized)) {
    stop(
      "Unrecognized file extension for: ",
      paste(basename(as.character(files)[!recognized]), collapse = ", "),
      call. = FALSE
    )
  }

  ## If given a list of specific files, we have to copy (link) them into
  ## a directory Virtuoso can access, and import that directory instead
  ## of `wd`.
  if (!is.null(files)) {
    staging_dir <- file.path(cache, digest::digest(files))
    dir.create(staging_dir, showWarnings = FALSE, recursive = TRUE)

    ## Clean up the staged links/copies however we leave this function,
    ## including when one of the queries below fails.
    on.exit(
      {
        unlink(file.path(staging_dir, basename(files)))
        unlink(staging_dir, recursive = TRUE)
      },
      add = TRUE
    )

    for (from in files) {
      ## NOTE we need abs paths of files for this to work
      ## (at least with symlinks)
      source_path <- fs::path_abs(from)
      target <- file.path(staging_dir, basename(from))
      ## Remove any previous target first. unlink() also removes a
      ## dangling symlink, which file.exists() would report as absent.
      unlink(target)
      ## symlink only on Unix, must copy on Windows:
      switch(which_os(),
        "windows" = file.copy(source_path, target),
        file.symlink(source_path, target)
      )
    }
    wd <- staging_dir
  }

  ## Even on Windows, ld_dir wants a Unix-style path-slash
  wd <- fs::path_tidy(wd)
  if (is_windows()) wd <- fs::path_abs(wd)

  ## Register every file in `wd` matching `glob` with the bulk loader.
  ## NOTE(review): wd, glob and graph are interpolated into the query
  ## unescaped; confirm callers never pass values containing quotes.
  DBI::dbGetQuery(
    con,
    paste0("ld_dir('", wd, "', '", glob, "', '", graph, "')")
  )

  ## Record what this call is importing before the staged files are
  ## cleaned up; used below to filter the load list.
  importing_files <- fs::dir_ls(wd, glob = glob)

  ## Can call loader multiple times on multicore to load multiple files...
  for (i in seq_len(n_cores)) {
    DBI::dbGetQuery(con, "rdf_loader_run()")
  }

  ## Check status. The load list includes all files ever imported;
  ## select only those on the current import list.
  status <- DBI::dbGetQuery(con, "SELECT * FROM DB.DBA.LOAD_LIST")
  status <- status[status$ll_file %in% importing_files, ]

  failed <- !is.na(status$ll_error)
  if (any(failed)) {
    err <- status[failed, c("ll_file", "ll_error")]
    ## One line per failed file, so several failures stay readable
    ## instead of being glued together by stop().
    details <- paste(basename(err$ll_file), err$ll_error, collapse = "\n")
    stop(paste("Error importing:", details), call. = FALSE)
  }

  invisible(status)
}
## Check which of `files` carry a file extension recognized by the
## Virtuoso bulk loader, optionally followed by a `.gz` suffix.
##
## @param files character vector of file names or paths; may be NULL.
## @return (Invisibly) a logical vector with one element per entry of
##   `files`: TRUE where the name ends in a known extension. A
##   zero-length input gives a zero-length result.
assert_extensions <- function(files) {
  known_extensions <- c(
    "grdf", "nq", "owl", "nt",
    "rdf", "trig", "ttl", "xml"
  )
  ## A single anchored alternation. The dot in front of "gz" is escaped
  ## so that only a literal ".gz" suffix is accepted, not any character
  ## followed by "gz".
  pattern <- paste0(
    "[.](", paste(known_extensions, collapse = "|"), ")([.]gz)?$"
  )
  results <- vapply(
    files,
    function(filename) grepl(pattern, filename),
    logical(1L)
  )
  invisible(results)
}
## Derive a globbing pattern (e.g. "*.nq" or "*.nq.gz") from the name of
## the first entry in `files`. Only the first file is inspected.
##
## @param files character vector (or list) of file names or paths.
## @return a single string: "*" followed by the trailing extension of
##   the first file name; when that extension is ".gz", the extension
##   preceding it is included as well.
guess_ext <- function(files) {
  ext_regex <- ".*([.]\\w+)"
  first_name <- basename(files[[1]])
  glob <- sub(ext_regex, "*\\1", first_name)
  if (glob != "*.gz") {
    return(glob)
  }
  ## Gzipped file: strip the ".gz" suffix, guess again from what is
  ## left, then put the ".gz" back on the end.
  stem <- sub("[.]\\w+$", "", first_name)
  paste0(sub(ext_regex, "*\\1", stem), ".gz")
}
## Check whether `wd` is listed in the `DirsAllowed` parameter of the
## `virtuoso.ini` file found in `db_dir`.
##
## @param wd directory (or directories) to look up.
## @param db_dir directory containing `virtuoso.ini`.
## @return a logical vector, one element per entry of `wd`: TRUE where
##   the tidied path equals one of the tidied `DirsAllowed` entries.
##   This is an exact comparison; sub-directories of an allowed
##   directory are not detected. Returns `NA_character_`, with a
##   warning, when `vos_status()` returns NULL.
#' @importFrom fs path_tidy
assert_allowedDirs <- function(wd = ".", db_dir = vos_db()) {
  ## In case user connects to external virtuoso
  status <- vos_status()
  if (is.null(status)) {
    warning(paste(
      "Could not access virtuoso.ini configuration.",
      "If you are using an external virtuoso server,",
      "ensure working directory is in allowedDirs"
    ),
    call. = FALSE
    )
    return(as.character(NA))
  }

  config <- ini::read.ini(file.path(db_dir, "virtuoso.ini"))
  dirs_allowed <- config[["Parameters"]][["DirsAllowed"]]
  ## No DirsAllowed entry at all: nothing is allowed.
  if (is.null(dirs_allowed)) {
    return(rep(FALSE, length(wd)))
  }

  ## Entries are comma-separated and may carry whitespace after the
  ## comma (e.g. "., ../vad"), so trim each one before comparing.
  allowed <- trimws(strsplit(dirs_allowed, ",")[[1]])
  fs::path_tidy(wd) %in% fs::path_tidy(allowed)
}