forked from andresgmejiar/dataARC
/
Translator.R
147 lines (115 loc) · 5.51 KB
/
Translator.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
arc.dataconvert <- function(json,export=FALSE){
oldw <- getOption("warn")
options(warn = -1)
library(gdata) # for the trim function
library(jsonlite)
library(knitr)
library(stringr)
library(tibble)
library(dplyr)
library(tidyr)
library(purrr)
library(data.table)
options(warn = oldw)
dataARC_raw_input <- fromJSON(json, simplifyDataFrame = TRUE)
#read in features as a flattened dataframe
flt_input <- jsonlite::flatten(as.data.frame(dataARC_raw_input$features))
flt_combinators <- jsonlite::flatten(as.data.frame(dataARC_raw_input$combinators))
tbl_comb <- as_tibble(flt_combinators)
tbl_comb_2 <- rename(tbl_comb, combinatorid = `_id`)
reduced_comb_data <- tbl_comb_2 %>% dplyr::select("combinatorid", "dataset", "features", "concepts")
flt_concepts <- jsonlite::flatten(as.data.frame(dataARC_raw_input$concepts))
tbl_concepts <- as_tibble(flt_concepts)
tbl_concepts_2 <- rename(tbl_concepts, conceptid = `_id`)
#make flat feature dataframe into manipulable tibble
tbl_flt <- as_tibble(flt_input)
#select desired input fields from tibble
reduced_data <- tbl_flt %>% dplyr::select("_id", "dataset", location.coordinates, starts_with("properties"), -"properties.finds")
#unnest coordinates within reduced dataset - this flattens deeply listed
#data from the tephrabase and jardabok datasets, among others
unnested_coords <- reduced_data %>% mutate(values = purrr::map(location.coordinates, setNames, c("long", "lat"))) %>% unnest_wider(values)
unnested_coords_2 <- unnested_coords %>% mutate(values = unnested_coords$properties.data) %>% unnest_wider(values)
unnested_coords_3 <- unnested_coords_2 %>% mutate(values = unnested_coords_2$properties.relationships) %>% unnest_wider(values)
unnested_coords_4 <- unnested_coords_3 %>% mutate(values = unnested_coords_3$properties.tephra) %>% unnest_wider(values)
unnested_coords_5 <- unnested_coords_4 %>% select(-"location.coordinates", -"properties.data", -"properties.relationships", -"properties.tephra")
#turns tibble into a data.table for more manipulation
dt <- as.data.table(unnested_coords_5)
#determines if columns are all NA
dt[, "SUM" := rowSums(!is.na(.SD))]
#and deletes them if so
dt <- dt[SUM > 0,]
#recouperates resource list from Farm Histories Database
recs <- dt[, list(relationships_lookup_resource_en =
as.character(unlist(relationships_lookup_resource_en))),
by = "_id"]
dt[, relationships_lookup_resource_en := NULL]
#joins the data back into the manipulated datatable
dt <- recs[dt,on="_id",all=TRUE]
oldw <- getOption("warn")
options(warn = -1)
#break the data apart using the ID and dataset variables as the ID variables
#keeping only properties and the resources
dt1 <- melt(dt, id.vars=c("_id","dataset"),
measure.vars = names(dt)[str_detect(names(dt),
"properties|relationships_lookup_resource_en")])
options(warn = oldw)
#remove "properties" prefix from subfields
dt1[, variable := str_replace(variable,"properties.","")]
#remove NA rows from the datatable
dt1 <- na.omit(dt1)
#name columns appropriately
cols = c("FID","FEATURE","Y","X","DATASET","CATEGORY","ENTRY")
#recreates lat/long from original datatable
dt2 <- dt[,.(`_id`,lat,long)]
#merges lat/long into dt1 datatable
dt1 <- dt1[unique(dt2),on="_id"]
#creates sequential Feature IDs ("FIDs") for
dt1[, FID := 1:nrow(dt1)]
#sets column names appropriately
setnames(dt1,c("_id","long","lat","dataset","variable","value"),
cols[2:length(cols)])
#orders the columns by the cols variable in the data table dt1
#assigns them to a new data table "out"
out <- dt1[,..cols]
#Bring in combinator (r) and concepts/contextual/related (s) dataframes
r <- as.data.table(reduced_comb_data)
s <- as.data.table(tbl_concepts_2)[,.(conceptid,contextual,related)]
#converts context/related lists into hash strings with colon separators
cols <- c('contextual','related')
s[, (cols) := lapply(.SD, function(x) {x <- x[[1]]
paste(unique(x),collapse=":")}),
.SDcols = cols]
setnames(s,"conceptid","concepts")
#Unlists the concepts and associated features from the combinators datatable
r1 <- r[, .(features = unlist(features)), by = combinatorid]
r2 <- r[, .(concepts = unlist(concepts)), by = combinatorid]
#joins the appropriate related and context to each concepts
r2 <- r2[s,on="concepts"]
#combines then onto the features by combinator
#before converting them to hash strings with colon separators by feature
r <- r1[r2,on='combinatorid', all=TRUE
][,.(concepts = paste(unique(concepts),collapse=":"),
combinators = paste(unique(combinatorid),collapse=":"),
related = paste(unique(related),collapse=":"),
context = paste(unique(contextual),collapse=":")),
by=features]
#set the output order, convert to capital
cols <- c("feature","concepts","combinators","related","context")
setnames(r,"features","feature")
setnames(r, cols, toupper(cols))
#combines the concepts and data with its original X and Y coordinates
output <- out[r,on="FEATURE",all=TRUE]
#makes hashes uppercase
hashcols <- c("FEATURE","DATASET","CONCEPTS","CATEGORY", "COMBINATORS")
output[, (hashcols) := lapply(.SD,toupper),.SDcols = hashcols]
#select export columns, drop NAs in data columns
cols = c("FID","FEATURE","X","Y","DATASET","CATEGORY","ENTRY",
"CONCEPTS","COMBINATORS","RELATED","CONTEXT")
output <- na.omit(output, cols=c("X","Y","ENTRY"))
#export
if (is.character(export)) {
fwrite(output[,..cols], file=paste0(export,".gz"))
} else {
output
}
}