-
Notifications
You must be signed in to change notification settings - Fork 2
/
meta_fix.R
141 lines (106 loc) · 4.03 KB
/
meta_fix.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
library(stringr)
library(dfrtopics)
library(RJSONIO)
library(dplyr)
setwd("/Users/agoldst/Documents/signs-model")
# model_dir <- "model65_fla_800"
#model_dir <- "model_k65_fla_len800_stoprefs"
model_dir <- "model_k70"
# data_dir <- "data"
# citations_file <- file.path(data_dir,"citations.CSV")
data_dir <- "data/20140827"
citations_file <- c(file.path(data_dir,"citations.tsv"),
"scrape_txt/citations.tsv")
# backup source files
meta_file <- "browser/data/meta.csv.zip"
info_file <- "browser/data/info.json"
file.copy(meta_file,"meta.csv.zip.bak")
file.copy(info_file,"info.json.bak")
m <- read_metadata(citations_file)
ids <- c(readLines(file.path(model_dir,"id_map.txt")),
read.csv(file.path(model_dir,"signs_39.4.doc_topics.csv"),
as.is=T)$id)
i_md <- match(ids,m$id)
m <- m[i_md,]
# keep only unneeded columns
keep_cols <- c("id","title","author","journaltitle","volume","issue",
"pubdate","pagerange")
m <- m[,keep_cols]
# correct the dates
#contents <- read.csv("contents.csv")
#contents <- tbl_df(contents[-398,])
#if (any(is.na(contents$Vol) | is.na(contents$No))) {
# stop("NA values in Vol/No")
#}
#issues <- str_c(contents$Vol,"_",contents$No)
#
## one item has missing information, which we'll drop
#check_dates <- contents %>% group_by(Vol,No) %>%
# summarize(tally=n_distinct(Date.1))
#stopifnot(all(check_dates$tally==1))
#date_lookup <- dates$date
#names(date_lookup) <- dates$issue
#
#date_keys <- str_c(m$volume,"_",m$issue)
#m$pubdate <- date_lookup[date_keys]
# Add special issue column
# read special issue information
specials <- read.csv("specials.csv",as.is=T)
special_keys <- str_c(specials$VOLUME,"_",specials$ISSUE)
info_obj <- fromJSON(info_file)
info_obj$issues <- matrix(c(special_keys,
specials$SPECIAL.ISSUE.TITLE,
str_trim(specials$JSTOR.LINK)),
ncol=3)
# keys to index into special_keys for each doc:
issue_keys <- str_c(m$volume,"_",m$issue)
# but 5.3S is 5.3 as far as jstor knows, but with S page numbers:
issue_keys[m$volume == 5 & m$issue == 3
& grepl("S",m$pagerange,fixed=T)] <- '5_3S'
# and correct the issue number itself
m$issue[issue_keys == "5_3S"] <- "3S"
# and 1.3S is not annotated in jstor metadata (yet JSTOR knows the
# difference between the two issues somehow?)
# so instead we can use the fact that 1.3 has page numbers 585-808
# whereas 1.3S has page numbers 1-317
# actually this is tricky because both issues have roman-numeral prefatory
# matter, but we'll get away with it because none of the prefaces and
# editorials made it into our document set.
is_13s <- function (pp) {
num_match <- str_match(pp,"p\\. (\\d+)")[,2]
ifelse(is.na(num_match),F,as.numeric(num_match) < 400)
}
issue_keys[m$volume == 1 & m$issue == 3
& is_13s(m$pagerange)] <- "1_3S"
# and correct the issue number itself
m$issue[issue_keys == "1_3S"] <- "3S"
m$special <- ifelse(issue_keys %in% special_keys,issue_keys,"")
# verify match on existing metadata file
m_check <- read.csv(unz(meta_file,"meta.csv"),header=F,
as.is=T)
# skip the issue column (which we've changed)
# and the rightmost column (pubtype in the old data, dropped here;
# special in the new data, not in old data)
# if the original file has the new column 10 (abstract), ignore that too
if (ncol(m_check == 10)) {
skipcol <- c(6,9,10)
} else {
skipcol <- c(6,9)
}
if (!isTRUE(all.equal(m_check[,-skipcol],m[,-skipcol],
check.attributes=F))) {
stop("mismatch between old metadata file and new")
}
message("Overwriting ",info_file)
writeLines(toJSON(info_obj, pretty=T),info_file)
# output csv file in the expected zipped format
f_temp <- file.path(tempdir(),"meta.csv")
write.table(m,f_temp,
quote=T,sep=",",
col.names=F,row.names=F,
# d3.csv.* expects RFC 4180 compliance
qmethod="double")
unlink(meta_file)
message("Overwriting ",meta_file)
status <- zip(meta_file,f_temp,flags="-9Xj")
stopifnot(status == 0)