-
Notifications
You must be signed in to change notification settings - Fork 2
/
prepare_content.rb
244 lines (196 loc) · 11.6 KB
/
prepare_content.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# Used to stage content from Rumsey or other similar format to folder structure ready for accessioning.
# Iterate through each row in the manifest, find files, generate contentMetadata and symlink to new location.
# Set the base content folder below!
# Peter Mangiafico
# November 14, 2017
# see https://consul.stanford.edu/pages/viewpage.action?pageId=146704638 for more documentation
#
# Run with
# ROBOT_ENVIRONMENT=production ruby devel/prepare_content.rb INPUT_CSV_FILE.csv FULL_PATH_TO_CONTENT FULL_PATH_TO_STAGING_AREA [--exact-match] [--no-object-folders] [--report] [--content-metadata] [--content-metadata-style map]
# e.g.
# ROBOT_ENVIRONMENT=production ruby devel/prepare_content.rb /maps/ThirdParty/Rumsey/Rumsey_Batch1.csv /maps/ThirdParty/Rumsey/content /maps/ThirdParty/Rumsey [--no-object-folders] [--report] [--content-metadata] [--content-metadata-style map]
# the first parameter is the input CSV (with columns labeled "Object", "Image", and "Label" (image is the filename, object is the object identifier which can be turned into a folder)
# second parameter is the full path to the content folder that will be searched
# third parameter is optional and is the full path to a folder to stage content to (if not provided, will use same path as csv file, and append "staging")
#
# if you set the --report switch, it will only produce the output report, it will not symlink any files
# if you set the --content-metadata switch, it will only generate content metadata for each object using the log file for successfully found files, assuming you also have columns in your input CSV labeled "Druid", "Sequence" and "Label"
# if you set the --no-object-folders switch, then all symlinks will be flat in the staging directory (i.e. no object level folders) -- this requires all filenames to be unique across objects, if left off, then object folders will be created to store symlinks
# if you set the --exact-match switch, then only files which match exactly (but not case sensitive) will be found; the normal finding algorithm allows for also matching files that have any number of leading 0s
# file extensions do not matter when matching
require File.expand_path(File.dirname(__FILE__) + '/../config/boot')
require 'optparse'
require 'pathname'
content_metadata_filename = 'contentMetadata.xml'
report = false # if set to true, will only show output and produce report, won't actually symlink files or create anything, can be overriden with --report switch
content_metadata = false # if set to true, will also generate content-metadata from values supplied in spreadsheet, can be set via switch
cm_style = 'map' # defaults to map type content metadata unless overriden
no_object_folders = false # if false, then each new object will be in a separately created folder, with symlinks contained inside it; if true, you will get a flat list
exact_match = false # if set to true, only files that match exactly (but case insensitive) will be found; any files with leading 0s will NOT match
help = "Usage:\n ruby prepare_content.rb INPUT_CSV_FILE BASE_CONTENT_FOLDER [STAGING_FOLDER] [--no-object-folders] [--report] [--content_metadata] [--content_metadata_style STYLE]\n"
OptionParser.new do |opts|
opts.banner = help
opts.on("--report") do |_dr|
report = true
end
opts.on("--content_metadata") do |_cm|
content_metadata = true
end
opts.on("--content_metadata_style [STYLE]") do |st|
cm_style = st
end
opts.on("--no-object-folders") do |_ob|
no_object_folders = true
end
opts.on("--exact-match") do |ob|
exact_match = true
end
end.parse!
if ARGV.size < 2
puts help
abort "Incorrect number of argument provided - you need to supply an input CSV file and the folder to search for."
end
csv_in = ARGV[0]
base_content_folder = ARGV[1]
source_path = File.dirname(csv_in)
source_name = File.basename(csv_in, File.extname(csv_in))
csv_out = File.join(source_path, source_name + "_log.csv")
if ARGV.size == 2 # no staging path provided, use same as CSV In and append "staging"
staging_folder = File.join(source_path, "staging")
else # use what was provided
staging_folder = ARGV[2]
end
abort "#{csv_in} not found" unless File.exist?(csv_in)
unless File.exist?(csv_out) # if we don't already have a log file, write out the header row
CSV.open(csv_out, 'a') { |f|
output_row = ["Object", "Image", "Filename", "Sequence", "Label", "Druid", "Success", "Message", "Time"]
f << output_row
}
end
# read in existing log file
log_file_data = CSV.parse(IO.read(csv_out), :headers => true).map { |row| row.to_hash.with_indifferent_access }
# read input manifest
csv_data = CSV.parse(IO.read(csv_in), :headers => true).map { |row| row.to_hash.with_indifferent_access }
start_time = Time.now
puts ""
puts "***Prepare Content***"
puts "Only producing report" if report
puts "Producing content metadata with style '#{cm_style}'" if content_metadata
puts "Creating object folders" unless no_object_folders
puts "Exact match algorithm" if exact_match
puts "Input CSV File: #{csv_in}"
puts "Logging to: #{csv_out}"
puts "Base Content Folder: #{base_content_folder}"
puts "Staging Folder: #{staging_folder}"
puts "Started at: #{start_time}"
puts ""
$stdout.flush
found_objects = []
n = 0
num_files_not_found = 0
num_objects = 0
num_files_copied = 0
# only running content_metadata
if content_metadata # create the content metadata
log_file_data.each do |row| # loop over log file data
object = row['Object'].gsub(',', '-') # commas are no good in filenames, use a dash instead
druid = row['Druid']
unless druid # if we don't have a druid in the output log file, look in the input spreadsheet for this object
input_csv_druids = csv_data.select { |s| s['Object'] == object } # read the druid from the input spreadsheet in case you want to add it later
input_csv_druids.uniq! { |e| e['Druid'] } # remove any duped entries
druid = input_csv_druids.first['Druid']
end
all_files = log_file_data.select { |s| s['Object'] == object && s['Success'] == "true" } # find all files in this object that successfully staged
all_files.uniq! { |e| e['Filename'] } # remove any duped filenames (for example, if the log file has multiple entries from multiple runs)
all_files.sort_by! { |e| e['Sequence'].to_i } # sort the list of files by sequence
if druid and all_files.size > 0 # be sure we have a druid and files
object_folder = File.join(staging_folder, object)
cm_file = File.join(object_folder, content_metadata_filename)
content_object_files = [] # build list of assembly objectfiles for creating content metadata
all_files.each { |object_file| content_object_files << Assembly::ObjectFile.new(File.join(object_folder, File.basename(object_file['Filename'])), :label => object_file['Label']) }
params = { :druid => druid, :objects => content_object_files, :add_exif => false, :style => cm_style.to_sym }
content_md_xml = Assembly::ContentMetadata.create_content_metadata(params)
File.open(cm_file, 'w') { |fh| fh.puts content_md_xml }
puts "Writing content metadata file #{cm_file} for #{druid} and object #{object}"
else
puts "ERROR: Did not create content metadata file #{cm_file} -- missing druid or no files found"
end # end check for a druid and files
end # end loop over all output log file
puts ""
else # either a report or symlink operation
FileUtils.cd(base_content_folder)
FileUtils.mkdir_p staging_folder unless report
files_to_search = Dir.glob("**/**")
files_to_search.reject! { |f| f == '.' || f == '..' || f == '.DS_Store' }
csv_data.each do |row|
n += 1
puts "Row #{n} out of #{csv_data.size}"
$stdout.flush
object = row['Object'].gsub(',', '-') # commas are no good in filenames, use a dash instead
row_filename = row['Image']
label = row['Label']
sequence = row['Sequence']
druid = row['Druid']
success = false
filename = File.basename(row_filename, File.extname(row_filename)) # remove any extension from the filename that was provided
previously_found = (log_file_data.select { |row| row["Image"] == row_filename && row["Success"].downcase == "true" }.size) > 0
previously_missed = (log_file_data.select { |row| row["Image"] == row_filename && row["Success"].downcase == "false" }.size) > 0
unless previously_found # only look for this file if it has not already been found according to the output log file
object_folder = File.join(staging_folder, object)
unless found_objects.include? object # check to see if we have a new object so we can create a new output folder for it
msg = "...#{Time.now}: Found new object: '#{object}'"
unless no_object_folders || report
FileUtils.mkdir_p object_folder
msg += " - creating object folder '#{object_folder}' if it does not exist"
end
found_objects << object
num_objects += 1
puts msg
end # end we have an object
# now search for any file which ends with the filename (trying to catch cases where the filename has 0s at the beginning that were dropped from the spreadsheet)
puts "......#{Time.now}: looking for file '#{filename}', object '#{object}', label '#{label}'"
files_found = files_to_search.grep(/^[0]*#{filename}\.\S+/i) # allow for leading zeros and case insensitivy; we will further refine later according to options set
files_found_basenames = files_found.map { |file| File.basename(file) }
# if found, symlink files that match
files_found.each do |input_file|
input_filename = File.basename(input_file)
# match check, if exact match, do not look for leading zeros, but allow for case insensitivy
# if not exact match, allow leading 0s, keep case sensitivity
matched = exact_match ? /^#{filename}\.\S+/i.match(input_filename) : /^[0]*#{filename}\.\S+/.match(input_filename)
if matched # the found file matches the supplied filename
message = "found #{input_file}, symlink to object folder #{object_folder}"
output_file_full_path = no_object_folders ? File.join(staging_folder, input_filename) : (File.join(object_folder, input_filename))
input_file_full_path = Pathname.new(File.join(base_content_folder, input_file)).cleanpath(true).to_s
FileUtils.ln_s(input_file_full_path, output_file_full_path, :force => true) unless (report || File.exist?(output_file_full_path))
num_files_copied += 1
success = true
CSV.open(csv_out, 'a') { |f|
output_row = [object, filename, input_filename, sequence, label, druid, success, message, Time.now]
f << output_row
}
puts "......#{message}"
end
end # end loop over all matches
# do not log if it was previously missed and we missed it again
if (!previously_missed && !success)
message = "ERROR #{filename} NOT FOUND"
num_files_not_found += 1
CSV.open(csv_out, 'a') { |f|
output_row = [object, filename, '', sequence, label, druid, success, message, Time.now]
f << output_row
}
puts "......#{message}"
end
puts ""
$stdout.flush
else
puts "......#{Time.now}: skipping #{object} - already run"
end # end check to see if we have already successfully run this filename
end # end loop over all rows
puts ""
puts "Total objects staged: #{num_objects}"
puts "Total files symlinked: #{num_files_copied}"
puts "Total rows: #{csv_data.size}"
puts "Total files not found: #{num_files_not_found}"
end # end check for content metadata or symlinking/report
puts "Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time) / 60.0)} minutes"