Skip to content

Commit

Permalink
modifications to content prep script to search all subdirectories
Browse files Browse the repository at this point in the history
  • Loading branch information
peetucket committed Jul 29, 2019
1 parent 574ad79 commit 5a3908b
Showing 1 changed file with 22 additions and 29 deletions.
51 changes: 22 additions & 29 deletions devel/prepare_content.rb
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
# Used to stage content from Rumsey or other similar format to folder structure ready for accessioning.
# Iterate through each row in the manifest, find files, generate contentMetadata and symlink to new location.
# Set the base content folder below!

# Note: filenames must match exactly (no leading 0s) but can be in any sub-folder

# Peter Mangiafico
# November 14, 2017
# see https://consul.stanford.edu/pages/viewpage.action?pageId=146704638 for more documentation
#
# Run with
# ROBOT_ENVIRONMENT=production ruby devel/prepare_content.rb INPUT_CSV_FILE.csv FULL_PATH_TO_CONTENT FULL_PATH_TO_STAGING_AREA [--exact-match] [--no-object-folders] [--report] [--content-metadata] [--content-metadata-style map]
# ROBOT_ENVIRONMENT=production ruby devel/prepare_content.rb INPUT_CSV_FILE.csv FULL_PATH_TO_CONTENT FULL_PATH_TO_STAGING_AREA [--no-object-folders] [--report] [--content-metadata] [--content-metadata-style map]
# e.g.
# ROBOT_ENVIRONMENT=production ruby devel/prepare_content.rb /maps/ThirdParty/Rumsey/Rumsey_Batch1.csv /maps/ThirdParty/Rumsey/content /maps/ThirdParty/Rumsey [--no-object-folders] [--report] [--content-metadata] [--content-metadata-style map]

# the first parameter is the input CSV (with columns labeled "Object", "Image", and "Label" (image is the filename, object is the object identifier which can be turned into a folder)
# second parameter is the full path to the content folder that will be searched
# third parameter is optional and is the full path to a folder to stage content to (if not provided, will use same path as csv file, and append "staging")
# second parameter is the full path to the content folder that will be searched (i.e. the base content folder)
# Note: files will be searched iteratively through all sub-folders of the base content folder
# third parameter is optional and is the full path to a folder to stage (i.e. symlink) content to - if not provided, will use same path as csv file, and append "staging"
#
# if you set the --report switch, it will only produce the output report, it will not symlink any files
# if you set the --content-metadata switch, it will only generate content metadata for each object using the log file for successfully found files, assuming you also have columns in your input CSV labeled "Druid", "Sequence" and "Label"
# if you set the --no-object-folders switch, then all symlinks will be flat in the staging directory (i.e. no object level folders) -- this requires all filenames to be unique across objects, if left off, then object folders will be created to store symlinks
# if you set the --exact-match switch, then only files which match exactly (but not case sensitive) will be found; the normal finding algorithm allows for also matching files that have any number of leading 0s
# file extensions do not matter when matching
# note that file extensions do not matter when matching

require File.expand_path(File.dirname(__FILE__) + '/../config/boot')
require 'optparse'
Expand All @@ -30,7 +31,6 @@
content_metadata = false # if set to true, will also generate content-metadata from values supplied in spreadsheet, can be set via switch
cm_style = 'map' # defaults to map type content metadata unless overriden
no_object_folders = false # if false, then each new object will be in a separately created folder, with symlinks contained inside it; if true, you will get a flat list
exact_match = false # if set to true, only files that match exactly (but case insensitive) will be found; any files with leading 0s will NOT match


help="Usage:\n ruby prepare_content.rb INPUT_CSV_FILE BASE_CONTENT_FOLDER [STAGING_FOLDER] [--no-object-folders] [--report] [--content_metadata] [--content_metadata_style STYLE]\n"
Expand All @@ -48,9 +48,6 @@
opts.on("--no-object-folders") do |ob|
no_object_folders=true
end
opts.on("--exact-match") do |ob|
exact_match = true
end
end.parse!

if ARGV.size < 2
Expand Down Expand Up @@ -91,7 +88,6 @@
puts "Only producing report" if report
puts "Producing content metadata with style '#{cm_style}'" if content_metadata
puts "Creating object folders" unless no_object_folders
puts "Exact match algorithm" if exact_match
puts "Input CSV File: #{csv_in}"
puts "Logging to: #{csv_out}"
puts "Base Content Folder: #{base_content_folder}"
Expand Down Expand Up @@ -192,28 +188,25 @@

# now search for any file which ends with the filename (trying to catch cases where the filename has 0s at the beginning that were dropped from the spreadsheet)
puts "......#{Time.now}: looking for file '#{filename}', object '#{object}', label '#{label}'"
files_found = files_to_search.grep(/^[0]*#{filename}\.\S+/i) # allow for leading zeros and case insensitivy; we will further refine later according to options set
# this regular expression will look for files that either match exactly (ignoring extension)
# or that match exacatly but are in a sub-directory (as indicated by having a path separator, e.g. a "/" right before the filename)
# e.g. if you are looking for a file called "test.csv", this will match "test", "test.csv", "test.jpg", "dir/test.csv", "dir/test", but NOT "0test", or "dir/0test.jpg"
files_found = files_to_search.grep(/((.+\/{1}#{filename})|(^#{filename}))\.\S+/i)
files_found_basenames = files_found.map { |file| File.basename(file) }

# if found, symlink files that match
files_found.each do |input_file|
input_filename = File.basename(input_file)
# match check, if exact match, do not look for leading zeros, but allow for case insensitivy
# if not exact match, allow leading 0s, keep case sensitivity
matched = exact_match ? /^#{filename}\.\S+/i.match(input_filename) : /^[0]*#{filename}\.\S+/.match(input_filename)
if matched # the found file matches the supplied filename
message = "found #{input_file}, symlink to object folder #{object_folder}"
output_file_full_path = no_object_folders ? File.join(staging_folder, input_filename) : (File.join(object_folder, input_filename))
input_file_full_path = Pathname.new(File.join(base_content_folder, input_file)).cleanpath(true).to_s
FileUtils.ln_s(input_file_full_path, output_file_full_path, :force => true) unless (report || File.exist?(output_file_full_path))
num_files_copied += 1
success = true
CSV.open(csv_out, 'a') { |f|
output_row = [object, filename, input_filename, sequence, label, druid, success, message, Time.now]
f << output_row
}
puts "......#{message}"
end
message = "found #{input_file}, symlink to object folder #{object_folder}"
output_file_full_path = no_object_folders ? File.join(staging_folder, input_filename) : (File.join(object_folder, input_filename))
input_file_full_path = Pathname.new(File.join(base_content_folder, input_file)).cleanpath(true).to_s
FileUtils.ln_s(input_file_full_path, output_file_full_path, :force => true) unless (report || File.exist?(output_file_full_path))
num_files_copied += 1
success = true
CSV.open(csv_out, 'a') { |f|
output_row = [object, filename, input_filename, sequence, label, druid, success, message, Time.now]
f << output_row
}
puts "......#{message}"
end # end loop over all matches

# do not log if it was previously missed and we missed it again
Expand Down

0 comments on commit 5a3908b

Please sign in to comment.