From 5a3908b7a72d49a850b4a338407edd0636c556e0 Mon Sep 17 00:00:00 2001 From: Peter Mangiafico Date: Fri, 26 Jul 2019 15:55:06 -0700 Subject: [PATCH] modifications to content prep script to search all subdirectories --- devel/prepare_content.rb | 51 +++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/devel/prepare_content.rb b/devel/prepare_content.rb index 5df0c0fab..ff39f5e88 100644 --- a/devel/prepare_content.rb +++ b/devel/prepare_content.rb @@ -1,25 +1,26 @@ # Used to stage content from Rumsey or other similar format to folder structure ready for accessioning. # Iterate through each row in the manifest, find files, generate contentMetadata and symlink to new location. -# Set the base content folder below! + +# Note: filenames must match exactly (no leading 0s) but can be in any sub-folder # Peter Mangiafico # November 14, 2017 # see https://consul.stanford.edu/pages/viewpage.action?pageId=146704638 for more documentation # # Run with -# ROBOT_ENVIRONMENT=production ruby devel/prepare_content.rb INPUT_CSV_FILE.csv FULL_PATH_TO_CONTENT FULL_PATH_TO_STAGING_AREA [--exact-match] [--no-object-folders] [--report] [--content-metadata] [--content-metadata-style map] +# ROBOT_ENVIRONMENT=production ruby devel/prepare_content.rb INPUT_CSV_FILE.csv FULL_PATH_TO_CONTENT FULL_PATH_TO_STAGING_AREA [--no-object-folders] [--report] [--content-metadata] [--content-metadata-style map] # e.g. # ROBOT_ENVIRONMENT=production ruby devel/prepare_content.rb /maps/ThirdParty/Rumsey/Rumsey_Batch1.csv /maps/ThirdParty/Rumsey/content /maps/ThirdParty/Rumsey [--no-object-folders] [--report] [--content-metadata] [--content-metadata-style map] # the first parameter is the input CSV (with columns labeled "Object", "Image", and "Label" (image is the filename, object is the object identifier which can be turned into a folder) -# second parameter is the full path to the content folder that will be searched -# third parameter is optional and is the full path to a folder to stage content to (if not provided, will use same path as csv file, and append "staging") +# second parameter is the full path to the content folder that will be searched (i.e. the base content folder) +# Note: files will be searched iteratively through all sub-folders of the base content folder +# third parameter is optional and is the full path to a folder to stage (i.e. symlink) content to - if not provided, will use same path as csv file, and append "staging" # # if you set the --report switch, it will only produce the output report, it will not symlink any files # if you set the --content-metadata switch, it will only generate content metadata for each object using the log file for successfully found files, assuming you also have columns in your input CSV labeled "Druid", "Sequence" and "Label" # if you set the --no-object-folders switch, then all symlinks will be flat in the staging directory (i.e. no object level folders) -- this requires all filenames to be unique across objects, if left off, then object folders will be created to store symlinks -# if you set the --exact-match switch, then only files which match exactly (but not case sensitive) will be found; the normal finding algorithm allows for also matching files that have any number of leading 0s -# file extensions do not matter when matching +# note that file extensions do not matter when matching require File.expand_path(File.dirname(__FILE__) + '/../config/boot') require 'optparse' @@ -30,7 +31,6 @@ content_metadata = false # if set to true, will also generate content-metadata from values supplied in spreadsheet, can be set via switch cm_style = 'map' # defaults to map type content metadata unless overriden no_object_folders = false # if false, then each new object will be in a separately created folder, with symlinks contained inside it; if true, you will get a flat list -exact_match = false # if set to true, only files that match exactly (but case insensitive) will be found; any files with leading 0s will NOT match help="Usage:\n ruby prepare_content.rb INPUT_CSV_FILE BASE_CONTENT_FOLDER [STAGING_FOLDER] [--no-object-folders] [--report] [--content_metadata] [--content_metadata_style STYLE]\n" @@ -48,9 +48,6 @@ opts.on("--no-object-folders") do |ob| no_object_folders=true end - opts.on("--exact-match") do |ob| - exact_match = true - end end.parse! if ARGV.size < 2 @@ -91,7 +88,6 @@ puts "Only producing report" if report puts "Producing content metadata with style '#{cm_style}'" if content_metadata puts "Creating object folders" unless no_object_folders -puts "Exact match algorithm" if exact_match puts "Input CSV File: #{csv_in}" puts "Logging to: #{csv_out}" puts "Base Content Folder: #{base_content_folder}" @@ -192,28 +188,25 @@ # now search for any file which ends with the filename (trying to catch cases where the filename has 0s at the beginning that were dropped from the spreadsheet) puts "......#{Time.now}: looking for file '#{filename}', object '#{object}', label '#{label}'" - files_found = files_to_search.grep(/^[0]*#{filename}\.\S+/i) # allow for leading zeros and case insensitivy; we will further refine later according to options set + # this regular expression will look for files that either match exactly (ignoring extension) + # or that match exacatly but are in a sub-directory (as indicated by having a path separator, e.g. a "/" right before the filename) + # e.g. if you are looking for a file called "test.csv", this will match "test", "test.csv", "test.jpg", "dir/test.csv", "dir/test", but NOT "0test", or "dir/0test.jpg" + files_found = files_to_search.grep(/((.+\/{1}#{filename})|(^#{filename}))\.\S+/i) files_found_basenames = files_found.map { |file| File.basename(file) } - # if found, symlink files that match files_found.each do |input_file| input_filename = File.basename(input_file) - # match check, if exact match, do not look for leading zeros, but allow for case insensitivy - # if not exact match, allow leading 0s, keep case sensitivity - matched = exact_match ? /^#{filename}\.\S+/i.match(input_filename) : /^[0]*#{filename}\.\S+/.match(input_filename) - if matched # the found file matches the supplied filename - message = "found #{input_file}, symlink to object folder #{object_folder}" - output_file_full_path = no_object_folders ? File.join(staging_folder, input_filename) : (File.join(object_folder, input_filename)) - input_file_full_path = Pathname.new(File.join(base_content_folder, input_file)).cleanpath(true).to_s - FileUtils.ln_s(input_file_full_path, output_file_full_path, :force => true) unless (report || File.exist?(output_file_full_path)) - num_files_copied += 1 - success = true - CSV.open(csv_out, 'a') { |f| - output_row = [object, filename, input_filename, sequence, label, druid, success, message, Time.now] - f << output_row - } - puts "......#{message}" - end + message = "found #{input_file}, symlink to object folder #{object_folder}" + output_file_full_path = no_object_folders ? File.join(staging_folder, input_filename) : (File.join(object_folder, input_filename)) + input_file_full_path = Pathname.new(File.join(base_content_folder, input_file)).cleanpath(true).to_s + FileUtils.ln_s(input_file_full_path, output_file_full_path, :force => true) unless (report || File.exist?(output_file_full_path)) + num_files_copied += 1 + success = true + CSV.open(csv_out, 'a') { |f| + output_row = [object, filename, input_filename, sequence, label, druid, success, message, Time.now] + f << output_row + } + puts "......#{message}" end # end loop over all matches # do not log if it was previously missed and we missed it again