# Previously a single untested 200-line method from ./lib/pre_assembly/reporting.rb
# Takes a Bundle, enumerates report data via #each_row
class DiscoveryReport
  attr_reader :bundle, :start_time

  delegate :bundle_dir, :content_md_creation, :manifest, :object_discovery, :project_style, to: :bundle
  delegate :checksums_file, :confirm_checksums, :error_count, :object_filenames_unique?, to: :bundle

  # Records the start time, then asks the bundle to discover its objects and
  # process its manifest so that #each_row has data to enumerate.
  # @param [PreAssembly::Bundle] bundle
  # @raise [ArgumentError] if bundle is not a PreAssembly::Bundle
  def initialize(bundle)
    raise ArgumentError, "expected a PreAssembly::Bundle, got #{bundle.class}" unless bundle.is_a?(PreAssembly::Bundle)
    @start_time = Time.now
    @bundle = bundle
    bundle.discover_objects
    bundle.process_manifest
  end

  # Without a block, returns an Enumerator over the same rows.
  # @return [Enumerable<Hash<Symbol => Object>>]
  # @yield [Hash<Symbol => Object>] data structure about a DigitalObject
  def each_row
    return enum_for(:each_row) unless block_given?
    bundle.objects_to_process.each { |dobj| yield per_dobj(dobj) }
  end

  # Memoized SMPL manifest built from the configured CSV filename and bundle directory.
  # @return [PreAssembly::Smpl]
  def smpl_manifest
    @smpl_manifest ||= PreAssembly::Smpl.new(csv_filename: content_md_creation[:smpl_manifest], bundle_dir: bundle_dir)
  end

  # Builds the report row for one object: a hash of error flags and a hash of counts.
  # @param [PreAssembly::DigitalObject] dobj
  # @return [Hash<Symbol => Object>] with keys :errors and :counts
  def per_dobj(dobj)
    errors = {}
    counts = {
      total_size: dobj.object_files.sum(&:filesize),
      mimetypes: Hash.new(0),
      source_ids: Hash.new(0) # must exist before the tally below, otherwise `+= 1` hits nil
    }
    dobj.object_files.each { |obj| counts[:mimetypes][obj.mimetype] += 1 } # number of files by mimetype
    errors[:filename_no_extension] = true if dobj.object_files.any? { |obj| File.extname(obj.path).empty? }
    counts[:empty_files] = dobj.object_files.count { |obj| obj.filesize == 0 }
    if using_smpl_manifest? # if we are using a SMPL manifest, let's add how many files were found
      bundle_id = File.basename(dobj.unadjusted_container)
      # A container absent from the manifest is reported as an empty manifest rather than crashing.
      cm_files = (smpl_manifest.manifest[bundle_id] || {}).fetch(:files, [])
      counts[:files_in_manifest] = cm_files.count
      relative_paths = dobj.object_files.map(&:relative_path)
      counts[:files_found] = (cm_files.map(&:filename) & relative_paths).count
      errors[:empty_manifest] = true unless counts[:files_in_manifest] > 0
      errors[:files_found_mismatch] = true unless counts[:files_in_manifest] == counts[:files_found]
    end

    errors[:empty_files] = true if counts[:empty_files] > 0
    errors[:empty_object] = true unless counts[:total_size] > 0 # an object is "empty" when it has no bytes
    errors[:missing_files] = true unless dobj.object_files_exist?
    errors[:checksum_mismatch] = true unless !checksums_file || confirm_checksums(dobj)
    errors[:dupes] = true unless object_filenames_unique?(dobj)
    counts[:source_ids][dobj.source_id] += 1
    errors.merge!(registration_check(dobj.determine_druid))
    if using_manifest? # check global uniqueness
      # Array() accepts a single source id as well as a list of them.
      errors[:source_id_dup] = true if Array(dobj.source_id).any? { |id| source_id_in_dor?(id) }
    end
    { errors: errors, counts: counts }
  end

  # Looks the druid up via Dor::Item.find and checks that it has an admin policy object.
  # NOTE(review): #per_dobj passes the result of dobj.determine_druid here -- confirm
  # Dor::Item.find accepts that type.
  # @param [String] druid
  # @return [Hash<Symbol => Boolean>] errors; empty when the item and its APO are both found
  def registration_check(druid)
    begin
      obj = Dor::Item.find(druid)
    rescue ActiveFedora::ObjectNotFoundError
      return { item_not_registered: true }
    end
    begin
      return { apo_empty: true } unless obj.admin_policy_object
    rescue ActiveFedora::ObjectNotFoundError
      # raised while resolving the APO, i.e. the item points at a policy that cannot be found
      return { apo_not_registered: true }
    end
    {}
  end

  # Column headings, which vary with the manifest / SMPL manifest / checksum settings.
  # @return [String] primitive version
  def header
    fields = ['Object Container', 'Number of Items', 'Files with no ext', 'Files with 0 Size', 'Total Size', 'Files Readable']
    fields.concat ['Label', 'Source ID'] if using_manifest?
    fields.concat ['Num Files in CM Manifest', 'All CM files found'] if using_smpl_manifest?
    fields << 'Checksums' if checksums_file
    fields.concat ['Duplicate Filenames?', 'DRUID', 'Registered?', 'APO exists?']
    fields << 'SourceID unique in DOR?' if using_manifest?
    fields.join(' , ')
  end

  # For use by template
  # @return [Array<String>] basenames of files that are not reported as missing
  def skipped_files
    # if these files are in the bundle directory but not in the manifest, they will be ignored and not reported as missing
    files = ['Thumbs.db', '.DS_Store']
    files << File.basename(content_md_creation[:smpl_manifest]) if using_smpl_manifest?
    files << File.basename(manifest) if using_manifest?
    files << File.basename(checksums_file) if checksums_file
    files
  end

  # @return [Boolean] truthy when a manifest is configured and object discovery is set to use it
  def using_manifest?
    manifest && object_discovery[:use_manifest]
  end

  # @return [Boolean] true when the content metadata style is :smpl and the SMPL manifest file exists in bundle_dir
  def using_smpl_manifest?
    content_md_creation[:style] == :smpl && File.exist?(File.join(bundle_dir, content_md_creation[:smpl_manifest]))
  end

  private

  # Whether a lookup by this source id finds anything.
  # NOTE(review): assumes Dor::SearchService.query_by_id returns a collection of matches
  # (empty when nothing matches) -- confirm. Any other return value is judged by truthiness.
  # @param [String] id
  # @return [Boolean]
  def source_id_in_dor?(id)
    found = Dor::SearchService.query_by_id(id)
    found.respond_to?(:empty?) ? !found.empty? : !!found
  end
end
# Builds one DigitalObject per object container and stores the resulting
# list in digital_objects, logging before and after.
def discover_objects
  log "discover_objects()"
  self.digital_objects = object_containers.map do |container_path|
    # digital_object_base_params returns a fresh hash on every call, so it is safe to fill in place
    attrs = digital_object_base_params
    attrs[:container] = actual_container(container_path)
    attrs[:stageable_items] = stageable_items_for(container_path)
    attrs[:unadjusted_container] = container_path
    attrs[:object_files] = discover_object_files(attrs[:stageable_items])
    DigitalObject.new(attrs)
  end
  log "discover_objects(found #{digital_objects.count} objects)"
end

# Settings passed unchanged to every DigitalObject this bundle creates.
# @return [Hash<Symbol => Object>] a new hash each time it is called
def digital_object_base_params
  {
    bundle_dir: bundle_dir,
    content_md_creation: content_md_creation,
    file_attr: file_attr,
    init_assembly_wf: init_assembly_wf,
    new_druid_tree_format: new_druid_tree_format,
    project_name: project_name,
    project_style: project_style,
    smpl_manifest: smpl_manifest,
    staging_dir: staging_dir,
    staging_style: staging_style
  }
end
# Checks the filesystem for this object's expected files.
# @return [Boolean] false when there are no object files or any path is not readable
def object_files_exist?
  paths = object_files.map(&:path)
  !paths.empty? && paths.all? { |candidate| File.readable?(candidate) }
end
container id if dobj.object_files.count == 0 message += report_error_message("none") + " N/A ," # no items found and therefore existence gets an N/A @@ -103,18 +97,16 @@ def discovery_report(params = {}) total_size = (dobj.object_files.inject(0) { |sum, obj| sum + obj.filesize }) / 1048576.0 # compute total size of all files in this object in MB total_size_all_files += total_size # keep running tally of sizes of all discovered files dobj.object_files.each { |obj| mimetypes[obj.mimetype] += 1 } # keep a running tally of number of files by mimetype - filenames_with_no_extension = dobj.object_files.collect { |obj| File.extname(obj.path).empty? }.include?(true) + filenames_with_no_extension = dobj.object_files.any? { |obj| File.extname(obj.path).empty? } file_with_zero_size = dobj.object_files.collect { |obj| obj.filesize == 0 }.include?(true) message += (filenames_with_no_extension ? report_error_message("filenames have no extension") : " no , ") message += (file_with_zero_size ? report_error_message("a file has zero size") : " no , ") message += (total_size == 0 ? report_error_message("object is zero size") : " %.3f" % total_size.to_s + " MB , ") # total size of all files in MB - message += (object_files_exist?(dobj) ? " yes ," : report_error_message("missing or non-readable files")) # check if all files exist and are readable + message += dobj.object_files_exist? ? 
' yes ,' : report_error_message('missing or non-readable files') # check if all files exist and are readable end - if using_manifest # if we are using a manifest, let's add label and source ID from manifest to the report - message += "\"#{dobj.label}\" , " # label - message += "\"#{dobj.source_id}\" ," # source ID - end + # if we are using a manifest, let's add label and source ID from manifest to the report + message += "\"#{dobj.label}\" , \"#{dobj.source_id}\" ," if using_manifest if using_smpl_manifest # if we are using a SMPL manifest, let's add how many files were found cm_files = smpl_manifest.manifest[bundle_id] @@ -134,7 +126,7 @@ def discovery_report(params = {}) message += (object_filenames_unique?(dobj) ? " no ," : report_error_message("dupes")) # check for dupe filenames, important in a nested object that will be flattened - source_ids[dobj.source_id] += 1 if using_manifest # keep track of source_id uniqueness + source_ids[dobj.source_id] += 1 # keep track of source_id uniqueness if confirming_registration # objects should already be registered, let's confirm that dobj.determine_druid @@ -178,7 +170,7 @@ def discovery_report(params = {}) puts "\nConfig filename, #{@config_filename}" puts "Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time) / 60.0)} minutes" - puts "\nTotal Objects that will be Processed, #{total_objects_to_process}" + puts "\nTotal Objects that will be Processed, #{o2p.size}" puts "Total Files and Folders in bundle directory, #{entries_in_bundle_directory.count}" puts "Total Discovered Objects, #{total_objects}" puts "Total Size of all discovered objects, " + "%.3f" % total_size_all_files.to_s + " MB" @@ -196,7 +188,7 @@ def discovery_report(params = {}) elsif !using_manifest if show_other && (entries_in_bundle_directory.count != total_objects) puts "List of entries in bundle directory that will not be discovered: " - puts (entries_in_bundle_directory - objects_in_bundle_directory).join("\n") + puts 
(entries_in_bundle_directory - @digital_objects.map(&:container_basename)).join("\n") end end diff --git a/spec/bundle_spec.rb b/spec/bundle_spec.rb index d3340687f..89c662b2a 100644 --- a/spec/bundle_spec.rb +++ b/spec/bundle_spec.rb @@ -5,14 +5,6 @@ let(:revs) { bundle_setup(:proj_revs) } let(:rumsey) { bundle_setup(:proj_rumsey) } - def bundle_setup(proj) - filename = "spec/test_data/project_config_files/#{proj}.yaml" - @ps = YAML.load(File.read(filename)) - @ps['config_filename'] = filename - @ps['show_progress'] = false - PreAssembly::Bundle.new(@ps) - end - describe "initialize() and other setup" do it "trims the trailing slash from the bundle directory" do expect(revs.bundle_dir).to eq('spec/test_data/bundle_input_a') @@ -562,9 +554,8 @@ def bundle_setup(proj) expect(b.progress_log_file).to eq('spec/test_data/project_config_files/proj_sohp3_progress.yaml') end - it "sets the content_tag_override to the default value when not specified" do + it "sets content_tag_override to the default value when not specified" do expect(revs.project_style[:content_tag_override]).to be_falsey - expect(@ps['project_style'][:content_tag_override]).to be_nil end it "sets the staging_dir to the default value if not specified in the YAML" do diff --git a/spec/services/discover_report_spec.rb b/spec/services/discover_report_spec.rb new file mode 100644 index 000000000..bd9da40b7 --- /dev/null +++ b/spec/services/discover_report_spec.rb @@ -0,0 +1,32 @@ +require 'rails_helper' + +describe DiscoveryReport do + let(:bundle) { bundle_setup(:proj_revs)} + let(:params) { {} } + subject(:report) { described_class.new(bundle) } + + describe '#initialize' do + it 'raises if PreAssembly::Bundle not received' do + expect { described_class.new }.to raise_error(ArgumentError) + expect { described_class.new({}) }.to raise_error(ArgumentError) + end + it 'accepts PreAssembly::Bundle, performs setup' do + expect(bundle).to receive(:discover_objects) + expect(bundle).to receive(:process_manifest) + 
# Loads a project config fixture and builds a bundle from it, recording the
# fixture path and switching progress output off.
# @param [#to_s] proj basename of YAML fixture file
# @return [PreAssembly::Bundle]
def bundle_setup(proj)
  config_path = File.join('spec', 'test_data', 'project_config_files', "#{proj}.yaml")
  settings = YAML.load(File.read(config_path))
  settings.merge!('config_filename' => config_path, 'show_progress' => false)
  PreAssembly::Bundle.new(settings)
end