Skip to content

Commit

Permalink
Merge 09d6c1c into f0c3877
Browse files Browse the repository at this point in the history
  • Loading branch information
atz committed Aug 28, 2018
2 parents f0c3877 + 09d6c1c commit 5b23349
Show file tree
Hide file tree
Showing 7 changed files with 188 additions and 56 deletions.
107 changes: 107 additions & 0 deletions app/services/discovery_report.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Previously a single untested 200-line method from ./lib/pre_assembly/reporting.rb
# Takes a Bundle, enumerates report data via #each_row
class DiscoveryReport
  attr_reader :bundle, :start_time

  delegate :bundle_dir, :content_md_creation, :manifest, :object_discovery, :project_style, to: :bundle
  delegate :checksums_file, :confirm_checksums, :error_count, :object_filenames_unique?, to: :bundle

  # Records the start time, then asks the bundle to discover its objects and
  # process its manifest so that #each_row has data to enumerate.
  # @param [PreAssembly::Bundle] bundle
  # @raise [ArgumentError] if bundle is not a PreAssembly::Bundle
  def initialize(bundle)
    raise ArgumentError, "PreAssembly::Bundle required, got #{bundle.class}" unless bundle.is_a?(PreAssembly::Bundle)
    @start_time = Time.now
    @bundle = bundle
    bundle.discover_objects
    bundle.process_manifest
  end

  # Without a block, returns an Enumerator over the same rows.
  # @return [Enumerable<Hash<Symbol => Object>>]
  # @yield [Hash<Symbol => Object>] data structure about a DigitalObject
  def each_row
    return enum_for(:each_row) unless block_given?
    bundle.objects_to_process.each { |dobj| yield per_dobj(dobj) }
  end

  # Memoized SMPL manifest built from the bundle's content_md_creation settings.
  # @return [PreAssembly::Smpl]
  def smpl
    @smpl ||= PreAssembly::Smpl.new(csv_filename: content_md_creation[:smpl_manifest], bundle_dir: bundle_dir)
  end

  # Collects counts and error flags for one digital object.
  # @param [PreAssembly::DigitalObject] dobj
  # @return [Hash<Symbol => Object>] with keys :errors (flag => true) and :counts
  def per_dobj(dobj)
    errors = {}
    counts = {
      total_size: dobj.object_files.map(&:filesize).sum,
      mimetypes: Hash.new(0),
      source_ids: Hash.new(0) # must exist before the source_id tally below
    }
    dobj.object_files.each { |obj| counts[:mimetypes][obj.mimetype] += 1 } # number of files by mimetype
    errors[:filename_no_extension] = true if dobj.object_files.any? { |obj| File.extname(obj.path).empty? }
    counts[:empty_files] = dobj.object_files.count { |obj| obj.filesize == 0 }
    if using_smpl_manifest? # if we are using a SMPL manifest, let's add how many files were found
      bundle_id = File.basename(dobj.unadjusted_container)
      # An object absent from the SMPL manifest is reported as :empty_manifest rather than crashing on nil.
      cm_files = (smpl.manifest[bundle_id] || {}).fetch(:files, [])
      counts[:files_in_manifest] = cm_files.count
      relative_paths = dobj.object_files.map(&:relative_path)
      counts[:files_found] = (cm_files.map(&:filename) & relative_paths).count
      errors[:empty_manifest] = true unless counts[:files_in_manifest] > 0
      errors[:files_found_mismatch] = true unless counts[:files_in_manifest] == counts[:files_found]
    end

    errors[:empty_files] = true if counts[:empty_files] > 0
    errors[:empty_object] = true unless counts[:total_size] > 0 # an object is empty when it has NO bytes
    errors[:missing_files] = true unless dobj.object_files_exist?
    errors[:checksum_mismatch] = true if checksums_file && !confirm_checksums(dobj)
    errors[:dupes] = true unless object_filenames_unique?(dobj)
    counts[:source_ids][dobj.source_id] += 1
    # NOTE(review): determine_druid is documented elsewhere as returning a DruidTools::Druid,
    # while registration_check documents a String -- confirm Dor::Item.find accepts it.
    errors.merge!(registration_check(dobj.determine_druid))
    if using_manifest? # check global uniqueness
      # Array() accepts a single source_id or a list of them, and treats an empty/nil
      # search result as "no match" (an empty Array is truthy in Ruby).
      # NOTE(review): presumes query_by_id returns a collection of matches -- confirm.
      errors[:source_id_dup] = true if Array(dobj.source_id).any? { |id| Array(Dor::SearchService.query_by_id(id)).any? }
    end
    { errors: errors, counts: counts }
  end

  # Looks the item up in DOR and checks that it has an admin policy object.
  # @param [String] druid
  # @return [Hash<Symbol => Boolean>] errors
  def registration_check(druid)
    begin
      obj = Dor::Item.find(druid)
    rescue ActiveFedora::ObjectNotFoundError
      return { item_not_registered: true }
    end
    return { apo_empty: true } unless obj.admin_policy_object
    {}
  rescue ActiveFedora::ObjectNotFoundError
    # Reached only when the item was found but resolving its admin policy object raised.
    { apo_not_registered: true }
  end

  # Column titles joined with ' , '; optional columns depend on the bundle configuration.
  # @return [String] primitive version
  def header
    fields = ['Object Container', 'Number of Items', 'Files with no ext', 'Files with 0 Size', 'Total Size', 'Files Readable']
    fields.concat ['Label', 'Source ID'] if using_manifest?
    fields.concat ['Num Files in CM Manifest', 'All CM files found'] if using_smpl_manifest?
    fields << 'Checksums' if checksums_file
    fields.concat ['Duplicate Filenames?', 'DRUID', 'Registered?', 'APO exists?']
    fields << 'SourceID unique in DOR?' if using_manifest?
    fields.join(' , ')
  end

  # For use by template
  # @return [Array<String>] basenames of files that are ignored
  def skipped_files
    # if these files are in the bundle directory but not in the manifest, they will be ignored and not reported as missing
    files = ['Thumbs.db', '.DS_Store']
    files << File.basename(content_md_creation[:smpl_manifest]) if using_smpl_manifest?
    files << File.basename(manifest) if using_manifest?
    files << File.basename(checksums_file) if checksums_file
    files
  end

  # @return [Boolean] truthy when a manifest is configured and object discovery is set to use it
  def using_manifest?
    manifest && object_discovery[:use_manifest]
  end

  # @return [Boolean] true when the content metadata style is :smpl and the SMPL manifest file exists in bundle_dir
  def using_smpl_manifest?
    content_md_creation[:style] == :smpl && File.exist?(File.join(bundle_dir, content_md_creation[:smpl_manifest]))
  end
end
53 changes: 24 additions & 29 deletions lib/pre_assembly/bundle.rb
Original file line number Diff line number Diff line change
Expand Up @@ -305,11 +305,6 @@ def object_filenames_unique?(dobj)
filenames.count == filenames.uniq.count
end

# True only when the object has at least one file and every file path is readable.
# @param dobj an object responding to #object_files, each file responding to #path
# @return [Boolean]
def object_files_exist?(dobj)
  paths = dobj.object_files.map(&:path)
  !paths.empty? && paths.all? { |candidate| File.readable?(candidate) }
end

# Cleanup of objects and associated files in specified environment using logfile as input
def cleanup(steps = [], dry_run = false)
log "cleanup()"
Expand Down Expand Up @@ -358,32 +353,33 @@ def write_validation_warnings(validator, io = STDOUT)
# For each container, creates a new Digitalobject.
def discover_objects
log "discover_objects()"
object_containers.each do |c|
container = actual_container(c)
stageables = stageable_items_for(c)
object_files = discover_object_files(stageables)
params = {
:project_style => project_style,
:bundle_dir => bundle_dir,
:staging_dir => staging_dir,
:project_name => project_name,
:file_attr => file_attr,
:init_assembly_wf => init_assembly_wf,
:content_md_creation => content_md_creation,
:container => container,
:unadjusted_container => c,
:stageable_items => stageables,
:object_files => object_files,
:new_druid_tree_format => new_druid_tree_format,
:staging_style => staging_style,
:smpl_manifest => smpl_manifest
}
dobj = DigitalObject.new params
digital_objects.push dobj
self.digital_objects = object_containers.map do |c|
params = digital_object_base_params.merge(
:container => actual_container(c),
:stageable_items => stageable_items_for(c),
:unadjusted_container => c
)
params[:object_files] = discover_object_files(params[:stageable_items])
DigitalObject.new(params)
end
log "discover_objects(found #{digital_objects.count} objects)"
end

# Parameters shared by every DigitalObject built by this bundle; per-container
# values are merged on top of this hash by the caller.
# @return [Hash<Symbol => Object>]
def digital_object_base_params
  {
    bundle_dir: bundle_dir,
    content_md_creation: content_md_creation,
    file_attr: file_attr,
    init_assembly_wf: init_assembly_wf,
    new_druid_tree_format: new_druid_tree_format,
    project_name: project_name,
    project_style: project_style,
    smpl_manifest: smpl_manifest,
    staging_dir: staging_dir,
    staging_style: staging_style
  }
end

# If user configured pre-assembly to process a limited N of objects,
# return the requested number of object containers.
def pruned_containers(containers)
Expand Down Expand Up @@ -528,9 +524,8 @@ def validate_files(dobj)
elsif f.valid_image? && f.has_color_profile?
tally[:valid] += 1
else
msg = "File validation failed: #{f.path}"
failed_validation = true
raise msg
raise "File validation failed: #{f.path}"
end
end
success = true
Expand Down
7 changes: 7 additions & 0 deletions lib/pre_assembly/digital_object.rb
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def pre_assemble(desc_md_xml = nil)
# Determining the druid.
####

# @return [DruidTools::Druid]
def determine_druid
k = project_style[:get_druid_from]
log " - determine_druid(#{k})"
Expand Down Expand Up @@ -288,6 +289,12 @@ def content_object_files
object_files.reject { |ofile| ofile.exclude_from_content }.sort
end

# Checks filesystem for expected files: there must be at least one object file
# and every one of their paths must be readable.
# @return [Boolean]
def object_files_exist?
  paths = object_files.map(&:path)
  !paths.empty? && paths.all? { |candidate| File.readable?(candidate) }
end

####
# Descriptive metadata.
####
Expand Down
26 changes: 9 additions & 17 deletions lib/pre_assembly/reporting.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,26 +75,20 @@ def discovery_report(params = {})
unique_objects = 0
discover_objects
process_manifest

objects_in_bundle_directory = @digital_objects.collect { |dobj| dobj.container_basename }
all_object_containers = manifest_rows.collect { |r| r[@manifest_cols[:object_container]] }

total_objects = @digital_objects.size

o2p = objects_to_process
total_objects_to_process = o2p.size

source_ids = Hash.new(0) if using_manifest # hash to keep track of local source_id uniqueness
source_ids = Hash.new(0) # hash to keep track of local source_id uniqueness
total_size_all_files = 0
mimetypes = Hash.new(0) # hash to keep track of mimetypes

counter = 0

o2p.each do |dobj|
counter += 1

bundle_id = File.basename(dobj.unadjusted_container)
message = "#{counter} of #{total_objects_to_process} : #{bundle_id} , " # obj container id
message = "#{counter} of #{o2p.size} : #{bundle_id} , " # obj container id

if dobj.object_files.count == 0
message += report_error_message("none") + " N/A ," # no items found and therefore existence gets an N/A
Expand All @@ -103,18 +97,16 @@ def discovery_report(params = {})
total_size = (dobj.object_files.inject(0) { |sum, obj| sum + obj.filesize }) / 1048576.0 # compute total size of all files in this object in MB
total_size_all_files += total_size # keep running tally of sizes of all discovered files
dobj.object_files.each { |obj| mimetypes[obj.mimetype] += 1 } # keep a running tally of number of files by mimetype
filenames_with_no_extension = dobj.object_files.collect { |obj| File.extname(obj.path).empty? }.include?(true)
filenames_with_no_extension = dobj.object_files.any? { |obj| File.extname(obj.path).empty? }
file_with_zero_size = dobj.object_files.collect { |obj| obj.filesize == 0 }.include?(true)
message += (filenames_with_no_extension ? report_error_message("filenames have no extension") : " no , ")
message += (file_with_zero_size ? report_error_message("a file has zero size") : " no , ")
message += (total_size == 0 ? report_error_message("object is zero size") : " %.3f" % total_size.to_s + " MB , ") # total size of all files in MB
message += (object_files_exist?(dobj) ? " yes ," : report_error_message("missing or non-readable files")) # check if all files exist and are readable
message += dobj.object_files_exist? ? ' yes ,' : report_error_message('missing or non-readable files') # check if all files exist and are readable
end

if using_manifest # if we are using a manifest, let's add label and source ID from manifest to the report
message += "\"#{dobj.label}\" , " # label
message += "\"#{dobj.source_id}\" ," # source ID
end
# if we are using a manifest, let's add label and source ID from manifest to the report
message += "\"#{dobj.label}\" , \"#{dobj.source_id}\" ," if using_manifest

if using_smpl_manifest # if we are using a SMPL manifest, let's add how many files were found
cm_files = smpl_manifest.manifest[bundle_id]
Expand All @@ -134,7 +126,7 @@ def discovery_report(params = {})

message += (object_filenames_unique?(dobj) ? " no ," : report_error_message("dupes")) # check for dupe filenames, important in a nested object that will be flattened

source_ids[dobj.source_id] += 1 if using_manifest # keep track of source_id uniqueness
source_ids[dobj.source_id] += 1 # keep track of source_id uniqueness

if confirming_registration # objects should already be registered, let's confirm that
dobj.determine_druid
Expand Down Expand Up @@ -178,7 +170,7 @@ def discovery_report(params = {})

puts "\nConfig filename, #{@config_filename}"
puts "Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time) / 60.0)} minutes"
puts "\nTotal Objects that will be Processed, #{total_objects_to_process}"
puts "\nTotal Objects that will be Processed, #{o2p.size}"
puts "Total Files and Folders in bundle directory, #{entries_in_bundle_directory.count}"
puts "Total Discovered Objects, #{total_objects}"
puts "Total Size of all discovered objects, " + "%.3f" % total_size_all_files.to_s + " MB"
Expand All @@ -196,7 +188,7 @@ def discovery_report(params = {})
elsif !using_manifest
if show_other && (entries_in_bundle_directory.count != total_objects)
puts "List of entries in bundle directory that will not be discovered: "
puts (entries_in_bundle_directory - objects_in_bundle_directory).join("\n")
puts (entries_in_bundle_directory - @digital_objects.map(&:container_basename)).join("\n")
end
end

Expand Down
11 changes: 1 addition & 10 deletions spec/bundle_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,6 @@
let(:revs) { bundle_setup(:proj_revs) }
let(:rumsey) { bundle_setup(:proj_rumsey) }

# Loads the YAML project config fixture named after +proj+, stores the resulting
# settings in @ps (with the config filename recorded and progress output turned
# off) and builds a bundle from them.
# @param [#to_s] proj basename of YAML fixture file
# @return [PreAssembly::Bundle]
def bundle_setup(proj)
  config_path = "spec/test_data/project_config_files/#{proj}.yaml"
  @ps = YAML.load(File.read(config_path)).merge(
    'config_filename' => config_path,
    'show_progress' => false
  )
  PreAssembly::Bundle.new(@ps)
end

describe "initialize() and other setup" do
it "trims the trailing slash from the bundle directory" do
expect(revs.bundle_dir).to eq('spec/test_data/bundle_input_a')
Expand Down Expand Up @@ -562,9 +554,8 @@ def bundle_setup(proj)
expect(b.progress_log_file).to eq('spec/test_data/project_config_files/proj_sohp3_progress.yaml')
end

it "sets the content_tag_override to the default value when not specified" do
it "sets content_tag_override to the default value when not specified" do
expect(revs.project_style[:content_tag_override]).to be_falsey
expect(@ps['project_style'][:content_tag_override]).to be_nil
end

it "sets the staging_dir to the default value if not specified in the YAML" do
Expand Down
31 changes: 31 additions & 0 deletions spec/services/discover_report_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
require 'rails_helper'

# Specs for DiscoveryReport construction and row enumeration.
describe DiscoveryReport do
  subject(:report) { described_class.new(bundle) }

  # Bundle built from the proj_revs YAML fixture
  let(:bundle) { bundle_setup(:proj_revs) }

  describe '#initialize' do
    it 'raises if PreAssembly::Bundle not received' do
      expect { described_class.new }.to raise_error(ArgumentError)
      expect { described_class.new({}) }.to raise_error(ArgumentError)
    end

    it 'accepts PreAssembly::Bundle, performs setup' do
      # the constructor is expected to trigger both setup steps on the bundle
      expect(bundle).to receive(:discover_objects)
      expect(bundle).to receive(:process_manifest)
      expect { described_class.new(bundle) }.not_to raise_error
    end
  end

  describe '#each_row' do
    it 'returns an Enumerable of Hashes' do
      expect(report.each_row).to be_a(Enumerable)
    end

    it 'yields per objects_to_process' do
      stand_ins = [1, 2, 3] # placeholders for digital objects
      stand_ins.each do |n|
        expect(report).to receive(:per_dobj).with(n).and_return(fake: n)
      end
      expect(bundle).to receive(:objects_to_process).and_return(stand_ins)
      report.each_row { |_row| } # no-op
    end
  end
end
9 changes: 9 additions & 0 deletions spec/support/bundle_setup.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Builds a bundle from a YAML project config fixture, recording the config
# filename in the settings and turning progress output off.
# @param [#to_s] proj basename of YAML fixture file
# @return [PreAssembly::Bundle]
def bundle_setup(proj)
  config_path = "spec/test_data/project_config_files/#{proj}.yaml"
  settings = YAML.load(File.read(config_path)).merge(
    'config_filename' => config_path,
    'show_progress' => false
  )
  PreAssembly::Bundle.new(settings)
end

0 comments on commit 5b23349

Please sign in to comment.