From fb6879a6bf9bf025e60874cfc824f837f08c5313 Mon Sep 17 00:00:00 2001 From: Sean Upton Date: Thu, 30 May 2019 13:24:15 -0600 Subject: [PATCH] Ingest CLI (rake task) and Logging (#150) * NDNP BatchIngester has alternate constructor intended for command-line use. * Rake task for NDNP ingest * NDNP PageIngester generates TIFF from PDF when TIFF is missing. * BatchIngester whitelists the batch directory, when called from CLI. * Clean up NDNP title/publication creation, don't duplicate copy of metadata. * Logging mixin for ingest logging * ALTO fixtures (unscaled px units, and 4X scaled points) generated by OCR of ocr_gray.tiff * RenderALTO class supports scaling for use in generating test fixtures --- .rubocop.yml | 3 + .../text_formats_from_alto_service.rb | 8 +- lib/newspaper_works.rb | 1 + .../ingest/ndnp/batch_ingester.rb | 61 +++++- .../ingest/ndnp/container_ingester.rb | 4 +- .../ingest/ndnp/issue_ingester.rb | 34 ++- .../ingest/ndnp/page_ingester.rb | 64 +++++- .../ingest/ndnp/page_metadata.rb | 11 +- lib/newspaper_works/logging.rb | 54 +++++ lib/newspaper_works/page_finder.rb | 1 + .../text_extraction/alto_reader.rb | 33 ++- .../text_extraction/render_alto.rb | 20 +- lib/tasks/newspaper_works_tasks.rake | 22 +- spec/fixtures/files/ocr_alto.xml | 202 ++++++++++++++++++ .../files/ocr_alto_scaled_4pts_per_px.xml | 202 ++++++++++++++++++ .../ingest/ndnp/batch_ingester_spec.rb | 61 ++++++ .../ingest/ndnp/issue_ingester_spec.rb | 18 ++ .../ingest/ndnp/page_ingester_spec.rb | 61 +++++- .../ingest/ndnp/page_metadata_spec.rb | 7 +- spec/lib/newspaper_works/logging_spec.rb | 53 +++++ .../text_extraction/render_alto_spec.rb | 3 + spec/newspaper_works_rake_spec.rb | 82 +++++++ .../text_formats_from_alto_service_spec.rb | 69 ++++++ 23 files changed, 1017 insertions(+), 57 deletions(-) create mode 100644 lib/newspaper_works/logging.rb create mode 100644 spec/fixtures/files/ocr_alto.xml create mode 100644 spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml create mode 100644 spec/lib/newspaper_works/logging_spec.rb create mode 100644 spec/newspaper_works_rake_spec.rb diff --git a/.rubocop.yml b/.rubocop.yml index 8990da5e..8fd8dab2 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -3,6 +3,9 @@ inherit_gem: inherit_from: .rubocop_fixme.yml +RSpec/ExampleLength: + Max: 15 + AllCops: TargetRubyVersion: 2.1 DisplayCopNames: true diff --git a/app/services/newspaper_works/text_formats_from_alto_service.rb b/app/services/newspaper_works/text_formats_from_alto_service.rb index cac74b27..51d2f996 100644 --- a/app/services/newspaper_works/text_formats_from_alto_service.rb +++ b/app/services/newspaper_works/text_formats_from_alto_service.rb @@ -55,8 +55,14 @@ def create_derivatives(_filename) # as this plugin makes derivatives of derivative, _filename is ignored source_file = alto return if source_file.nil? + # Image width from characterized primary file helps ensure proper scaling: + file = @file_set.original_file + width = file.nil? ? nil : file.width[0].to_i # ALTOReader is responsible for transcoding, this class just saves result - reader = NewspaperWorks::TextExtraction::AltoReader.new(source_file) + reader = NewspaperWorks::TextExtraction::AltoReader.new( + source_file, + width + ) save_derivative('json', reader.json) save_derivative('txt', reader.text) end diff --git a/lib/newspaper_works.rb b/lib/newspaper_works.rb index 5c0e6f83..77703875 100644 --- a/lib/newspaper_works.rb +++ b/lib/newspaper_works.rb @@ -4,6 +4,7 @@ require "newspaper_works/data" require "newspaper_works/configuration" require "newspaper_works/page_finder" +require "newspaper_works/logging" # Newspaper works modules module NewspaperWorks diff --git a/lib/newspaper_works/ingest/ndnp/batch_ingester.rb b/lib/newspaper_works/ingest/ndnp/batch_ingester.rb index 524860b7..8d4cde46 100644 --- a/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +++ b/lib/newspaper_works/ingest/ndnp/batch_ingester.rb @@ -1,21 +1,69 @@ -require 'find' require 'date' +require 'find' +require 'optparse' module NewspaperWorks module Ingest module NDNP class BatchIngester + include NewspaperWorks::Logging + attr_accessor :path, :batch + # alternate constructor from ARGV + # @param options [Array] + def self.from_command(options, cmd_name) + path = batch_path(options, cmd_name) + missing_path(cmd_name) if path.nil? + path = xml_path(path) + missing_path(cmd_name, "Not found: #{path}") unless File.exist?(path) + Hyrax.config.whitelisted_ingest_dirs.push(File.dirname(path)) + new(path) + end + + def self.missing_path(cmd_name, msg = "Missing path argument") + STDERR.puts "Usage: #{cmd_name} -- --path=PATH" + STDERR.puts "#{msg}. Exiting." + # rubocop:disable Rails/Exit + exit(1) if cmd_name.start_with?('rake') + # rubocop:enable Rails/Exit + end + + def self.batch_path(options, cmd_name) + path = nil + parser = OptionParser.new + args = parser.order!(options) {} + parser.banner = "Usage: #{cmd_name} -- --path=PATH" + parser.on('-i PATH', '--path PATH') do |p| + path = p + end + parser.parse!(args) + path + end + + def self.xml_path(path) + return path unless File.directory?(path) + batch_xml_path = Find.find(path).select do |f| + f.downcase.end_with?('batch_1.xml', 'batch.xml') + end + batch_xml_path.find { |f| f.end_with?('_1.xml') } || batch_xml_path[0] + end + def initialize(path) - @path = xml_path(path) + @path = self.class.xml_path(path) + raise IOError, "No batch file found: #{path}" if @path.empty? @batch = batch_enumerator + configure_logger('ingest') end def ingest + write_log("Beginning NDNP batch ingest for #{@path}") batch.each do |issue| issue_ingester(issue).ingest end + write_log( + "NDNP batch ingest complete!" + ) end private @@ -32,15 +80,6 @@ def issue_ingester(issue) def normalize_date(v) (v.is_a?(String) ? Date.parse(v) : v).to_s end - - def xml_path(path) - return path unless File.directory?(path) - batch_path = Find.find(path).select do |f| - f.downcase.end_with?('batch_1.xml') - end - raise IOError, 'Batch file not found: #{path}' if batch_path.empty? - batch_path[0] - end end end end diff --git a/lib/newspaper_works/ingest/ndnp/container_ingester.rb b/lib/newspaper_works/ingest/ndnp/container_ingester.rb index ba2d53ad..abe176e8 100644 --- a/lib/newspaper_works/ingest/ndnp/container_ingester.rb +++ b/lib/newspaper_works/ingest/ndnp/container_ingester.rb @@ -26,7 +26,7 @@ def ingest # Link a page to target container # @param page [NewspaperPage] def link(page) - @target.members << page + @target.ordered_members << page # save each link attempt (for now no deferring/bundling) @target.save! end @@ -69,7 +69,7 @@ def copy_metadata def link_publication return unless @target.publication.nil? - @publication.members << @target + @publication.ordered_members << @target @publication.save! end end diff --git a/lib/newspaper_works/ingest/ndnp/issue_ingester.rb b/lib/newspaper_works/ingest/ndnp/issue_ingester.rb index 6b6f9a31..17a26e0b 100644 --- a/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +++ b/lib/newspaper_works/ingest/ndnp/issue_ingester.rb @@ -2,6 +2,8 @@ module NewspaperWorks module Ingest module NDNP class IssueIngester + include NewspaperWorks::Logging + attr_accessor :batch, :issue, :target delegate :path, to: :issue @@ -23,6 +25,7 @@ def initialize(issue, batch = nil) @issue = issue @batch = batch @target = nil + configure_logger('ingest') end def ingest @@ -37,13 +40,13 @@ def construct_issue def ingest_pages issue.each do |page| - NewspaperWorks::Ingest::NDNP::PageIngester.new(page, @target).ingest + page_ingest(page) end end private - def page_ingester(page_data) + def page_ingest(page_data) NewspaperWorks::Ingest::NDNP::PageIngester.new( page_data, @target @@ -70,6 +73,7 @@ def create_issue @target = NewspaperIssue.create copy_issue_metadata @target.save! + write_log("Saved metadata to new NewspaperIssue #{@target.id}") end # @param lccn [String] Library of Congress Control Number @@ -87,14 +91,32 @@ def copy_publication_title(publication) publication.place_of_publication = [uri] unless uri.nil? end + def create_publication(lccn) + publication = NewspaperTitle.create + copy_publication_title(publication) + publication.lccn ||= lccn + publication.save! + write_log( + "Created NewspaperTitle work #{publication.id} for LCCN #{lccn}" + ) + publication + end + def find_or_create_linked_publication lccn = issue.metadata.lccn publication = find_publication(lccn) - publication = NewspaperTitle.create if publication.nil? - copy_publication_title(publication) - publication.lccn ||= lccn - publication.members << @target + unless publication.nil? + write_log( + "Found existing NewspaperTitle #{publication.id}, LCCN #{lccn}" + ) + end + publication = create_publication(lccn) if publication.nil? + publication.ordered_members << @target publication.save! + write_log( + "Linked NewspaperIssue #{@target.id} to "\ + "NewspaperTitle work #{publication.id}" + ) end end end diff --git a/lib/newspaper_works/ingest/ndnp/page_ingester.rb b/lib/newspaper_works/ingest/ndnp/page_ingester.rb index 1627bdd2..a85b68a3 100644 --- a/lib/newspaper_works/ingest/ndnp/page_ingester.rb +++ b/lib/newspaper_works/ingest/ndnp/page_ingester.rb @@ -1,7 +1,11 @@ +require 'newspaper_works/logging' + module NewspaperWorks module Ingest module NDNP + # rubocop:disable Metrics/ClassLength class PageIngester + include NewspaperWorks::Logging attr_accessor :page, :issue, :target delegate :path, :dmdid, to: :page @@ -25,6 +29,8 @@ def initialize(page, issue) @issue = issue # target is to-be-created NewspaperPage: @target = nil + @work_files = nil + configure_logger('ingest') end def ingest @@ -34,11 +40,15 @@ def ingest end def construct_page - @target = NewspaperPage.create - @target.title = page_title + @target = NewspaperPage.create!(title: page_title) + write_log( + "Created NewspaperPage work #{@target.id} "\ + "with title '#{@target.title[0]}'" + ) copy_page_metadata link_issue @target.save! + write_log("Saved metadata to NewspaperPage work #{@target.id}") end # Ingest primary, derivative files; other derivatives including @@ -46,16 +56,18 @@ def construct_page # derivative service components as a consequence of commiting # files assigned (via actor stack, via WorkFiles). def ingest_page_files - work_files = NewspaperWorks::Data::WorkFiles.new(@target) + @work_files = NewspaperWorks::Data::WorkFiles.new(@target) page.files.each do |path| ext = path.downcase.split('.')[-1] if ['tif', 'tiff'].include?(ext) - work_files.assign(path) + ingest_primary_file(path) else - work_files.derivatives.assign(path) + ingest_derivative_file(path) end end - work_files.commit! + write_log("Beginning file attachment process (WorkFiles.commit!) "\ + "for work #{@target.id}") + @work_files.commit! end def link_reel @@ -73,9 +85,46 @@ def link_reel private + def ingest_primary_file(path) + unless File.exist?(path) + pdf_path = page.files.select { |p| p.end_with?('pdf') }[0] + # make and get TIFF path (to generated tmp file): + path = make_tiff(pdf_path) + end + write_log("Assigned primary file to work #{@target.id}, #{path}") + @work_files.assign(path) + end + + def ingest_derivative_file(path) + write_log("Assigned derivative file to work #{@target.id}, #{path}") + @work_files.derivatives.assign(path) + end + def link_issue - issue.members << @target # page + issue.ordered_members << @target # page issue.save! + write_log( + "Linked NewspaperIssue work #{issue.id} "\ + "to NewspaperPage work #{@target.id}" + ) + end + + # dir whitelist + def whitelist + Hyrax.config.whitelisted_ingest_dirs + end + + # Generate TIFF in temporary file, return its path, given path to PDF + # @param pdf_path [String] path to single-page PDF + # @return [String] path to generated TIFF + def make_tiff(pdf_path) + write_log( + "Creating TIFF from PDF in lieu of missing for work "\ + " (#{@target.id})", + Logger::WARN + ) + whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir) + NewspaperWorks::Ingest::PdfPages.new(pdf_path).to_a[0] end # Page title as issue title plus page title @@ -96,6 +145,7 @@ def copy_page_metadata end end end + # rubocop:enable Metrics/ClassLength end end end diff --git a/lib/newspaper_works/ingest/ndnp/page_metadata.rb b/lib/newspaper_works/ingest/ndnp/page_metadata.rb index 8b49eecc..d7fbcc61 100644 --- a/lib/newspaper_works/ingest/ndnp/page_metadata.rb +++ b/lib/newspaper_works/ingest/ndnp/page_metadata.rb @@ -31,9 +31,9 @@ def inspect # "Number" is used liberaly, and may contain both alpha # and numeric characters. As such, return value is String. # - # Recommendation: callers may (strongly recommended) fall back to: - # `page.page_number || page.page_sequence_number.to_s`, - # however, this is not implemented automatically by this class. + # If NDNP issue data fails to provide an explicitly + # human-readable page number, fallback to sequence + # number, in String form. # # @return [String, NilClass] Page "number" string def page_number @@ -41,7 +41,10 @@ def page_number ".//mods:mods//mods:detail[@type='page number']", **XML_NS ) - return nil if detail.size.zero? + if detail.size.zero? + fallback = page_sequence_number + return fallback.nil? ? nil : fallback.to_s + end detail.xpath("mods:number", **XML_NS).first.text end diff --git a/lib/newspaper_works/logging.rb b/lib/newspaper_works/logging.rb new file mode 100644 index 00000000..5ec1b649 --- /dev/null +++ b/lib/newspaper_works/logging.rb @@ -0,0 +1,54 @@ +module NewspaperWorks + module Logging + class << self + attr_accessor :configured + end + self.configured = [] + + def logger + @logger = Rails.logger + end + + # Log message, as in standard logger, but use message_format on message. + # @param severity [Integer] log level/severity, e.g. Logger::INFO == 2 + # @param msg [String] Log message to be formatted by message_format + # @param progname [String] (optional) + def log(severity, msg, progname = nil, &block) + logger.add(severity, message_format(msg), progname, &block) + end + + # Simpler alternative to .log, with default severity, message_format + # wrapping. + # @param msg [String] Log message to be formatted by message_format + # @param severity [Integer] log level/severity, e.g. Logger::INFO == 2 + # @param progname [String] + def write_log(msg, severity = Logger::INFO, progname = nil) + logger.add(severity, message_format(msg), progname) + end + + # format message, distinct from per-output formatting, to be used in + # all logging channels Rails.logger broadcasts to. This wrapping + # indicates in parenthetical prefix which class is acting to + # produce message. + # @param msg [String] + def message_format(msg) + "(#{self.class}) #{msg}" + end + + # Should be called by consuming class, prior to use of .logger method + # has checks to prevent duplicate configuration if already configured. + def configure_logger(name) + @logger = Rails.logger + return if NewspaperWorks::Logging.configured.include?(name) + path = Rails.root.join("log/#{name}.log") + @named_log = ActiveSupport::Logger.new(path) + @named_log.formatter = proc do |_severity, datetime, _progname, msg| + "#{datetime}: #{msg}\n" + end + # rails will log to named_log in addition to any other configured + # or default logging destinations: + @logger.extend(ActiveSupport::Logger.broadcast(@named_log)) + NewspaperWorks::Logging.configured.push(name) + end + end +end diff --git a/lib/newspaper_works/page_finder.rb b/lib/newspaper_works/page_finder.rb index e04256b0..78b6b28f 100644 --- a/lib/newspaper_works/page_finder.rb +++ b/lib/newspaper_works/page_finder.rb @@ -32,6 +32,7 @@ def ordered_pages(documents) final_page_id = doc['id'] end end + return documents if next_page_id.nil? while next_page_id != final_page_id next_page = documents.select { |doc| doc['id'] == next_page_id }.first ordered_list.insert(-2, next_page) diff --git a/lib/newspaper_works/text_extraction/alto_reader.rb b/lib/newspaper_works/text_extraction/alto_reader.rb index 9b6d95c3..d80b34e2 100644 --- a/lib/newspaper_works/text_extraction/alto_reader.rb +++ b/lib/newspaper_works/text_extraction/alto_reader.rb @@ -14,8 +14,11 @@ class AltoReader class AltoDocStream < Nokogiri::XML::SAX::Document attr_accessor :text, :words - def initialize - super + def initialize(image_width = nil) + super() + # scaling matters: + @image_width = image_width + @scaling = 1.0 # pt to px, if ALTO using points # plain text buffer: @text = '' # list of word hash, containing word+coord: @@ -27,13 +30,26 @@ def initialize # @param attrs [Hash] hash containing ALTO `String` element attributes. # @return [Array] Array of position x, y, width, height in px. def s_coords(attrs) - height = (attrs['HEIGHT'] || 0).to_i - width = (attrs['WIDTH'] || 0).to_i - hpos = (attrs['HPOS'] || 0).to_i - vpos = (attrs['VPOS'] || 0).to_i + height = scale_value((attrs['HEIGHT'] || 0).to_i) + width = scale_value((attrs['WIDTH'] || 0).to_i) + hpos = scale_value((attrs['HPOS'] || 0).to_i) + vpos = scale_value((attrs['VPOS'] || 0).to_i) [hpos, vpos, width, height] end + def compute_scaling(attrs) + return if @image_width.nil? + match = attrs.select { |e| e[0].casecmp?('WIDTH') }[0] + return if match.empty? + page_width = match[1].to_i + return if @image_width == page_width + @scaling = page_width / @image_width.to_f + end + + def scale_value(v) + (v / @scaling).to_i + end + # Callback for element start, implementation of which ignores # non-String elements. # @@ -41,6 +57,7 @@ def s_coords(attrs) # @param attrs [Array] Array of key, value pair Arrays. def start_element(name, attrs = []) values = attrs.to_h + compute_scaling(attrs) if name == 'Page' return if name != 'String' token = values['CONTENT'] @text << token @@ -73,9 +90,9 @@ def end_document # Construct with either path # # @param xml [String], and process document - def initialize(xml) + def initialize(xml, image_width = nil) @source = isxml?(xml) ? xml : File.read(xml) - @doc_stream = AltoDocStream.new + @doc_stream = AltoDocStream.new(image_width) parser = Nokogiri::XML::SAX::Parser.new(doc_stream) parser.parse(@source) end diff --git a/lib/newspaper_works/text_extraction/render_alto.rb b/lib/newspaper_works/text_extraction/render_alto.rb index c288dd4a..01d8afa4 100644 --- a/lib/newspaper_works/text_extraction/render_alto.rb +++ b/lib/newspaper_works/text_extraction/render_alto.rb @@ -4,9 +4,10 @@ module NewspaperWorks # Module for text extraction (OCR or otherwise) module TextExtraction class RenderAlto - def initialize(width, height) + def initialize(width, height, scaling = 1.0) @height = height @width = width + @scaling = scaling end def to_alto(words) @@ -14,10 +15,10 @@ def to_alto(words) words.each do |word| xml.String( CONTENT: word[:word], - HEIGHT: (word[:y_end] - word[:y_start]).to_s, - WIDTH: (word[:x_end] - word[:x_start]).to_s, - HPOS: word[:x_start].to_s, - VPOS: word[:y_start].to_s + HEIGHT: scale_point(word[:y_end] - word[:y_start]).to_s, + WIDTH: scale_point(word[:x_end] - word[:x_start]).to_s, + HPOS: scale_point(word[:x_start]).to_s, + VPOS: scale_point(word[:y_start]).to_s ) { xml.text '' } end end @@ -39,12 +40,19 @@ def alto_page(pxwidth, pxheight, &block) builder end + def scale_point(value) + # note: presuming non-fractional, even though ALTO 2.1 + # specifies coordinates are xsd:float, not xsd:int, + # simplify to integer value for output: + (value * @scaling).to_i + end + # return layout for page def alto_layout(xml, pxwidth, pxheight, &block) xml.Layout do xml.Page(ID: 'ID1', PHYSICAL_IMG_NR: '1', - HEIGHT: pxwidth.to_i, + HEIGHT: pxheight.to_i, WIDTH: pxwidth.to_i) do xml.PrintSpace(HEIGHT: pxheight.to_i, WIDTH: pxwidth.to_i, diff --git a/lib/tasks/newspaper_works_tasks.rake b/lib/tasks/newspaper_works_tasks.rake index cef6f9f0..47fbfbb8 100644 --- a/lib/tasks/newspaper_works_tasks.rake +++ b/lib/tasks/newspaper_works_tasks.rake @@ -1,4 +1,18 @@ -# desc 'Explaining what the task does' -# task :newspaper_works do -# # Task goes here -# end +namespace :newspaper_works do + def use_application + ENV['RAILS_ENV'] = Rails.env if ENV['RAILS_ENV'].nil? + Rails.application.require_environment! + end + + desc 'Ingest an NDNP batch: "rake newspaper_works:ingest_ndnp -- --path="' + task :ingest_ndnp do + use_application + ingester = NewspaperWorks::Ingest::NDNP::BatchIngester.from_command( + ARGV, + 'rake newspaper_works:ingest_ndnp --' + ) + puts "Beginning NDNP batch ingest..." + ingester.ingest + puts "NDNP batch ingest complete! See log/ingest.log for details." + end +end diff --git a/spec/fixtures/files/ocr_alto.xml b/spec/fixtures/files/ocr_alto.xml new file mode 100644 index 00000000..dbeeace3 --- /dev/null +++ b/spec/fixtures/files/ocr_alto.xml @@ -0,0 +1,202 @@ + + + + pixel + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml b/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml new file mode 100644 index 00000000..14006bcd --- /dev/null +++ b/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml @@ -0,0 +1,202 @@ + + + + pixel + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb b/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb index 3ae7dbc1..88f919e1 100644 --- a/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +++ b/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb @@ -20,6 +20,15 @@ end describe "ingests issues" do + def expect_start_finish_logging(adapter) + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Beginning NDNP batch ingest') } + ).once + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('NDNP batch ingest complete') } + ).once + end + it "calls ingest for all issues in batch" do adapter = described_class.new(batch1) issue_ingest_call_count = 0 @@ -27,8 +36,60 @@ allow_any_instance_of(NewspaperWorks::Ingest::NDNP::IssueIngester).to \ receive(:ingest) { issue_ingest_call_count += 1 } # rubocop:enable RSpec/AnyInstance + expect_start_finish_logging(adapter) adapter.ingest expect(issue_ingest_call_count).to eq 4 end end + + describe "command invocation" do + def construct(args) + described_class.from_command( + args, + 'rake newspaper_works:ingest_ndnp --' + ) + end + + it "creates ingester from command arguments" do + fake_argv = ['newspaper_works:ingest_ndnp', '--', "--path=#{batch1}"] + adapter = construct(fake_argv) + expect(adapter).to be_a described_class + expect(adapter.path).to eq batch1 + end + + it "creates ingester from command with dir path" do + # command can accept a parent directory for batch: + base_path = File.dirname(batch1) + fake_argv = ['newspaper_works:ingest_ndnp', '--', "--path=#{base_path}"] + adapter = construct(fake_argv) + expect(adapter).to be_a described_class + # adapter.path is path to actual XML + expect(adapter.path).to eq batch1 + end + + it "exits on file not found for batch" do + fake_argv = ['newspaper_works:ingest_ndnp', '--', "--path=123/45/5678"] + begin + construct(fake_argv) + rescue SystemExit => e + expect(e.status).to eq(1) + end + end + + it "exits on missing path for batch" do + fake_argv = ['newspaper_works:ingest_ndnp', '--'] + begin + construct(fake_argv) + rescue SystemExit => e + expect(e.status).to eq(1) + end + end + + it "exits on unexpected arguments" do + fake_argv = ['newspaper_works:ingest_ndnp', '--', '--foo=bar'] + expect { construct(fake_argv) }.to raise_error( + OptionParser::InvalidOption + ) + end + end end diff --git a/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb b/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb index 256c8b83..59994f17 100644 --- a/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +++ b/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb @@ -15,6 +15,12 @@ let(:adapter) { described_class.new(issue_data) } describe "adapter and asset construction" do + def expect_issue_import_logging(adapter) + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Saved metadata to new NewspaperIssue') } + ).once + end + it "constructs adapter with issue source" do expect(adapter.issue).to be issue_data expect(adapter.path).to eq issue_data.path @@ -30,10 +36,21 @@ expect(adapter.batch).to be batch end + # rubocop:disable RSpec/ExampleLength it "constructs NewspaperIssue with adapter" do # construct_issue is only the first part of ingest, create issue # and find-or-link publication NewspaperTitle; # this does not trigger creation of child pages. + expect_issue_import_logging(adapter) + expect(adapter).to receive(:write_log).with( + satisfy do |v| + v.include?('Created NewspaperTitle work') || + v.include?('Found existing NewspaperTitle') + end + ).once + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Linked NewspaperIssue') } + ).once adapter.construct_issue issue = adapter.target expect(issue).to be_a NewspaperIssue @@ -43,6 +60,7 @@ expect(publication.lccn).to eq issue_data.metadata.lccn expect(publication.title).to contain_exactly 'The Park Record' end + # rubocop:enable RSpec/ExampleLength it "creates new NewspaperTitle without place of publication" do # clear any existing publications from previous testing diff --git a/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb b/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb index 47803c53..909b7cb5 100644 --- a/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +++ b/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb @@ -33,6 +33,7 @@ expect(page).to be_a NewspaperPage expect(page.id).not_to be_nil expect(issue.members).to include page + expect(issue.ordered_members.to_a).to include page end end @@ -84,24 +85,74 @@ page = adapter.target page.reload expect(page.container).not_to be_nil - expect(page.container.members.map(&:id)).to include page.id + expect(page.container.ordered_members.to_a.map(&:id)).to include page.id end end describe "file import integration" do do_now_jobs = [IngestLocalFileJob, IngestJob, InheritPermissionsJob] - it "attaches primary, derivative files", perform_enqueued: do_now_jobs do - adapter.ingest - page = adapter.target + let(:issue_data) do + NewspaperWorks::Ingest::NDNP::IssueIngest.new(issue2) + end + + let(:page_data_minus_tiff) { issue_data.to_a[0] } + + def check_fileset(page) fileset = page.members.select { |m| m.class == FileSet }[0] # Reload fileset because jobs have modified: fileset.reload expect(fileset).not_to be_nil expect(fileset.original_file).not_to be_nil expect(fileset.original_file.mime_type).to eq 'image/tiff' - derivatives = NewspaperWorks::Data::WorkDerivatives.new(page, fileset) + expect(fileset.original_file.size).to be > 0 + end + + def expect_file_assignment_logging(adapter) + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Assigned primary file to work') } + ).once + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Assigned derivative file to work') } + ).exactly(3).times + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Beginning file attachment') } + ).once + end + + def expect_page_import_logging(adapter) + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Created NewspaperPage work') } + ).once + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Saved metadata to NewspaperPage work') } + ).once + expect(adapter).to receive(:write_log).with( + satisfy { |v| v.include?('Linked NewspaperIssue') } + ).once + end + + it "attaches primary, derivative files", perform_enqueued: do_now_jobs do + expect_page_import_logging(adapter) + expect_file_assignment_logging(adapter) + adapter.ingest + page = adapter.target + check_fileset(page) + derivatives = NewspaperWorks::Data::WorkDerivatives.new(page) expect(derivatives.keys).to match_array ["jp2", "xml", "pdf"] end + + # support this use-case for evaluation purposes + it "generates TIFF when missing from page", perform_enqueued: do_now_jobs do + adapter = described_class.new(page_data_minus_tiff, issue) + expect_page_import_logging(adapter) + expect(adapter).to receive(:write_log).with( + satisfy { |arg| arg.include?('Creating TIFF') }, + Logger::WARN + ).exactly(1).times + expect_file_assignment_logging(adapter) + expect { adapter.ingest }.not_to raise_error + check_fileset(adapter.target) + end end end diff --git a/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb b/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb index a730dd32..439dbd17 100644 --- a/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +++ b/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb @@ -37,8 +37,8 @@ describe "sample fixture 'batch_test_ver01" do let(:page) { described_class.new(issue2, nil, 'pageModsBib1') } - it "returns nil page number for page without one" do - expect(page.page_number).to eq nil + it "fallback to sequence number on page without page number" do + expect(page.page_number).to eq page.page_sequence_number.to_s end it "gets expected sequence number as Integer" do @@ -61,8 +61,9 @@ describe "sample fixture via Reel XML" do let(:page) { described_class.new(reel1, nil, 'targetModsBib1') } - it "returns nil page number for page without one" do + it "return nil page number when page and sequence missing" do expect(page.page_number).to eq nil + expect(page.page_sequence_number).to eq nil end it "gets expected sequence number as Integer" do diff --git a/spec/lib/newspaper_works/logging_spec.rb b/spec/lib/newspaper_works/logging_spec.rb new file mode 100644 index 00000000..80360613 --- /dev/null +++ b/spec/lib/newspaper_works/logging_spec.rb @@ -0,0 +1,53 @@ +require 'spec_helper' + +describe NewspaperWorks::Logging do + describe "mixin logging module" do + let(:klass) do + Class.new do + include NewspaperWorks::Logging + end + end + + let(:loggable) { klass.new } + + let(:configured) do + obj = loggable + # expectation is that this is called by consuming class constructor: + obj.configure_logger('ingest-test') + obj + end + + it "requires configuration by consuming class" do + name = 'random_testing_logname' + expect(loggable.instance_variable_get(:@logger)).to be_nil + expect(described_class.configured).not_to include name + loggable.configure_logger(name) + expect(loggable.instance_variable_get(:@logger)).not_to be_nil + expect(described_class.configured).to include name + end + + it "logs formatted message to rails logger with write_log" do + message = "FYI: heads-up, this is a message" + expect(Rails.logger).to receive(:add).with( + Logger::INFO, + configured.message_format(message), + nil + ) + configured.write_log(message) + end + + it "writes to named log file" do + # need to reset global de-dupe state for additional logger, just for + # purposes of this test + described_class.configured = [] + message = "Instant coffee" + named_log = configured.instance_variable_get(:@named_log) + expect(named_log).to receive(:add).with( + Logger::INFO, + configured.message_format(message), + nil + ) + configured.write_log(message) + end + end +end diff --git a/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb b/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb index 4acc67a6..07a1f194 100644 --- a/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +++ b/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb @@ -12,6 +12,8 @@ Nokogiri::XML::Schema(File.read(xsdpath)) end + let(:page_prefix) { '