diff --git a/app/jobs/ingest_pulfa_job.rb b/app/jobs/ingest_pulfa_job.rb
new file mode 100644
index 000000000..fbf6238bc
--- /dev/null
+++ b/app/jobs/ingest_pulfa_job.rb
@@ -0,0 +1,108 @@
+# Ingests a PULFA METS file as a ScannedResource. Each fileGrp with a master
+# file becomes an ordered FileSet page; a PDF deliverable without a master is
+# attached as a related 'Original PDF' FileSet.
+class IngestPULFAJob < ApplicationJob
+  queue_as :ingest
+
+  # @param [String] mets_file Filename of a PULFA METS file to ingest
+  # @param [User] user User to ingest as
+  def perform(mets_file, user)
+    logger.info "Ingesting PULFA METS #{mets_file}"
+    # Block form closes the handle as soon as the document has been parsed.
+    @mets = File.open(mets_file) { |f| Nokogiri::XML(f) }
+    @user = user
+    @pages = []
+
+    ingest
+  end
+
+  private
+
+    # Creates the ScannedResource, ingests every fileGrp, then saves page order.
+    def ingest
+      r = ScannedResource.new
+      r.title = [@mets.xpath("//mets:structMap/mets:div/@LABEL").first.value]
+      r.replaces = @mets.xpath('/mets:mets/@OBJID').first.value
+      r.rights_statement = 'http://rightsstatements.org/vocab/NKC/1.0/'
+      r.visibility = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC
+      r.apply_depositor_metadata @user
+      r.save!
+      Workflow::InitializeState.call(r, 'book_works', 'final_review')
+      logger.info "Created ScannedResource #{r.id} (#{r.replaces})"
+
+      # each rather than map: the block is run only for its side effects.
+      @mets.xpath("/mets:mets/mets:fileSec/mets:fileGrp").each do |group|
+        master = file_info(group.xpath("mets:file[@USE='master']"))
+        service = file_info(group.xpath("mets:file[@USE='deliverable']"))
+        if master[:file]
+          ingest_page(r, master, service)
+        elsif service[:type] == 'application/pdf'
+          attach_pdf(r, service)
+        end
+      end
+
+      # add pages to order
+      r.ordered_members = @pages
+      r.save!
+    end
+
+    # Describes one mets:file selection.
+    # @param [Nokogiri::XML::NodeSet] file mets:file nodes selected by USE
+    # @return [Hash] :id, :file, :use, :type and :title; {} when nothing matched
+    def file_info(file)
+      return {} if file.empty?
+      file_urn = file.xpath("mets:FLocat/@xlink:href").first.value
+      use = file.attribute('USE').value
+      fn = use == 'master' ? master_for(file_urn) : service_for(file_urn)
+      groupid = file.xpath('../@ID').first.value
+      title = file.xpath("//mets:div[mets:fptr/@FILEID='#{groupid}']/@LABEL").first.value
+      { id: id_for(file_urn), file: fn, use: use, type: file.attribute('MIMETYPE').value, title: title }
+    end
+
+    # NOTE(review): String#sub with a String pattern matches it literally, so
+    # '.*:' is not a wildcard and the URN normally comes back unchanged.
+    # FileSet#create_derivatives checks `replaces` for the 'urn:pudl:images'
+    # prefix, so switching to a Regexp would alter which FileSets skip
+    # derivative generation. Left untouched on purpose.
+    def id_for(file_urn)
+      file_urn.sub('.*:', '')
+    end
+
+    # Maps a master image URN to a path under the configured master_files dir.
+    def master_for(master_urn)
+      master_urn.sub('urn:pudl:images:master:', "#{Plum.config['pulfa']['master_files']}/")
+    end
+
+    # Maps a deliverable URN to a path under the configured service_files dir.
+    def service_for(service_urn)
+      service_urn.sub('urn:pudl:images:deliverable:', "#{Plum.config['pulfa']['service_files']}/")
+    end
+
+    # Attaches the deliverable PDF to the resource as a related FileSet.
+    # NOTE(review): the File handle is never explicitly closed; confirm the
+    # actor consumes it synchronously before switching to the block form.
+    def attach_pdf(resource, pdf_info)
+      pdf_file_set = FileSet.new
+      pdf_file_set.title = ['Original PDF']
+      actor = BatchFileSetActor.new(pdf_file_set, @user)
+      actor.attach_related_object(resource)
+      actor.attach_content(File.open(pdf_info[:file]))
+      logger.info "Attached PDF #{pdf_info[:file]}"
+    end
+
+    # Ingests one page: the TIFF becomes the FileSet content and the existing
+    # JP2 is copied to the FileSet's 'intermediate_file' derivative path.
+    def ingest_page(resource, tiff_info, jp2_info)
+      file_set = FileSet.new
+      file_set.title = [tiff_info[:title]]
+      file_set.replaces = tiff_info[:id]
+      actor = BatchFileSetActor.new(file_set, @user)
+      actor.create_metadata(resource, {})
+      actor.create_content(File.open(tiff_info[:file]))
+      @pages << file_set
+      logger.info "Ingested TIFF #{tiff_info[:file]}"
+
+      dest = PairtreeDerivativePath.derivative_path_for_reference file_set.id, 'intermediate_file'
+      FileUtils.mkdir_p File.dirname(dest)
+      FileUtils.cp jp2_info[:file], dest
+      logger.info "Copied JP2 #{jp2_info[:file]} to #{dest}"
+    end
+end
diff --git a/app/models/file_set.rb b/app/models/file_set.rb
index 555c68b8a..623cc5571 100644
--- a/app/models/file_set.rb
+++ b/app/models/file_set.rb
@@ -31,6 +31,8 @@ def iiif_path
   end
 
   def create_derivatives(filename)
+    # IngestPULFAJob copies a ready-made JP2 into place for these, so skip generation.
+    return if replaces && replaces.start_with?('urn:pudl:images')
     case mime_type_storage.first
     when 'image/tiff'
       Hydra::Derivatives::Jpeg2kImageDerivatives.create(
diff --git a/config/config.yml b/config/config.yml
index af0f6aedc..bd997dd66 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -39,12 +39,18 @@ defaults: &defaults
     exchange: 'plum_events'
   geo_derivatives_path: <%= File.join(Rails.root, 'tmp', 'geo-derivatives') %>
   geoblacklight_provenance: 'Princeton'
+  pulfa:
+    master_files: <%= File.join(Rails.root, 'tmp', 'pulfa-master') %>
+    service_files: <%= File.join(Rails.root, 'tmp', 'pulfa-service') %>
 
 development:
   <<: *defaults
 
 test:
   <<: *defaults
+  pulfa:
+    master_files: <%= File.join(Rails.root, 'spec', 'fixtures', 'files') %>
+    service_files: <%= File.join(Rails.root, 'spec', 'fixtures', 'files') %>
 
 production:
   <<: *defaults
diff --git a/config/environments/development.rb b/config/environments/development.rb
index cd4acb514..292df7766 100644
--- a/config/environments/development.rb
+++ b/config/environments/development.rb
@@ -39,4 +39,7 @@
   # Raises error for missing translations
   # config.action_view.raise_on_missing_translations = true
   config.action_mailer.default_url_options = { host: "localhost:3000" }
+
+  # Run ActiveJob jobs inline
+  config.active_job.queue_adapter = :inline
 end
diff --git a/lib/tasks/pulfa.rake b/lib/tasks/pulfa.rake
new file mode 100644
index 000000000..8baac4f74
--- /dev/null
+++ b/lib/tasks/pulfa.rake
@@ -0,0 +1,27 @@
+namespace :pulfa do
+  desc "Ingest a PULFA METS file"
+  task ingest: :environment do
+    # Validate input first so the usage message is shown before anything else.
+    dir = ENV['METS_DIR']
+    abort "usage: rake pulfa:ingest METS_DIR=/path/to/mets/files" unless dir && Dir.exist?(dir)
+
+    user = User.find_by_user_key(ENV['USER']) if ENV['USER']
+    user ||= User.all.to_a.find(&:admin?)
+    abort "no user to ingest as: pass USER=foo or create an admin user" unless user
+
+    logger = Logger.new(STDOUT)
+    IngestPULFAJob.logger = logger
+
+    logger.info "ingesting pulfa mets files from: #{dir}"
+    logger.info "ingesting as: #{user.user_key} (override with USER=foo)"
+
+    Dir["#{dir}/**/*.mets"].each do |file|
+      begin
+        IngestPULFAJob.perform_later(file, user)
+      rescue => e
+        puts "Error: #{e.message}"
+        puts e.backtrace
+      end
+    end
+  end
+end
diff --git a/spec/fixtures/files/AC057-c18.mets b/spec/fixtures/files/AC057-c18.mets
new file mode 100644
index 000000000..9b7170893
--- /dev/null
+++ b/spec/fixtures/files/AC057-c18.mets
@@ -0,0 +1,132 @@
+ + + http://findingaids.princeton.edu/folders/AC057/c18.mets + + + + + + WORLD + + + + + + + PRINCETON_ONLY + + + + + + + + + 2552 + 3300 + + + + + + + 2552 + 3300 + + + + + + + 2557 + 3300 + + + + + + + 2557 + 3300 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/spec/fixtures/files/color.jp2 b/spec/fixtures/files/color.jp2
new file mode 100644
index 000000000..a8a57ab11
Binary files /dev/null and b/spec/fixtures/files/color.jp2 differ
diff --git a/spec/fixtures/files/gray.jp2 b/spec/fixtures/files/gray.jp2
new file mode 100644
index 000000000..156b3a1ff
Binary files /dev/null and b/spec/fixtures/files/gray.jp2 differ
diff --git a/spec/fixtures/files/test.pdf b/spec/fixtures/files/test.pdf
new file mode 100644
index 000000000..1ffdb1d9c
Binary files /dev/null and b/spec/fixtures/files/test.pdf differ
diff --git a/spec/jobs/ingest_pulfa_job_spec.rb b/spec/jobs/ingest_pulfa_job_spec.rb
new file mode 100644
index 000000000..e9d1e0071
--- /dev/null
+++ b/spec/jobs/ingest_pulfa_job_spec.rb
@@ -0,0 +1,50 @@
+require 'rails_helper'
+
+RSpec::Matchers.define :a_file_named do |x|
+  match { |actual| actual.path == x }
+end
+
+RSpec.describe IngestPULFAJob do
+  describe "ingesting a mets file" do
+    let(:mets) { fixture('files/AC057-c18.mets') }
+    let(:pdf) { fixture('files/test.pdf') }
+    let(:tiff1) { fixture('files/color.tif') }
+    let(:tiff2) { fixture('files/gray.tif') }
+    let(:jp2_source) { fixture('files/color.jp2') }
+    let(:jp3_source) { fixture('files/gray.jp2') }
+    let(:jp2_dest) { Rails.root.join('tmp', 'derivatives', 'fi', 'le', 'se', 't2-intermediate_file.jp2') }
+    let(:jp3_dest) { Rails.root.join('tmp', 'derivatives', 'fi', 'le', 'se', 't3-intermediate_file.jp2') }
+    let(:user) { FactoryGirl.build(:admin) }
+    let(:actor1) { double('actor1') }
+    let(:actor2) { double('actor2') }
+    let(:actor3) { double('actor3') }
+    let(:fileset1) { FileSet.new id: 'fileset1' }
+    let(:fileset2) { FileSet.new id: 'fileset2' }
+    let(:fileset3) { FileSet.new id: 'fileset3' }
+    let(:resource) { ScannedResource.new id: 'resource01' }
+
+    before do
+      allow(BatchFileSetActor).to receive(:new).and_return(actor1, actor2, actor3)
+      allow(ScannedResource).to receive(:new).and_return(resource)
+      allow(FileSet).to receive(:new).and_return(fileset1, fileset2, fileset3)
+      allow(FileUtils).to receive(:cp)
+      allow(resource).to receive(:save!)
+    end
+
+    it "ingests a mets file" do
+      expect(actor1).to receive(:attach_related_object).with(resource)
+      expect(actor1).to receive(:attach_content).with(a_file_named(pdf.path))
+      expect(actor2).to receive(:create_metadata).with(resource, {})
+      expect(actor2).to receive(:create_content).with(a_file_named(tiff1.path))
+      expect(actor3).to receive(:create_metadata).with(resource, {})
+      expect(actor3).to receive(:create_content).with(a_file_named(tiff2.path))
+      expect(FileUtils).to receive(:cp).with(jp2_source.path, jp2_dest.to_s)
+      expect(FileUtils).to receive(:cp).with(jp3_source.path, jp3_dest.to_s)
+      described_class.perform_now(mets, user)
+      expect(resource.title.first.to_s).to eq("Henkin, Leon and Tucker, Albert [Transcript no. 19], 1984 May 18")
+      expect(resource.visibility).to eq(Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC)
+      expect(fileset2.title.first.to_s).to eq("[1]")
+      expect(fileset3.title.first.to_s).to eq("[2]")
+    end
+  end
+end