Skip to content
This repository has been archived by the owner on May 14, 2022. It is now read-only.

Commit

Permalink
Rake task and background job to ingest PULFA METS
Browse files Browse the repository at this point in the history
  • Loading branch information
escowles committed Feb 7, 2017
1 parent 1af8e2f commit 22f7b14
Show file tree
Hide file tree
Showing 10 changed files with 305 additions and 0 deletions.
89 changes: 89 additions & 0 deletions app/jobs/ingest_pulfa_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Background job that ingests one PULFA METS file: it creates a ScannedResource
# for the METS object, one FileSet per file group that has a master file, and
# attaches a deliverable PDF when a file group has no master.
class IngestPULFAJob < ApplicationJob
  queue_as :ingest

  # @param [String] mets_file Filename of a PULFA METS file to ingest
  # @param [User] user User to ingest as (the pulfa:ingest rake task and the
  #   job spec both pass a User object)
  def perform(mets_file, user)
    logger.info "Ingesting PULFA METS #{mets_file}"
    # Block form so the file handle is closed as soon as parsing is done
    # instead of being left open until garbage collection.
    @mets = File.open(mets_file) { |io| Nokogiri::XML(io) }
    @user = user
    @pages = []

    ingest
  end

  private

    # Creates the ScannedResource described by the METS document, then walks
    # every fileGrp, ingesting pages and attaching PDFs, and finally records
    # the page order on the resource.
    def ingest
      r = ScannedResource.new
      r.title = [@mets.xpath("//mets:structMap/mets:div/@LABEL").first.value]
      r.replaces = @mets.xpath('/mets:mets/@OBJID').first.value
      r.rights_statement = 'http://rightsstatements.org/vocab/NKC/1.0/'
      r.visibility = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC
      r.apply_depositor_metadata @user
      r.save!
      Workflow::InitializeState.call(r, 'book_works', 'final_review')
      logger.info "Created ScannedResource #{r.id} (#{r.replaces})"

      # Iterating for side effects only, so `each` rather than `map`.
      @mets.xpath("/mets:mets/mets:fileSec/mets:fileGrp").each do |group|
        master = file_info(group.xpath("mets:file[@USE='master']"))
        service = file_info(group.xpath("mets:file[@USE='deliverable']"))
        if master[:file]
          ingest_page(r, master, service)
        elsif service[:type] == 'application/pdf'
          attach_pdf(r, service)
        end
      end

      # add pages to order
      r.ordered_members = @pages
      r.save!
    end

    # Collects the details of a mets:file selection.
    #
    # @param file the result of an XPath query for mets:file elements
    # @return [Hash] empty when nothing was selected; otherwise :id, :file
    #   (resolved path on disk), :use, :type (MIME type) and :title (LABEL of
    #   the structMap div that points at the enclosing file group)
    def file_info(file)
      return {} unless file.length > 0
      file_urn = file.xpath("mets:FLocat/@xlink:href").first.value
      use = file.attribute('USE').value
      fn = use == 'master' ? master_for(file_urn) : service_for(file_urn)
      groupid = file.xpath('../@ID').first.value
      title = file.xpath("//mets:div[mets:fptr/@FILEID='#{groupid}']/@LABEL").first.value
      { id: id_for(file_urn), file: fn, use: use, type: file.attribute('MIMETYPE').value, title: title }
    end

    # Returns the identifier stored in FileSet#replaces for a file URN.
    #
    # NOTE(review): the pattern is a String, so String#sub matches it literally
    # and the URN is returned unchanged. FileSet#create_derivatives skips
    # derivative generation when `replaces` starts with 'urn:pudl:images', so
    # switching this to the regexp /.*:/ would change that behaviour; confirm
    # the intent before "fixing" it.
    def id_for(file_urn)
      file_urn.sub('.*:', '')
    end

    # Maps a master file URN onto the configured master files directory.
    def master_for(master_urn)
      master_urn.sub('urn:pudl:images:master:', "#{Plum.config['pulfa']['master_files']}/")
    end

    # Maps a deliverable file URN onto the configured service files directory.
    def service_for(service_urn)
      service_urn.sub('urn:pudl:images:deliverable:', "#{Plum.config['pulfa']['service_files']}/")
    end

    # Attaches a deliverable PDF to the resource as a related 'Original PDF'
    # FileSet (it is not added to the page order).
    #
    # @param resource the ScannedResource being built
    # @param [Hash] pdf_info file_info hash for the PDF
    def attach_pdf(resource, pdf_info)
      pdf_file_set = FileSet.new
      pdf_file_set.title = ['Original PDF']
      actor = BatchFileSetActor.new(pdf_file_set, @user)
      actor.attach_related_object(resource)
      # NOTE(review): the handle is left open because it is not visible here
      # whether the actor reads the file before returning.
      actor.attach_content(File.open(pdf_info[:file]))
      logger.info "Attached PDF #{pdf_info[:file]}"
    end

    # Ingests one page: creates a FileSet from the master TIFF, queues it for
    # the page order, and copies the existing JP2 deliverable to the FileSet's
    # 'intermediate_file' derivative path.
    #
    # @param resource the ScannedResource being built
    # @param [Hash] tiff_info file_info hash for the master file
    # @param [Hash] jp2_info file_info hash for the deliverable file
    def ingest_page(resource, tiff_info, jp2_info)
      file_set = FileSet.new
      file_set.title = [tiff_info[:title]]
      file_set.replaces = tiff_info[:id]
      actor = BatchFileSetActor.new(file_set, @user)
      actor.create_metadata(resource, {})
      actor.create_content(File.open(tiff_info[:file]))
      @pages << file_set
      logger.info "Ingested TIFF #{tiff_info[:file]}"

      dest = PairtreeDerivativePath.derivative_path_for_reference file_set.id, 'intermediate_file'
      FileUtils.mkdir_p File.dirname(dest)
      FileUtils.cp jp2_info[:file], dest
      logger.info "Copied JP2 #{jp2_info[:file]} to #{dest}"
    end
end
1 change: 1 addition & 0 deletions app/models/file_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def iiif_path
end

def create_derivatives(filename)
return if replaces && replaces.start_with?('urn:pudl:images')
case mime_type_storage.first
when 'image/tiff'
Hydra::Derivatives::Jpeg2kImageDerivatives.create(
Expand Down
6 changes: 6 additions & 0 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,18 @@ defaults: &defaults
exchange: 'plum_events'
geo_derivatives_path: <%= File.join(Rails.root, 'tmp', 'geo-derivatives') %>
geoblacklight_provenance: 'Princeton'
# Directories that PULFA master and deliverable file URNs are resolved against during METS ingest
pulfa:
master_files: <%= File.join(Rails.root, 'tmp', 'pulfa-master') %>
service_files: <%= File.join(Rails.root, 'tmp', 'pulfa-service') %>

development:
<<: *defaults

test:
<<: *defaults
# Point both PULFA directories at the spec fixtures so URNs in test METS files resolve to fixture files
pulfa:
master_files: <%= File.join(Rails.root, 'spec', 'fixtures', 'files') %>
service_files: <%= File.join(Rails.root, 'spec', 'fixtures', 'files') %>

production:
<<: *defaults
Expand Down
3 changes: 3 additions & 0 deletions config/environments/development.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,7 @@
# Raises error for missing translations
# config.action_view.raise_on_missing_translations = true
config.action_mailer.default_url_options = { host: "localhost:3000" }

# Run ActiveJob jobs inline
config.active_job.queue_adapter = :inline
end
24 changes: 24 additions & 0 deletions lib/tasks/pulfa.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
namespace :pulfa do
  # Enqueues an IngestPULFAJob for every *.mets file found (recursively)
  # under METS_DIR.
  #
  # Environment variables:
  #   METS_DIR - required; directory to search for *.mets files
  #   USER     - optional; user key to ingest as. When unset, or when no user
  #              has that key, the first admin user is used.
  #   NOTE(review): shells usually export USER as the login name, so the
  #   lookup by key typically runs even when the caller did not pass USER.
  desc "Ingest a PULFA METS file"
  task ingest: :environment do
    # Validate the arguments before doing (or logging) anything else.
    dir = ENV['METS_DIR']
    abort "usage: rake pulfa:ingest METS_DIR=/path/to/mets/files" unless dir && Dir.exist?(dir)

    user = User.find_by_user_key(ENV['USER']) if ENV['USER']
    # detect stops at the first admin instead of filtering every user.
    user ||= User.all.detect(&:admin?)
    # Without this guard the log line below fails with NoMethodError on nil.
    abort "no user to ingest as: set USER=<user key> or create an admin user" unless user

    logger = Logger.new(STDOUT)
    IngestPULFAJob.logger = logger

    logger.info "ingesting pulfa mets files from: #{dir}"
    logger.info "ingesting as: #{user.user_key} (override with USER=foo)"

    Dir["#{dir}/**/*.mets"].each do |file|
      begin
        IngestPULFAJob.perform_later(file, user)
      rescue => e
        # Report the failure and carry on with the remaining files.
        logger.error "Error: #{e.message}"
        logger.error Array(e.backtrace).join("\n")
      end
    end
  end
end
132 changes: 132 additions & 0 deletions spec/fixtures/files/AC057-c18.mets
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
<?xml version="1.0" encoding="UTF-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:mix="http://www.loc.gov/mix/v20"
xmlns:dct="http://purl.org/dc/terms/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd"
TYPE="DigitalArchivalObject"
OBJID="AC057/c18">
<mets:metsHdr CREATEDATE="2015-11-19T21:37:13Z">
<mets:metsDocumentID>http://findingaids.princeton.edu/folders/AC057/c18.mets</mets:metsDocumentID>
</mets:metsHdr>
<mets:amdSec ID="rights">
<mets:rightsMD ID="w">
<mets:mdWrap MDTYPE="DC">
<mets:xmlData>
<dct:accessRights>WORLD</dct:accessRights>
</mets:xmlData>
</mets:mdWrap>
</mets:rightsMD>
<mets:rightsMD ID="po">
<mets:mdWrap MDTYPE="DC">
<mets:xmlData>
<dct:accessRights>PRINCETON_ONLY</dct:accessRights>
</mets:xmlData>
</mets:mdWrap>
</mets:rightsMD>
</mets:amdSec>
<mets:amdSec ID="tech">
<mets:techMD ID="td1e20">
<mets:mdWrap MDTYPE="NISOIMG">
<mets:xmlData>
<mix:imageWidth>2552</mix:imageWidth>
<mix:imageHeight>3300</mix:imageHeight>
</mets:xmlData>
</mets:mdWrap>
</mets:techMD>
<mets:techMD ID="td1e42">
<mets:mdWrap MDTYPE="NISOIMG">
<mets:xmlData>
<mix:imageWidth>2552</mix:imageWidth>
<mix:imageHeight>3300</mix:imageHeight>
</mets:xmlData>
</mets:mdWrap>
</mets:techMD>
<mets:techMD ID="td1e67">
<mets:mdWrap MDTYPE="NISOIMG">
<mets:xmlData>
<mix:imageWidth>2557</mix:imageWidth>
<mix:imageHeight>3300</mix:imageHeight>
</mets:xmlData>
</mets:mdWrap>
</mets:techMD>
<mets:techMD ID="td1e89">
<mets:mdWrap MDTYPE="NISOIMG">
<mets:xmlData>
<mix:imageWidth>2557</mix:imageWidth>
<mix:imageHeight>3300</mix:imageHeight>
</mets:xmlData>
</mets:mdWrap>
</mets:techMD>
</mets:amdSec>
<mets:fileSec>
<mets:fileGrp ID="fgd1e1">
<mets:file USE="deliverable"
ID="fd1e3"
CHECKSUM="6c51b385ed40ad1a39f12a935b242acd2a8bcc78"
CHECKSUMTYPE="SHA-1"
MIMETYPE="application/pdf"
SIZE="150850">
<mets:FLocat LOCTYPE="URN" xlink:href="urn:pudl:images:deliverable:test.pdf"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp ID="fgd1e18">
<mets:file USE="master"
ID="fd1e20"
CHECKSUM="fc1fe35874a4f0814a1d3cbc612dbce1a95c639c"
CHECKSUMTYPE="SHA-1"
MIMETYPE="image/tiff"
SIZE="8434288"
ADMID="td1e20">
<mets:FLocat LOCTYPE="URN"
xlink:href="urn:pudl:images:master:color.tif"/>
</mets:file>
<mets:file USE="deliverable"
ID="fd1e42"
CHECKSUM="1ae09ec614784daba7b506afa2a34ccabcfde7b0"
CHECKSUMTYPE="SHA-1"
MIMETYPE="image/jp2"
SIZE="1157805"
ADMID="td1e42">
<mets:FLocat LOCTYPE="URN"
xlink:href="urn:pudl:images:deliverable:color.jp2"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp ID="fgd1e65">
<mets:file USE="master"
ID="fd1e67"
CHECKSUM="7696720b2860004093ff0d46feea99ebec791d61"
CHECKSUMTYPE="SHA-1"
MIMETYPE="image/tiff"
SIZE="8450788"
ADMID="td1e67">
<mets:FLocat LOCTYPE="URN"
xlink:href="urn:pudl:images:master:gray.tif"/>
</mets:file>
<mets:file USE="deliverable"
ID="fd1e89"
CHECKSUM="5070f2c80db03f4a1220029fb91dc7d081178a98"
CHECKSUMTYPE="SHA-1"
MIMETYPE="image/jp2"
SIZE="1289871"
ADMID="td1e89">
<mets:FLocat LOCTYPE="URN"
xlink:href="urn:pudl:images:deliverable:gray.jp2"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap>
<mets:div LABEL="Henkin, Leon and Tucker, Albert [Transcript no. 19], 1984 May 18"
TYPE="Folder"
ADMID="w">
<mets:fptr FILEID="fgd1e1"/>
<mets:div LABEL="[1]" ORDER="1" TYPE="FolderMember">
<mets:fptr FILEID="fgd1e18"/>
</mets:div>
<mets:div LABEL="[2]" ORDER="2" TYPE="FolderMember">
<mets:fptr FILEID="fgd1e65"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
Binary file added spec/fixtures/files/color.jp2
Binary file not shown.
Binary file added spec/fixtures/files/gray.jp2
Binary file not shown.
Binary file added spec/fixtures/files/test.pdf
Binary file not shown.
50 changes: 50 additions & 0 deletions spec/jobs/ingest_pulfa_job_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
require 'rails_helper'

# Matcher satisfied by any object whose #path equals the expected filename;
# lets expectations match a File argument by its path.
RSpec::Matchers.define :a_file_named do |expected_path|
  match do |candidate|
    candidate.path == expected_path
  end
end

RSpec.describe IngestPULFAJob do
  describe "ingesting a mets file" do
    let(:mets) { fixture('files/AC057-c18.mets') }
    let(:pdf) { fixture('files/test.pdf') }
    let(:tiff1) { fixture('files/color.tif') }
    let(:tiff2) { fixture('files/gray.tif') }
    let(:jp2_source) { fixture('files/color.jp2') }
    let(:jp3_source) { fixture('files/gray.jp2') }
    # Expected derivative locations for the FileSets with ids 'fileset2' and 'fileset3'
    let(:jp2_dest) { Rails.root.join('tmp', 'derivatives', 'fi', 'le', 'se', 't2-intermediate_file.jp2') }
    let(:jp3_dest) { Rails.root.join('tmp', 'derivatives', 'fi', 'le', 'se', 't3-intermediate_file.jp2') }
    let(:user) { FactoryGirl.build(:admin) }
    let(:actor1) { double('actor1') }
    let(:actor2) { double('actor2') }
    # Named 'actor3' so failure messages identify the right double
    let(:actor3) { double('actor3') }
    let(:fileset1) { FileSet.new id: 'fileset1' }
    let(:fileset2) { FileSet.new id: 'fileset2' }
    let(:fileset3) { FileSet.new id: 'fileset3' }
    let(:resource) { ScannedResource.new id: 'resource01' }

    before do
      # Objects are handed out in the order the fixture lists its file groups:
      # the PDF group first, then the two page groups.
      allow(BatchFileSetActor).to receive(:new).and_return(actor1, actor2, actor3)
      allow(ScannedResource).to receive(:new).and_return(resource)
      allow(FileSet).to receive(:new).and_return(fileset1, fileset2, fileset3)
      allow(FileUtils).to receive(:cp)
      allow(resource).to receive(:save!)
    end

    it "ingests a mets file" do
      expect(actor1).to receive(:attach_related_object).with(resource)
      expect(actor1).to receive(:attach_content).with(a_file_named(pdf.path))
      expect(actor2).to receive(:create_metadata).with(resource, {})
      expect(actor2).to receive(:create_content).with(a_file_named(tiff1.path))
      expect(actor3).to receive(:create_metadata).with(resource, {})
      expect(actor3).to receive(:create_content).with(a_file_named(tiff2.path))
      expect(FileUtils).to receive(:cp).with(jp2_source.path, jp2_dest.to_s)
      expect(FileUtils).to receive(:cp).with(jp3_source.path, jp3_dest.to_s)
      described_class.perform_now(mets, user)
      expect(resource.title.first.to_s).to eq("Henkin, Leon and Tucker, Albert [Transcript no. 19], 1984 May 18")
      expect(resource.visibility).to eq(Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC)
      expect(fileset2.title.first.to_s).to eq("[1]")
      expect(fileset3.title.first.to_s).to eq("[2]")
    end
  end
end

0 comments on commit 22f7b14

Please sign in to comment.