This repository has been archived by the owner on May 14, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rake task and background job to ingest PULFA METS
- Loading branch information
Showing 10 changed files with 305 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
# Background job that ingests one PULFA METS file. It creates a ScannedResource
# for the METS object, then walks every mets:fileGrp: a group with a master
# file becomes a page FileSet (master attached, deliverable JP2 copied into
# the derivative tree), and a group whose deliverable is a PDF is attached to
# the resource as a related object.
class IngestPULFAJob < ApplicationJob
  queue_as :ingest

  # @param [String] mets_file Filename of a PULFA METS file to ingest
  # @param [User, String] user User to ingest as (passed straight to
  #   apply_depositor_metadata and BatchFileSetActor)
  def perform(mets_file, user)
    logger.info "Ingesting PULFA METS #{mets_file}"
    # Block form so the file handle is closed as soon as the document has
    # been parsed into memory, instead of leaking one handle per job.
    @mets = File.open(mets_file) { |io| Nokogiri::XML(io) }
    @user = user
    @pages = []

    ingest
  end

  private

  # Creates the resource, ingests every file group, then records page order.
  def ingest
    resource = create_resource

    # `each`, not `map`: the groups are processed only for their side effects.
    @mets.xpath("/mets:mets/mets:fileSec/mets:fileGrp").each do |group|
      ingest_file_group(resource, group)
    end

    # add pages to order
    resource.ordered_members = @pages
    resource.save!
  end

  # Builds and saves the ScannedResource described by the METS document.
  # The title comes from the LABEL of the top-level structMap div and
  # `replaces` from the METS OBJID.
  # @return [ScannedResource] the saved resource
  def create_resource
    r = ScannedResource.new
    r.title = [@mets.xpath("//mets:structMap/mets:div/@LABEL").first.value]
    r.replaces = @mets.xpath('/mets:mets/@OBJID').first.value
    r.rights_statement = 'http://rightsstatements.org/vocab/NKC/1.0/'
    r.visibility = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC
    r.apply_depositor_metadata @user
    r.save!
    Workflow::InitializeState.call(r, 'book_works', 'final_review')
    logger.info "Created ScannedResource #{r.id} (#{r.replaces})"
    r
  end

  # Ingests one mets:fileGrp: as a page when it has a master file, or as the
  # resource's PDF when its deliverable is a PDF. Other groups are ignored.
  def ingest_file_group(resource, group)
    master = file_info(group.xpath("mets:file[@USE='master']"))
    service = file_info(group.xpath("mets:file[@USE='deliverable']"))
    if master[:file]
      ingest_page(resource, master, service)
    elsif service[:type] == 'application/pdf'
      attach_pdf(resource, service)
    end
  end

  # Extracts what the ingest needs from a mets:file node set.
  # @param file [Nokogiri::XML::NodeSet] result of selecting mets:file nodes
  # @return [Hash] empty when nothing was selected; otherwise :id, :file
  #   (local path), :use, :type (MIME type) and :title (LABEL of the
  #   structMap div that points at the enclosing file group)
  def file_info(file)
    return {} if file.empty?
    file_urn = file.xpath("mets:FLocat/@xlink:href").first.value
    use = file.attribute('USE').value
    fn = use == 'master' ? master_for(file_urn) : service_for(file_urn)
    groupid = file.xpath('../@ID').first.value
    title = file.xpath("//mets:div[mets:fptr/@FILEID='#{groupid}']/@LABEL").first.value
    { id: id_for(file_urn), file: fn, use: use, type: file.attribute('MIMETYPE').value, title: title }
  end

  # Strips everything up to and including the last colon of the URN, e.g.
  # "urn:pudl:images:master:color.tif" => "color.tif".
  def id_for(file_urn)
    # Must be a Regexp: String#sub treats a String pattern as literal text,
    # so '.*:' never matched and the full URN was returned unchanged.
    file_urn.sub(/\A.*:/, '')
  end

  # Maps a master-file URN to a path under the configured master_files prefix.
  def master_for(master_urn)
    master_urn.sub('urn:pudl:images:master:', "#{Plum.config['pulfa']['master_files']}/")
  end

  # Maps a deliverable URN to a path under the configured service_files prefix.
  def service_for(service_urn)
    service_urn.sub('urn:pudl:images:deliverable:', "#{Plum.config['pulfa']['service_files']}/")
  end

  # Attaches the deliverable PDF to the resource as a related object.
  # NOTE(review): the File handle is left open because it is not visible here
  # whether BatchFileSetActor finishes reading it before returning - confirm,
  # then switch to the block form of File.open.
  def attach_pdf(resource, pdf_info)
    pdf_file_set = FileSet.new
    pdf_file_set.title = ['Original PDF']
    actor = BatchFileSetActor.new(pdf_file_set, @user)
    actor.attach_related_object(resource)
    actor.attach_content(File.open(pdf_info[:file]))
    logger.info "Attached PDF #{pdf_info[:file]}"
  end

  # Creates a FileSet for one page from its master TIFF, queues it for
  # ordering, and copies the pre-made JP2 to the derivative path used for the
  # file set's 'intermediate_file'.
  # @param tiff_info [Hash] file_info for the master file
  # @param jp2_info [Hash] file_info for the deliverable; may be empty
  def ingest_page(resource, tiff_info, jp2_info)
    file_set = FileSet.new
    file_set.title = [tiff_info[:title]]
    file_set.replaces = tiff_info[:id]
    actor = BatchFileSetActor.new(file_set, @user)
    actor.create_metadata(resource, {})
    # NOTE(review): handle left open for the same reason as in attach_pdf.
    actor.create_content(File.open(tiff_info[:file]))
    @pages << file_set
    logger.info "Ingested TIFF #{tiff_info[:file]}"

    # A group can have a master but no deliverable; FileUtils.cp(nil, ...)
    # would then abort the ingest with an opaque TypeError part-way through.
    unless jp2_info[:file]
      logger.warn "No deliverable JP2 for #{tiff_info[:file]}; skipping derivative copy"
      return
    end

    dest = PairtreeDerivativePath.derivative_path_for_reference file_set.id, 'intermediate_file'
    FileUtils.mkdir_p File.dirname(dest)
    FileUtils.cp jp2_info[:file], dest
    logger.info "Copied JP2 #{jp2_info[:file]} to #{dest}"
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
namespace :pulfa do
  # Enqueues one IngestPULFAJob per *.mets file found (recursively) under
  # METS_DIR. Jobs run as the user whose key is given in USER, falling back
  # to the first admin user.
  desc "Ingest a PULFA METS file"
  task ingest: :environment do
    # Validate input before doing anything else, so a missing or invalid
    # METS_DIR always produces the usage message.
    dir = ENV['METS_DIR']
    abort "usage: rake pulfa:ingest METS_DIR=/path/to/mets/files" unless dir && Dir.exist?(dir)

    # USER is usually already set by the shell to the OS login name, so the
    # admin fallback also covers the case where that name is not a user key.
    user = User.find_by_user_key(ENV['USER']) if ENV['USER']
    user ||= User.all.to_a.detect(&:admin?)
    # Without this check a missing admin surfaced as NoMethodError on nil.
    abort "no user to ingest as: set USER=foo or create an admin user" unless user

    logger = Logger.new(STDOUT)
    IngestPULFAJob.logger = logger

    logger.info "ingesting pulfa mets files from: #{dir}"
    logger.info "ingesting as: #{user.user_key} (override with USER=foo)"

    Dir["#{dir}/**/*.mets"].each do |file|
      begin
        IngestPULFAJob.perform_later(file, user)
      rescue => e
        # Report the failure and carry on with the remaining files.
        puts "Error: #{e.message}"
        puts e.backtrace
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/" | ||
xmlns:xlink="http://www.w3.org/1999/xlink" | ||
xmlns:mix="http://www.loc.gov/mix/v20" | ||
xmlns:dct="http://purl.org/dc/terms/" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd" | ||
TYPE="DigitalArchivalObject" | ||
OBJID="AC057/c18"> | ||
<mets:metsHdr CREATEDATE="2015-11-19T21:37:13Z"> | ||
<mets:metsDocumentID>http://findingaids.princeton.edu/folders/AC057/c18.mets</mets:metsDocumentID> | ||
</mets:metsHdr> | ||
<mets:amdSec ID="rights"> | ||
<mets:rightsMD ID="w"> | ||
<mets:mdWrap MDTYPE="DC"> | ||
<mets:xmlData> | ||
<dct:accessRights>WORLD</dct:accessRights> | ||
</mets:xmlData> | ||
</mets:mdWrap> | ||
</mets:rightsMD> | ||
<mets:rightsMD ID="po"> | ||
<mets:mdWrap MDTYPE="DC"> | ||
<mets:xmlData> | ||
<dct:accessRights>PRINCETON_ONLY</dct:accessRights> | ||
</mets:xmlData> | ||
</mets:mdWrap> | ||
</mets:rightsMD> | ||
</mets:amdSec> | ||
<mets:amdSec ID="tech"> | ||
<mets:techMD ID="td1e20"> | ||
<mets:mdWrap MDTYPE="NISOIMG"> | ||
<mets:xmlData> | ||
<mix:imageWidth>2552</mix:imageWidth> | ||
<mix:imageHeight>3300</mix:imageHeight> | ||
</mets:xmlData> | ||
</mets:mdWrap> | ||
</mets:techMD> | ||
<mets:techMD ID="td1e42"> | ||
<mets:mdWrap MDTYPE="NISOIMG"> | ||
<mets:xmlData> | ||
<mix:imageWidth>2552</mix:imageWidth> | ||
<mix:imageHeight>3300</mix:imageHeight> | ||
</mets:xmlData> | ||
</mets:mdWrap> | ||
</mets:techMD> | ||
<mets:techMD ID="td1e67"> | ||
<mets:mdWrap MDTYPE="NISOIMG"> | ||
<mets:xmlData> | ||
<mix:imageWidth>2557</mix:imageWidth> | ||
<mix:imageHeight>3300</mix:imageHeight> | ||
</mets:xmlData> | ||
</mets:mdWrap> | ||
</mets:techMD> | ||
<mets:techMD ID="td1e89"> | ||
<mets:mdWrap MDTYPE="NISOIMG"> | ||
<mets:xmlData> | ||
<mix:imageWidth>2557</mix:imageWidth> | ||
<mix:imageHeight>3300</mix:imageHeight> | ||
</mets:xmlData> | ||
</mets:mdWrap> | ||
</mets:techMD> | ||
</mets:amdSec> | ||
<mets:fileSec> | ||
<mets:fileGrp ID="fgd1e1"> | ||
<mets:file USE="deliverable" | ||
ID="fd1e3" | ||
CHECKSUM="6c51b385ed40ad1a39f12a935b242acd2a8bcc78" | ||
CHECKSUMTYPE="SHA-1" | ||
MIMETYPE="application/pdf" | ||
SIZE="150850"> | ||
<mets:FLocat LOCTYPE="URN" xlink:href="urn:pudl:images:deliverable:test.pdf"/> | ||
</mets:file> | ||
</mets:fileGrp> | ||
<mets:fileGrp ID="fgd1e18"> | ||
<mets:file USE="master" | ||
ID="fd1e20" | ||
CHECKSUM="fc1fe35874a4f0814a1d3cbc612dbce1a95c639c" | ||
CHECKSUMTYPE="SHA-1" | ||
MIMETYPE="image/tiff" | ||
SIZE="8434288" | ||
ADMID="td1e20"> | ||
<mets:FLocat LOCTYPE="URN" | ||
xlink:href="urn:pudl:images:master:color.tif"/> | ||
</mets:file> | ||
<mets:file USE="deliverable" | ||
ID="fd1e42" | ||
CHECKSUM="1ae09ec614784daba7b506afa2a34ccabcfde7b0" | ||
CHECKSUMTYPE="SHA-1" | ||
MIMETYPE="image/jp2" | ||
SIZE="1157805" | ||
ADMID="td1e42"> | ||
<mets:FLocat LOCTYPE="URN" | ||
xlink:href="urn:pudl:images:deliverable:color.jp2"/> | ||
</mets:file> | ||
</mets:fileGrp> | ||
<mets:fileGrp ID="fgd1e65"> | ||
<mets:file USE="master" | ||
ID="fd1e67" | ||
CHECKSUM="7696720b2860004093ff0d46feea99ebec791d61" | ||
CHECKSUMTYPE="SHA-1" | ||
MIMETYPE="image/tiff" | ||
SIZE="8450788" | ||
ADMID="td1e67"> | ||
<mets:FLocat LOCTYPE="URN" | ||
xlink:href="urn:pudl:images:master:gray.tif"/> | ||
</mets:file> | ||
<mets:file USE="deliverable" | ||
ID="fd1e89" | ||
CHECKSUM="5070f2c80db03f4a1220029fb91dc7d081178a98" | ||
CHECKSUMTYPE="SHA-1" | ||
MIMETYPE="image/jp2" | ||
SIZE="1289871" | ||
ADMID="td1e89"> | ||
<mets:FLocat LOCTYPE="URN" | ||
xlink:href="urn:pudl:images:deliverable:gray.jp2"/> | ||
</mets:file> | ||
</mets:fileGrp> | ||
</mets:fileSec> | ||
<mets:structMap> | ||
<mets:div LABEL="Henkin, Leon and Tucker, Albert [Transcript no. 19], 1984 May 18" | ||
TYPE="Folder" | ||
ADMID="w"> | ||
<mets:fptr FILEID="fgd1e1"/> | ||
<mets:div LABEL="[1]" ORDER="1" TYPE="FolderMember"> | ||
<mets:fptr FILEID="fgd1e18"/> | ||
</mets:div> | ||
<mets:div LABEL="[2]" ORDER="2" TYPE="FolderMember"> | ||
<mets:fptr FILEID="fgd1e65"/> | ||
</mets:div> | ||
</mets:div> | ||
</mets:structMap> | ||
</mets:mets> |
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
require 'rails_helper' | ||
|
||
# Argument matcher: passes when the received object's #path equals the
# expected path, so File arguments can be matched by filename.
RSpec::Matchers.define :a_file_named do |expected_path|
  match do |file|
    file.path == expected_path
  end
end
|
||
RSpec.describe IngestPULFAJob do
  describe "ingesting a mets file" do
    let(:mets) { fixture('files/AC057-c18.mets') }
    let(:pdf) { fixture('files/test.pdf') }
    let(:tiff1) { fixture('files/color.tif') }
    let(:tiff2) { fixture('files/gray.tif') }
    let(:color_jp2) { fixture('files/color.jp2') }
    let(:gray_jp2) { fixture('files/gray.jp2') }
    # Derivative paths for the stubbed file set ids 'fileset2' and 'fileset3'.
    let(:color_jp2_dest) { Rails.root.join('tmp', 'derivatives', 'fi', 'le', 'se', 't2-intermediate_file.jp2') }
    let(:gray_jp2_dest) { Rails.root.join('tmp', 'derivatives', 'fi', 'le', 'se', 't3-intermediate_file.jp2') }
    let(:user) { FactoryGirl.build(:admin) }
    # Actors and file sets are handed out in fixture file-group order:
    # 1 = PDF, 2 = color page, 3 = gray page.
    let(:actor1) { double('actor1') }
    let(:actor2) { double('actor2') }
    # Was labelled 'actor2', which made failure messages name the wrong double.
    let(:actor3) { double('actor3') }
    let(:fileset1) { FileSet.new id: 'fileset1' }
    let(:fileset2) { FileSet.new id: 'fileset2' }
    let(:fileset3) { FileSet.new id: 'fileset3' }
    let(:resource) { ScannedResource.new id: 'resource01' }

    before do
      allow(BatchFileSetActor).to receive(:new).and_return(actor1, actor2, actor3)
      allow(ScannedResource).to receive(:new).and_return(resource)
      allow(FileSet).to receive(:new).and_return(fileset1, fileset2, fileset3)
      allow(FileUtils).to receive(:cp)
      allow(resource).to receive(:save!)
    end

    it "ingests a mets file" do
      expect(actor1).to receive(:attach_related_object).with(resource)
      expect(actor1).to receive(:attach_content).with(a_file_named(pdf.path))
      expect(actor2).to receive(:create_metadata).with(resource, {})
      expect(actor2).to receive(:create_content).with(a_file_named(tiff1.path))
      expect(actor3).to receive(:create_metadata).with(resource, {})
      expect(actor3).to receive(:create_content).with(a_file_named(tiff2.path))
      expect(FileUtils).to receive(:cp).with(color_jp2.path, color_jp2_dest.to_s)
      expect(FileUtils).to receive(:cp).with(gray_jp2.path, gray_jp2_dest.to_s)
      described_class.perform_now(mets, user)
      expect(resource.title.first.to_s).to eq("Henkin, Leon and Tucker, Albert [Transcript no. 19], 1984 May 18")
      expect(resource.visibility).to eq(Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC)
      expect(fileset2.title.first.to_s).to eq("[1]")
      expect(fileset3.title.first.to_s).to eq("[2]")
    end
  end
end