Skip to content

Commit

Permalink
Create derivatives from the original
Browse files Browse the repository at this point in the history
Not from a copy retrieved from the repository
This makes importing faster as less network traffic is required.
  • Loading branch information
jcoyne committed Sep 30, 2015
1 parent 3672db3 commit ce4e3e5
Show file tree
Hide file tree
Showing 23 changed files with 288 additions and 204 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,35 +45,46 @@ def create_metadata(upload_set_id, work_id, generic_file_params = {})
yield(generic_file) if block_given?
end

# Puts the uploaded content into a staging directory. Then kicks off a
# job to characterize and create derivatives with this on disk variant.
# Simultaneously moves a preservation copy to the repository.
# TODO create a job to monitor this directory and prune old files that
# have made it to the repo
# @param [ActionDispatch::Http::UploadedFile, Tempfile] file the file uploaded by the user.
def create_content(file)
# Tell UploadFileToGenericFile service to skip versioning because versions will be minted by VersionCommitter (called by save_characterize_and_record_committer) when necessary
Hydra::Works::UploadFileToGenericFile.call(generic_file, file, versioning: false)
generic_file.label ||= file.original_filename
generic_file.title = [generic_file.label] if generic_file.title.blank?
save_characterize_and_record_committer do
if CurationConcerns.config.respond_to?(:after_create_content)
CurationConcerns.config.after_create_content.call(generic_file, user)
end
end
return false unless generic_file.save

working_file = copy_file_to_working_directory(file, generic_file.id)
IngestFileJob.perform_later(generic_file.id, working_file, file.content_type, user.user_key)
make_derivative(generic_file.id, working_file)
true
end

def revert_content(revision_id)
generic_file.original_file.restore_version(revision_id)
save_characterize_and_record_committer do
if CurationConcerns.config.respond_to?(:after_revert_content)
CurationConcerns.config.after_revert_content.call(generic_file, user, revision_id)
end
end

return false unless generic_file.save

CurationConcerns::VersioningService.create(generic_file.original_file, user)

# Retrieve a copy of the original file from the repository
working_file = copy_repository_resource_to_working_directory(generic_file)
make_derivative(generic_file.id, working_file)

return true unless CurationConcerns.config.respond_to?(:after_revert_content)
CurationConcerns.config.after_revert_content.call(generic_file, user, revision_id)
true
end

def update_content(file)
# Tell UploadFileToGenericFile service to skip versioning because versions will be minted by VersionCommitter (called by save_characterize_and_record_committer) when necessary
Hydra::Works::UploadFileToGenericFile.call(generic_file, file, versioning: false)
save_characterize_and_record_committer do
if CurationConcerns.config.respond_to?(:after_update_content)
CurationConcerns.config.after_update_content.call(generic_file, user)
end
end
working_file = copy_file_to_working_directory(file, generic_file.id)
IngestFileJob.perform_later(generic_file.id, working_file, file.content_type, user.user_key)
make_derivative(generic_file.id, working_file)
return true unless CurationConcerns.config.respond_to?(:after_update_content)
CurationConcerns.config.after_update_content.call(generic_file, user)
true
end

def update_metadata(model_attributes, all_attributes)
Expand All @@ -96,14 +107,38 @@ def destroy

private

# Saves the generic file, queues a job to characterize it, and records the committer.
# Takes a block which is run if the save was successful.
def save_characterize_and_record_committer
save do
push_characterize_job
CurationConcerns::VersioningService.create(generic_file.original_file, user)
yield if block_given?
end
# Queues a CharacterizeJob for the given generic file, handing it the path of
# the working copy on disk.
# @param [String] generic_file_id id of the generic file to process
# @param [String] working_file path of the working file on local disk
def make_derivative(generic_file_id, working_file)
CharacterizeJob.perform_later(generic_file_id, working_file)
end

# Stages an uploaded file in the working directory under its original name.
# @param [ActionDispatch::Http::UploadedFile] file the upload to stage
# @param [String] id the identifier
# @return [String] path of the working file
def copy_file_to_working_directory(file, id)
  upload_name = file.original_filename
  copy_stream_to_working_directory(id, upload_name, file)
end

# Stages the original file of a repository resource in the working directory.
# The content is wrapped in a StringIO so it can be copied like any stream.
# @param [GenericFile] generic_file the resource
# @return [String] path of the working file
def copy_repository_resource_to_working_directory(generic_file)
  original = generic_file.original_file
  id = generic_file.id
  name = original.original_name
  contents = StringIO.new(original.content)
  copy_stream_to_working_directory(id, name, contents)
end

# Copies a stream to its location in the working directory, creating any
# missing parent directories first.
# @param [String] id the identifier
# @param [String] name the file name
# @param [#read] stream the stream to copy to the working directory
# @return [String] path of the working file
def copy_stream_to_working_directory(id, name, stream)
  full_filename(id, name).tap do |destination|
    FileUtils.mkdir_p(File.dirname(destination))
    IO.copy_stream(stream, destination)
  end
end

# Builds the staging path for a file: the configured working path, then up to
# the first four chunks of the id (two characters each, a trailing chunk may
# be a single character), then the original file name.
# @param [String] id the identifier
# @param [String] original_name the file name
# @return [String] path of the working file
def full_filename(id, original_name)
  segments = id.scan(/..?/).first(4)
  staging_root = CurationConcerns.config.working_path
  File.join(staging_root, *segments, original_name)
end

# Takes an optional block and executes the block if the save was successful.
Expand All @@ -124,10 +159,6 @@ def save
true
end

def push_characterize_job
CharacterizeJob.perform_later(@generic_file.id)
end

# Adds a GenericFile to the work using ore:Aggregations.
# Locks to ensure that only one process is operating on
# the list at a time.
Expand Down
8 changes: 5 additions & 3 deletions curation_concerns-models/app/jobs/characterize_job.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
class CharacterizeJob < ActiveFedoraIdBasedJob
queue_as :characterize

def perform(id)
# @param [String] id
# @param [String] filename path of a local file to characterize. By using this, we don't have to pull a copy out of fedora.
def perform(id, filename)
@id = id
CurationConcerns::CharacterizationService.run(generic_file)
CurationConcerns::CharacterizationService.run(generic_file, filename)
generic_file.save
CreateDerivativesJob.perform_later(generic_file.id)
CreateDerivativesJob.perform_later(generic_file.id, filename)
end
end
8 changes: 3 additions & 5 deletions curation_concerns-models/app/jobs/create_derivatives_job.rb
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
class CreateDerivativesJob < ActiveFedoraIdBasedJob
queue_as :derivatives

def perform(id)
def perform(id, file_name)
@id = id
return unless generic_file.original_file.has_content?
return unless CurationConcerns.config.enable_ffmpeg if generic_file.video?
return if generic_file.video? && !CurationConcerns.config.enable_ffmpeg

generic_file.create_derivatives
generic_file.save
generic_file.create_derivatives(file_name)
end
end
18 changes: 18 additions & 0 deletions curation_concerns-models/app/jobs/ingest_file_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Ingests a staged working copy of a file into the repository.
class IngestFileJob < ActiveJob::Base
  queue_as :ingest

  # Attaches the file at +filename+ to the generic file, saves it, mints a
  # version attributed to +user_key+, and then runs the after_create_content
  # callback when one is configured.
  #
  # @param [String] generic_file_id id of the GenericFile to attach the file to
  # @param [String] filename path of the working file on local disk
  # @param [String] mime_type mime type to record for the file
  # @param [String] user_key key of the user performing the ingest
  def perform(generic_file_id, filename, mime_type, user_key)
    generic_file = GenericFile.find(generic_file_id)

    # The block form guarantees the file handle is closed once the content has
    # been persisted, even if the upload or save raises.
    File.open(filename, "rb") do |io|
      file = Hydra::Derivatives::IoDecorator.new(io)
      file.mime_type = mime_type
      file.original_name = File.basename(filename)

      # Tell UploadFileToGenericFile to skip versioning; the version is minted
      # explicitly by VersioningService right after the save.
      Hydra::Works::UploadFileToGenericFile.call(generic_file, file, versioning: false)
      generic_file.save!
      CurationConcerns::VersioningService.create(generic_file.original_file, user_key)
    end

    return unless CurationConcerns.config.respond_to?(:after_create_content)
    CurationConcerns.config.after_create_content.call(generic_file, user_key)
  end
end
2 changes: 1 addition & 1 deletion curation_concerns-models/app/jobs/ingest_local_file_job.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
class IngestLocalFileJob < ActiveJob::Base
attr_accessor :directory, :filename, :user_key, :generic_file_id

queue_as :ingest
queue_as :ingest_local

def perform(generic_file_id, directory, filename, user_key)
@generic_file_id = generic_file_id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,46 @@ module Derivatives
extend ActiveSupport::Concern

included do
Hydra::Derivatives.source_file_service = CurationConcerns::LocalFileService
Hydra::Derivatives.output_file_service = CurationConcerns::PersistDerivatives
end

makes_derivatives do |obj|
case obj.original_file.mime_type
when *audio_mime_types
obj.transform_file :original_file, { mp3: { format: 'mp3' }, ogg: { format: 'ogg' } }, processor: :audio
when *video_mime_types
obj.transform_file :original_file, { webm: { format: 'webm' }, mp4: { format: 'mp4' } }, processor: :video
end
# Replaces the Hydra::Works implementation so that derivatives are read from
# and written to local files. Characterization must have run beforehand so
# that mime_type is trustworthy enough to pick the right derivative runner.
# Mime types matching none of the known groups produce no derivatives.
# @param [String] filename path of the source file on local disk
def create_derivatives(filename)
  types = self.class
  case mime_type
  when *types.pdf_mime_types
    thumbnail = { label: :thumbnail, format: 'jpg', size: '338x493', url: derivative_url('thumbnail') }
    Hydra::Derivatives::PdfDerivatives.create(filename, outputs: [thumbnail])
  when *types.office_document_mime_types
    thumbnail = { label: :thumbnail, format: 'jpg', size: '200x150>', url: derivative_url('thumbnail') }
    Hydra::Derivatives::DocumentDerivatives.create(filename, outputs: [thumbnail])
  when *types.audio_mime_types
    mp3 = { label: 'mp3', format: 'mp3', url: derivative_url('mp3') }
    ogg = { label: 'ogg', format: 'ogg', url: derivative_url('ogg') }
    Hydra::Derivatives::AudioDerivatives.create(filename, outputs: [mp3, ogg])
  when *types.video_mime_types
    thumbnail = { label: :thumbnail, format: 'jpg', url: derivative_url('thumbnail') }
    webm = { label: 'webm', format: 'webm', url: derivative_url('webm') }
    mp4 = { label: 'mp4', format: 'mp4', url: derivative_url('mp4') }
    Hydra::Derivatives::VideoDerivatives.create(filename, outputs: [thumbnail, webm, mp4])
  when *types.image_mime_types
    thumbnail = { label: :thumbnail, format: 'jpg', size: '200x150>', url: derivative_url('thumbnail') }
    Hydra::Derivatives::ImageDerivatives.create(filename, outputs: [thumbnail])
  end
end

private

# Returns a file:// URL string for the derivative with the given name.
# The destination_name parameter has to match up with the file parameter
# passed to the DownloadsController.
# @param [String] destination_name name of the derivative (e.g. 'thumbnail')
# @return [String] file URL of the derivative
def derivative_url(destination_name)
  local_path = DerivativePath.derivative_path_for_reference(self, destination_name)
  file_uri = URI("file://#{local_path}")
  file_uri.to_s
end
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,26 @@ module CurationConcerns
# Run FITS to gather technical metadata about the content and the full text.
# Store this extracted metadata in the characterization datastream.
class CharacterizationService
attr_reader :generic_file
attr_reader :generic_file, :file_path

def self.run(generic_file)
new(generic_file).characterize
# @param [GenericFile] generic_file
# @param [String] file_path path to the file on disk
def self.run(generic_file, file_path)
new(generic_file, file_path).characterize
end

def initialize(generic_file)
# @param [GenericFile] generic_file
# @param [String] file_path path to the file on disk
def initialize(generic_file, file_path)
@generic_file = generic_file
@file_path = file_path
end

## Extract the metadata from the content datastream and record it in the characterization datastream
def characterize
store_metadata(extract_metadata)
store_fulltext(extract_fulltext)
generic_file.filename = original_file.original_name
generic_file.filename = File.basename(file_path)
end

protected
Expand All @@ -28,7 +33,7 @@ def store_fulltext(extracted_text)
end

def extract_fulltext
Hydra::Works::FullTextExtractionService.run(generic_file)
Hydra::Works::FullTextExtractionService.run(generic_file, file_path)
end

def store_metadata(metadata)
Expand All @@ -41,8 +46,8 @@ def original_file
end

def extract_metadata
return unless original_file.has_content?
Hydra::FileCharacterization.characterize(original_file.content, original_file.original_name, :fits) do |config|
return unless File.exist?(file_path)
Hydra::FileCharacterization.characterize(File.open(file_path).read, File.basename(file_path), :fits) do |config|
config[:fits] = Hydra::Derivatives.fits_path
end
end
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module CurationConcerns
  # Source file service that reads straight from a path on local disk.
  class LocalFileService
    # Opens the file and yields it to the block. The file is closed as soon as
    # the block returns (or raises), so no file handle is leaked.
    #
    # @param [String] file_name path to the file
    # @param [Hash] _options unused
    # @yield [File] the opened file
    # @return [Object] whatever the block returns
    def self.call(file_name, _options)
      File.open(file_name) { |file| yield file }
    end
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,25 @@ class PersistDerivatives < Hydra::Derivatives::PersistOutputFileService
# This service is an alternative to the default Hydra::Derivatives::PersistOutputFileService.
# This service will always update existing and does not do versioning of persisted files.
#
# @param [Hydra::Works::GenericFile::Base] object the file will be added to
# @param [Hydra::Derivatives::IoDecorator] file the derivative filestream
# @param [String] extract file type (e.g. 'thumbnail') from Hydra::Derivatives created destination_name
#
def self.call(object, file, destination_name)
output_file(object, destination_name) do |output|
while buffer = file.read(4096)
output.write buffer
end
# @param [#read] stream the derivative filestream
# @param [Hash] directives
# @option directives [String] :url a url to the file destination
def self.call(stream, directives)
output_file(directives) do |output|
IO.copy_stream(stream, output)
end
end

# Open the output file to write and yield the block to the
# file. It will make the directories in the path if
# necessary.
def self.output_file(object, destination_name, &blk)
name = derivative_path_factory.derivative_path_for_reference(object, destination_name)
output_file_dir = File.dirname(name)
# file. It makes the directories in the path if necessary.
def self.output_file(directives, &blk)
# name = derivative_path_factory.derivative_path_for_reference(object, destination_name)
raise ArgumentError, "No :url was provided in the transcoding directives" unless directives.key?(:url)
uri = URI(directives.fetch(:url))
raise ArgumentError, "Must provide a file uri" unless uri.scheme == 'file'
output_file_dir = File.dirname(uri.path)
FileUtils.mkdir_p(output_file_dir) unless File.directory?(output_file_dir)
File.open(name, 'wb', &blk)
File.open(uri.path, 'wb', &blk)
end

def self.derivative_path_factory
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module CurationConcerns
class VersioningService
# Make a version and record the version committer
# @param [ActiveFedora::File] content
# @param [User] user
# @param [User, String] user
def self.create(content, user = nil)
content.create_version
record_committer(content, user) if user
Expand All @@ -15,11 +15,12 @@ def self.latest_version_of(file)

# Record the version committer of the last version
# @param [ActiveFedora::File] content
# @param [User] user
def self.record_committer(content, user)
# @param [User, String] user_key
def self.record_committer(content, user_key)
user_key = user_key.user_key if user_key.respond_to?(:user_key)
version = latest_version_of(content)
return if version.nil?
VersionCommitter.create(version_id: version.uri, committer_login: user.user_key)
VersionCommitter.create(version_id: version.uri, committer_login: user_key)
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ def derivatives_path
@derivatives_path ||= File.join(Rails.root, 'tmp', 'derivatives')
end

# Path on the local file system where originals will be staged before being ingested into Fedora.
attr_writer :working_path
# Returns the configured working path, defaulting to tmp/uploads under
# Rails.root the first time it is read without having been set.
# @return [String] the staging directory for originals
def working_path
  return @working_path if @working_path
  @working_path = File.join(Rails.root, 'tmp', 'uploads')
end

attr_writer :enable_ffmpeg
def enable_ffmpeg
return @enable_ffmpeg unless @enable_ffmpeg.nil?
Expand Down
Loading

0 comments on commit ce4e3e5

Please sign in to comment.