diff --git a/.rubocop.yml b/.rubocop.yml index 61246df8..f89d6e1a 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -1,3 +1,4 @@ + require: - rubocop-performance - rubocop-rails @@ -25,3 +26,6 @@ Metrics/AbcSize: Metrics/MethodLength: Exclude: - 'app/controllers/dor_controller.rb' + +Naming/PredicateName: + NamePrefixBlacklist: is_ diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml index 7ace370d..469f7d26 100644 --- a/.rubocop_todo.yml +++ b/.rubocop_todo.yml @@ -1,15 +1,48 @@ # This configuration was generated by # `rubocop --auto-gen-config` -# on 2019-09-24 11:52:09 -0700 using RuboCop version 0.74.0. +# on 2020-01-08 23:14:02 -0800 using RuboCop version 0.74.0. # The point is for the user to remove these configuration records # one by one as the offenses are removed from the code base. # Note that changes in the inspected code, or installation of new # versions of RuboCop, may require this file to be generated again. # Offense count: 1 +Lint/UselessAssignment: + Exclude: + - 'app/controllers/dor_controller.rb' + +# Offense count: 5 +Metrics/AbcSize: + Max: 35 + +# Offense count: 1 +Metrics/CyclomaticComplexity: + Max: 7 + +# Offense count: 3 +# Configuration parameters: CountComments, ExcludedMethods. +Metrics/MethodLength: + Max: 31 + +# Offense count: 1 +# Configuration parameters: CountKeywordArgs. +Metrics/ParameterLists: + Max: 6 + +# Offense count: 2 +RSpec/AnyInstance: + Exclude: + - 'spec/indexers/composite_indexer_spec.rb' + - 'spec/indexers/processable_indexer_spec.rb' + +# Offense count: 9 # Configuration parameters: Max. RSpec/ExampleLength: Exclude: + - 'spec/indexers/composite_indexer_spec.rb' + - 'spec/indexers/describable_indexer_spec.rb' + - 'spec/indexers/identifiable_indexer_spec.rb' + - 'spec/indexers/processable_indexer_spec.rb' - 'spec/routing/dor_spec.rb' # Offense count: 1 @@ -24,10 +57,10 @@ RSpec/MessageSpies: Exclude: - 'spec/controllers/dor_controller_spec.rb' -# Offense count: 8 +# Offense count: 18 # Configuration parameters: AggregateFailuresByDefault. RSpec/MultipleExpectations: - Max: 4 + Max: 10 # Offense count: 2 # Configuration parameters: IgnoreSharedExamples. @@ -35,15 +68,56 @@ RSpec/NamedSubject: Exclude: - 'spec/models/queue_status_spec.rb' -# Offense count: 1 +# Offense count: 3 +RSpec/NestedGroups: + Max: 4 + +# Offense count: 3 # Configuration parameters: IgnoreNameless, IgnoreSymbolicNames. RSpec/VerifiedDoubles: Exclude: + - 'spec/indexers/composite_indexer_spec.rb' - 'spec/models/queue_status_spec.rb' -# Offense count: 65 +# Offense count: 1 +# Cop supports --auto-correct. +# Configuration parameters: EnforcedStyle. +# SupportedStyles: strict, flexible +Rails/TimeZone: + Exclude: + - 'app/indexers/process_indexer.rb' + +# Offense count: 4 +Style/ClassVars: + Exclude: + - 'app/indexers/identifiable_indexer.rb' + +# Offense count: 9 +Style/Documentation: + Exclude: + - 'spec/**/*' + - 'test/**/*' + - 'app/indexers/composite_indexer.rb' + - 'app/indexers/describable_indexer.rb' + - 'app/indexers/editable_indexer.rb' + - 'app/indexers/identifiable_indexer.rb' + - 'app/indexers/processable_indexer.rb' + - 'app/indexers/releasable_indexer.rb' + - 'app/indexers/solr_doc_helper.rb' + - 'app/services/indexer.rb' + +# Offense count: 1 +# Cop supports --auto-correct. +# Configuration parameters: AutoCorrect, EnforcedStyle, IgnoredMethods. +# SupportedStyles: predicate, comparison +Style/NumericPredicate: + Exclude: + - 'spec/**/*' + - 'app/indexers/processable_indexer.rb' + +# Offense count: 215 # Cop supports --auto-correct. # Configuration parameters: AutoCorrect, AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns. # URISchemes: http, https Metrics/LineLength: - Max: 189 + Max: 192 diff --git a/Gemfile b/Gemfile index a8d6b7f3..2dd801e6 100644 --- a/Gemfile +++ b/Gemfile @@ -26,9 +26,11 @@ group :production do end group :development, :test do + gem 'byebug' gem 'coveralls', require: false gem 'rspec-rails', '~> 3.0' gem 'simplecov', require: false + gem 'webmock' end group :development do diff --git a/Gemfile.lock b/Gemfile.lock index 35cf977a..345571bc 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -57,6 +57,8 @@ GEM i18n (>= 0.7, < 2) minitest (~> 5.1) tzinfo (~> 1.1) + addressable (2.7.0) + public_suffix (>= 2.0.2, < 5.0) airbrussh (1.4.0) sshkit (>= 1.6.1, != 1.7.0) arel (9.0.0) @@ -67,6 +69,7 @@ GEM bundler-audit (0.6.1) bundler (>= 1.2.0, < 3) thor (~> 0.18) + byebug (11.0.1) capistrano (3.11.2) airbrussh (>= 1.0.0) i18n @@ -96,6 +99,8 @@ GEM term-ansicolor (~> 1.3) thor (>= 0.19.4, < 2.0) tins (~> 1.6) + crack (0.4.3) + safe_yaml (~> 1.0.0) crass (1.0.6) daemons (1.3.1) deep_merge (1.2.1) @@ -191,6 +196,7 @@ GEM haml (5.1.2) temple (>= 0.8.0) tilt + hashdiff (1.0.0) honeybadger (3.3.1) hooks (0.4.1) uber (~> 0.0.14) @@ -247,6 +253,7 @@ GEM parallel (1.19.1) parser (2.7.0.2) ast (~> 2.4.0) + public_suffix (4.0.3) puma (3.12.2) rack (2.1.1) rack-test (1.1.0) @@ -347,6 +354,7 @@ GEM mime-types nokogiri rest-client + safe_yaml (1.0.5) simplecov (0.16.1) docile (~> 1.1) json (>= 1.8, < 3) @@ -390,6 +398,10 @@ GEM unf_ext unf_ext (0.0.7.6) unicode-display_width (1.6.1) + webmock (3.7.6) + addressable (>= 2.3.6) + crack (>= 0.3.2) + hashdiff (>= 0.4.0, < 2.0.0) websocket-driver (0.7.1) websocket-extensions (>= 0.1.0) websocket-extensions (0.1.4) @@ -401,6 +413,7 @@ PLATFORMS DEPENDENCIES bootsnap (>= 1.1.0) + byebug capistrano (~> 3.0) capistrano-bundler capistrano-passenger @@ -425,6 +438,7 @@ DEPENDENCIES rubocop-rails rubocop-rspec simplecov + webmock BUNDLED WITH 2.1.4 diff --git a/app/controllers/dor_controller.rb b/app/controllers/dor_controller.rb index 5b7cd02a..c28f5b18 100644 --- a/app/controllers/dor_controller.rb +++ b/app/controllers/dor_controller.rb @@ -47,7 +47,8 @@ def reindex_pid(pid, logger:, add_attributes:) # benchmark how long it takes to convert the object to a Solr document to_solr_stats = Benchmark.measure('to_solr') do - solr_doc = obj.to_solr + indexer = Indexer.for(obj) + solr_doc = indexer.to_solr solr.add(solr_doc, add_attributes: add_attributes) end.format('%n realtime %rs total CPU %ts').gsub(/[\(\)]/, '') diff --git a/app/indexers/composite_indexer.rb b/app/indexers/composite_indexer.rb new file mode 100644 index 00000000..cfc380c7 --- /dev/null +++ b/app/indexers/composite_indexer.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +# Borrowed from https://github.com/samvera/valkyrie/blob/master/lib/valkyrie/persistence/solr/composite_indexer.rb +class CompositeIndexer + attr_reader :indexers + def initialize(*indexers) + @indexers = indexers + end + + def new(resource:) + Instance.new(indexers, resource: resource) + end + + class Instance + attr_reader :indexers, :resource + def initialize(indexers, resource:) + @resource = resource + @indexers = indexers.map { |i| i.new(resource: resource) } + end + + # @return [Hash] the merged solr document for all the sub-indexers + def to_solr + indexers.map(&:to_solr).inject({}, &:merge) + end + end +end diff --git a/app/indexers/data_indexer.rb b/app/indexers/data_indexer.rb new file mode 100644 index 00000000..d319c32f --- /dev/null +++ b/app/indexers/data_indexer.rb @@ -0,0 +1,22 @@ +# frozen_string_literal: true + +# Indexing provided by ActiveFedora +class DataIndexer + include ActiveFedora::Indexing + + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # we need to override this until https://github.com/samvera/active_fedora/pull/1371 + # has been released + def to_solr(solr_doc = {}, opts = {}) + super.tap do |doc| + doc['active_fedora_model_ssi'] = has_model + end + end + + delegate :create_date, :modified_date, :state, :pid, :inner_object, + :datastreams, :relationships, :has_model, to: :resource +end diff --git a/app/indexers/describable_indexer.rb b/app/indexers/describable_indexer.rb new file mode 100644 index 00000000..f06d1a36 --- /dev/null +++ b/app/indexers/describable_indexer.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +class DescribableIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for describable concerns + def to_solr + add_metadata_format_to_solr_doc.merge(add_mods_to_solr_doc) + end + + def add_metadata_format_to_solr_doc + { 'metadata_format_ssim' => 'mods' } + end + + def add_mods_to_solr_doc + solr_doc = {} + mods_sources = { + sw_title_display: %w[sw_display_title_tesim], + main_author_w_date: %w[sw_author_ssim sw_author_tesim], + sw_sort_author: %w[sw_author_sort_ssi], + sw_language_facet: %w[sw_language_ssim sw_language_tesim], + sw_genre: %w[sw_genre_ssim sw_genre_tesim], + format_main: %w[sw_format_ssim sw_format_tesim], + topic_facet: %w[sw_topic_ssim sw_topic_tesim], + era_facet: %w[sw_subject_temporal_ssim sw_subject_temporal_tesim], + geographic_facet: %w[sw_subject_geographic_ssim sw_subject_geographic_tesim], + %i[term_values typeOfResource] => %w[mods_typeOfResource_ssim mods_typeOfResource_tesim], + pub_year_sort_str: %w[sw_pub_date_sort_ssi], + pub_year_int: %w[sw_pub_date_sort_isi], + pub_year_display_str: %w[sw_pub_date_facet_ssi] + } + + mods_sources.each_pair do |meth, solr_keys| + vals = meth.is_a?(Array) ? resource.stanford_mods.send(meth.shift, *meth) : resource.stanford_mods.send(meth) + + next if vals.nil? || (vals.respond_to?(:empty?) && vals.empty?) + + solr_keys.each do |key| + solr_doc[key] ||= [] + solr_doc[key].push(*vals) + end + # asterisk to avoid multi-dimensional array: push values, not the array + end + + # convert multivalued fields to single value + %w[sw_pub_date_sort_ssi sw_pub_date_sort_isi sw_pub_date_facet_ssi].each do |key| + solr_doc[key] = solr_doc[key].first unless solr_doc[key].nil? + end + # some fields get explicit "(none)" placeholder values, mostly for faceting + %w[sw_language_tesim sw_genre_tesim sw_format_tesim].each do |key| + solr_doc[key] = ['(none)'] if solr_doc[key].blank? + end + solr_doc + end +end diff --git a/app/indexers/editable_indexer.rb b/app/indexers/editable_indexer.rb new file mode 100644 index 00000000..bfb0959e --- /dev/null +++ b/app/indexers/editable_indexer.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +class EditableIndexer + include SolrDocHelper + + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + def to_solr + {}.tap do |solr_doc| + add_solr_value(solr_doc, 'default_rights', default_rights_for_indexing, :string, [:symbol]) + add_solr_value(solr_doc, 'agreement', resource.agreement, :string, [:symbol]) if resource.agreement_object + add_solr_value(solr_doc, 'default_use_license_machine', resource.use_license, :string, [:stored_sortable]) + end + end + + # @return [String] A description of the rights defined in the default object rights datastream. Can be 'Stanford', 'World', 'Dark' or 'None' + def default_rights_for_indexing + Dor::RightsMetadataDS::RIGHTS_TYPE_CODES.fetch(resource.default_rights, 'Unrecognized default rights value') + end +end diff --git a/app/indexers/identifiable_indexer.rb b/app/indexers/identifiable_indexer.rb new file mode 100644 index 00000000..ac960cf3 --- /dev/null +++ b/app/indexers/identifiable_indexer.rb @@ -0,0 +1,100 @@ +# frozen_string_literal: true + +class IdentifiableIndexer + include SolrDocHelper + + INDEX_VERSION_FIELD = 'dor_services_version_ssi' + NS_HASH = { 'hydra' => 'http://projecthydra.org/ns/relations#', + 'fedora' => 'info:fedora/fedora-system:def/relations-external#', + 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' }.freeze + + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + ## Module-level variables, shared between ALL mixin includers (and ALL *their* includers/extenders)! + ## used for caching found values + @@collection_hash = {} + @@apo_hash = {} + + # @return [Hash] the partial solr document for identifiable concerns + def to_solr + solr_doc = {} + solr_doc[INDEX_VERSION_FIELD] = Dor::VERSION + solr_doc['indexed_at_dtsi'] = Time.now.utc.xmlschema + resource.datastreams.values.each do |ds| + # This is used to draw the table of datastreams in Argo + add_solr_value(solr_doc, 'ds_specs', ds.datastream_spec_string, :string, [:symbol]) unless ds.new? + end + + add_solr_value(solr_doc, 'title_sort', resource.label, :string, [:stored_sortable]) + + rels_doc = Nokogiri::XML(resource.datastreams['RELS-EXT'].content) + apos = rels_doc.search('//rdf:RDF/rdf:Description/hydra:isGovernedBy', NS_HASH) + collections = rels_doc.search('//rdf:RDF/rdf:Description/fedora:isMemberOfCollection', NS_HASH) + solrize_related_obj_titles(solr_doc, apos, @@apo_hash, 'apo_title', 'nonhydrus_apo_title', 'hydrus_apo_title') + solrize_related_obj_titles(solr_doc, collections, @@collection_hash, 'collection_title', 'nonhydrus_collection_title', 'hydrus_collection_title') + solr_doc['public_dc_relation_tesim'] ||= solr_doc['collection_title_tesim'] if solr_doc['collection_title_tesim'] + solr_doc['metadata_source_ssi'] = identity_metadata_source + solr_doc + end + + # @return [String] calculated value for Solr index + def identity_metadata_source + if resource.identityMetadata.otherId('catkey').first || + resource.identityMetadata.otherId('barcode').first + 'Symphony' + else + 'DOR' + end + end + + # Clears out the cache of items. Used primarily in testing. + def self.reset_cache! + @@collection_hash = {} + @@apo_hash = {} + end + + private + + def solrize_related_obj_titles(solr_doc, relationships, title_hash, union_field_name, nonhydrus_field_name, hydrus_field_name) + # TODO: if you wanted to get a little fancier, you could also solrize a 2 level hierarchy and display using hierarchial facets, like + # ["SOURCE", "SOURCE : TITLE"] (e.g. ["Hydrus", "Hydrus : Special Collections"], see (exploded) tags in IdentityMetadataDS#to_solr). + title_type = :symbol # we'll get an _ssim because of the type + title_attrs = [:stored_searchable] # we'll also get a _tesim from this attr + relationships.each do |rel_node| + rel_druid = rel_node['rdf:resource'] + next unless rel_druid # TODO: warning here would also be useful + + rel_druid = rel_druid.gsub('info:fedora/', '') + + # populate cache if necessary + unless title_hash.key?(rel_druid) + begin + related_obj = Dor.find(rel_druid) + related_obj_title = related_obj_display_title(related_obj, rel_druid) + is_from_hydrus = (related_obj&.tags&.include?('Project : Hydrus')) + title_hash[rel_druid] = { 'related_obj_title' => related_obj_title, 'is_from_hydrus' => is_from_hydrus } + rescue ActiveFedora::ObjectNotFoundError + # This may happen if the given APO or Collection does not exist (bad data) + title_hash[rel_druid] = { 'related_obj_title' => rel_druid, 'is_from_hydrus' => false } + end + end + + # cache should definitely be populated, so just use that to write solr field + if title_hash[rel_druid]['is_from_hydrus'] + add_solr_value(solr_doc, hydrus_field_name, title_hash[rel_druid]['related_obj_title'], title_type, title_attrs) + else + add_solr_value(solr_doc, nonhydrus_field_name, title_hash[rel_druid]['related_obj_title'], title_type, title_attrs) + end + add_solr_value(solr_doc, union_field_name, title_hash[rel_druid]['related_obj_title'], title_type, title_attrs) + end + end + + def related_obj_display_title(related_obj, default_title) + return default_title unless related_obj + + related_obj.full_title || default_title + end +end diff --git a/app/indexers/process_indexer.rb b/app/indexers/process_indexer.rb new file mode 100644 index 00000000..ab65c236 --- /dev/null +++ b/app/indexers/process_indexer.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +# Indexes the process for a workflow +class ProcessIndexer + ERROR_OMISSION = '... (continued)' + private_constant :ERROR_OMISSION + + # see https://lucene.apache.org/core/7_3_1/core/org/apache/lucene/util/BytesRefHash.MaxBytesLengthExceededException.html + MAX_ERROR_LENGTH = 32_768 - 2 - ERROR_OMISSION.length + private_constant :MAX_ERROR_LENGTH + + # @param [WorkflowSolrDocument] solr_doc + # @param [String] workflow_name + # @param [Dor::Workflow::Response::Process] process + def initialize(solr_doc:, workflow_name:, process:) + @solr_doc = solr_doc + @workflow_name = workflow_name + @process = process + end + + # @return [Hash] the partial solr document for the workflow document + def to_solr + return unless status + + # add a record of the robot having operated on this item, so we can track robot activity + solr_doc.add_process_time(workflow_name, name, Time.parse(process.datetime)) if has_time? + + index_error_message + + # workflow name, process status then process name + solr_doc.add_wsp("#{workflow_name}:#{status}", "#{workflow_name}:#{status}:#{name}") + + # workflow name, process name then process status + solr_doc.add_wps("#{workflow_name}:#{name}", "#{workflow_name}:#{name}:#{status}") + + # process status, workflowname then process name + solr_doc.add_swp(process.status.to_s, "#{status}:#{workflow_name}", "#{status}:#{workflow_name}:#{name}") + end + + private + + attr_reader :process, :workflow_name, :solr_doc + delegate :status, :name, :state, :error_message, :datetime, to: :process + + def has_time? + datetime && (status == 'completed' || status == 'error') + end + + # index the error message without the druid so we hopefully get some overlap + # truncate to avoid org.apache.lucene.util.BytesRefHash$MaxBytesLengthExceededException + def index_error_message + return unless error_message + + solr_doc.error = "#{workflow_name}:#{name}:#{error_message}".truncate(MAX_ERROR_LENGTH, omission: ERROR_OMISSION) + end +end diff --git a/app/indexers/processable_indexer.rb b/app/indexers/processable_indexer.rb new file mode 100644 index 00000000..ee9b9c41 --- /dev/null +++ b/app/indexers/processable_indexer.rb @@ -0,0 +1,97 @@ +# frozen_string_literal: true + +class ProcessableIndexer + include SolrDocHelper + + attr_reader :resource + def initialize(resource:) + @resource = resource + @status_service = Dor::StatusService.new(resource) + end + + # @return [Hash] the partial solr document for processable concerns + def to_solr + {}.tap do |solr_doc| + add_versions(solr_doc) + add_milestones(solr_doc) + solr_doc['modified_latest_dttsi'] = resource.modified_date.to_datetime.utc.strftime('%FT%TZ') + add_solr_value(solr_doc, 'rights', resource.rights, :string, [:symbol]) if resource.respond_to? :rights + add_status(solr_doc) + end + end + + private + + attr_reader :status_service + + def current_version + @current_version ||= begin + resource.current_version + rescue StandardError + '1' + end + end + + def add_status(solr_doc) + solr_doc['status_ssi'] = status_service.status # status is singular (i.e. the current one) + status_info_hash = status_service.status_info + status_code = status_info_hash[:status_code] + add_solr_value(solr_doc, 'processing_status_text', simplified_status_code_disp_txt(status_code), :string, [:stored_sortable]) + solr_doc['processing_status_code_isi'] = status_code + end + + def add_milestones(solr_doc) + status_service.milestones.each do |milestone| + timestamp = milestone[:at].utc.xmlschema + milestone[:version] ||= current_version + solr_doc['lifecycle_ssim'] ||= [] + solr_doc['lifecycle_ssim'] << milestone[:milestone] + add_solr_value(solr_doc, 'lifecycle', "#{milestone[:milestone]}:#{timestamp};#{milestone[:version]}", :symbol) + end + + add_sortable_milestones(solr_doc) + end + + def sortable_milestones + sortable = {} + status_service.milestones.each do |milestone| + sortable[milestone[:milestone]] ||= [] + sortable[milestone[:milestone]] << milestone[:at].utc.xmlschema + end + sortable + end + + def add_sortable_milestones(solr_doc) + sortable_milestones.each do |milestone, unordered_dates| + dates = unordered_dates.sort + # create the published_dttsi and published_day fields and the like + dates.each do |date| + solr_doc["#{milestone}_dttsim"] ||= [] + solr_doc["#{milestone}_dttsim"] << date unless solr_doc["#{milestone}_dttsim"].include?(date) + end + # fields for OAI havester to sort on: _dttsi is trie date +stored +indexed (single valued, i.e. sortable) + solr_doc["#{milestone}_earliest_dttsi"] = dates.first + solr_doc["#{milestone}_latest_dttsi"] = dates.last + end + end + + def add_versions(solr_doc) + current_version_num = current_version.to_i + solr_doc['current_version_isi'] = current_version_num + + return unless resource.respond_to?('versionMetadata') + + # add an entry with version id, tag and description for each version + while current_version_num > 0 + new_val = "#{current_version_num};#{resource.versionMetadata.tag_for_version(current_version_num.to_s)};#{resource.versionMetadata.description_for_version(current_version_num.to_s)}" + add_solr_value(solr_doc, 'versions', new_val, :string, [:displayable]) + current_version_num -= 1 + end + end + + # @return [String] text translation of the status code, minus any trailing parenthetical explanation + # e.g. 'In accessioning (described)' and 'In accessioning (described, published)' both return 'In accessioning' + def simplified_status_code_disp_txt(status_code) + Dor::StatusService::STATUS_CODE_DISP_TXT[status_code].gsub(/\(.*\)$/, '').strip + end +end diff --git a/app/indexers/releasable_indexer.rb b/app/indexers/releasable_indexer.rb new file mode 100644 index 00000000..f7e8490b --- /dev/null +++ b/app/indexers/releasable_indexer.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +class ReleasableIndexer + include SolrDocHelper + + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for releasable concerns + def to_solr + solr_doc = {} + + # TODO: sort of worried about the performance impact in bulk reindex + # situations, since released_for recurses all parent collections. jmartin 2015-07-14 + released_for.each do |release_target, release_info| + add_solr_value(solr_doc, 'released_to', release_target, :symbol, []) if release_info['release'] + end + + # TODO: need to solrize whether item is released to purl? does released_for return that? + # logic is: "True when there is a published lifecycle and Access Rights is anything but Dark" + + solr_doc + end + + private + + def released_for + Dor::ReleaseTagService.for(resource).released_for(skip_live_purl: true) + end +end diff --git a/app/indexers/solr_doc_helper.rb b/app/indexers/solr_doc_helper.rb new file mode 100644 index 00000000..826fd372 --- /dev/null +++ b/app/indexers/solr_doc_helper.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +module SolrDocHelper + def add_solr_value(solr_doc, field_name, value, field_type = :default, index_types = [:searchable]) + case field_type + when :symbol + index_types << field_type + end + ::Solrizer.insert_field(solr_doc, field_name, value, *index_types) + end +end diff --git a/app/indexers/workflow_indexer.rb b/app/indexers/workflow_indexer.rb new file mode 100644 index 00000000..3b46facd --- /dev/null +++ b/app/indexers/workflow_indexer.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +# Indexes the objects position in workflows +class WorkflowIndexer + # @param [Workflow::Response::Workflow] workflow the workflow document to index + def initialize(workflow:) + @workflow = workflow + end + + # @return [Hash] the partial solr document for the workflow document + def to_solr + WorkflowSolrDocument.new do |solr_doc| + solr_doc.name = workflow_name + + errors = 0 # The error count is used by the Report class in Argo + processes.each do |process| + ProcessIndexer.new(solr_doc: solr_doc, workflow_name: workflow_name, process: process).to_solr + errors += 1 if process.status == 'error' + end + solr_doc.status = [workflow_name, workflow_status, errors, repository].join('|') + end + end + + private + + attr_reader :workflow + delegate :workflow_name, :repository, to: :workflow + + def definition_process_names + @definition_process_names ||= begin + definition = Dor::Config.workflow.client.workflow_template(workflow_name) + definition['processes'].map { |p| p['name'] } + end + end + + def processes + @processes ||= definition_process_names.map do |process_name| + workflow.process_for_recent_version(name: process_name) + end + end + + def workflow_status + workflow.complete? ? 'completed' : 'active' + end +end diff --git a/app/indexers/workflows_indexer.rb b/app/indexers/workflows_indexer.rb new file mode 100644 index 00000000..5d1df78a --- /dev/null +++ b/app/indexers/workflows_indexer.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +# Indexes the objects position in workflows +class WorkflowsIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for workflow concerns + def to_solr + WorkflowSolrDocument.new do |combined_doc| + workflows.each do |wf| + doc = WorkflowIndexer.new(workflow: wf).to_solr + combined_doc.merge!(doc) + end + end.to_h + end + + private + + # @return [Array] + def workflows + all_workflows.workflows + end + + # TODO: remove Dor::Workflow::Document + # @return [Workflow::Response::Workflows] + def all_workflows + @all_workflows ||= Dor::Config.workflow.client.workflow_routes.all_workflows pid: resource.pid + end +end diff --git a/app/models/workflow_solr_document.rb b/app/models/workflow_solr_document.rb new file mode 100644 index 00000000..94090460 --- /dev/null +++ b/app/models/workflow_solr_document.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +# Represents that part of the solr document that holds workflow data +class WorkflowSolrDocument + WORKFLOW_SOLR = 'wf_ssim' + # field that indexes workflow name, process status then process name + WORKFLOW_WPS_SOLR = 'wf_wps_ssim' + # field that indexes workflow name, process name then process status + WORKFLOW_WSP_SOLR = 'wf_wsp_ssim' + # field that indexes process status, workflowname then process name + WORKFLOW_SWP_SOLR = 'wf_swp_ssim' + WORKFLOW_ERROR_SOLR = 'wf_error_ssim' + WORKFLOW_STATUS_SOLR = 'workflow_status_ssim' + + KEYS_TO_MERGE = [ + WORKFLOW_SOLR, + WORKFLOW_WPS_SOLR, + WORKFLOW_WSP_SOLR, + WORKFLOW_SWP_SOLR, + WORKFLOW_STATUS_SOLR, + WORKFLOW_ERROR_SOLR + ].freeze + + def initialize + @data = empty_document + yield self if block_given? + end + + def name=(wf_name) + data[WORKFLOW_SOLR] += [wf_name] + data[WORKFLOW_WPS_SOLR] += [wf_name] + data[WORKFLOW_WSP_SOLR] += [wf_name] + end + + def status=(status) + data[WORKFLOW_STATUS_SOLR] += [status] + end + + def error=(message) + data[WORKFLOW_ERROR_SOLR] += [message] + end + + # Add to the field that indexes workflow name, process status then process name + def add_wps(*messages) + data[WORKFLOW_WPS_SOLR] += messages + end + + # Add to the field that indexes workflow name, process name then process status + def add_wsp(*messages) + data[WORKFLOW_WSP_SOLR] += messages + end + + # Add to the field that indexes process status, workflow name then process name + def add_swp(*messages) + data[WORKFLOW_SWP_SOLR] += messages + end + + # Add the processes data_time attribute to the solr document + # @param [String] wf_name + # @param [String] process_name + # @param [Time] time + def add_process_time(wf_name, process_name, time) + data["wf_#{wf_name}_#{process_name}_dttsi"] = time.utc.iso8601 + end + + def to_h + KEYS_TO_MERGE.each { |k| data[k].uniq! } + data + end + + delegate :except, :[], to: :data + + # @param [WorkflowSolrDocument] doc + def merge!(doc) + # This is going to get the date fields, e.g. `wf_assemblyWF_jp2-create_dttsi' + @data.merge!(doc.except(*KEYS_TO_MERGE)) + + # Combine the non-unique fields together + KEYS_TO_MERGE.each do |k| + data[k] += doc[k] + end + end + + private + + attr_reader :data + + def empty_document + KEYS_TO_MERGE.each_with_object({}) { |k, obj| obj[k] = [] } + end +end diff --git a/app/services/indexer.rb b/app/services/indexer.rb new file mode 100644 index 00000000..cbbfd6ba --- /dev/null +++ b/app/services/indexer.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +class Indexer + WORKFLOW_INDEXER = CompositeIndexer.new( + DataIndexer, + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer, + WorkflowsIndexer + ) + + ADMIN_POLICY_INDEXER = CompositeIndexer.new( + DataIndexer, + DescribableIndexer, + EditableIndexer, + IdentifiableIndexer, + ProcessableIndexer, + WorkflowsIndexer + ) + + ETD_INDEXER = CompositeIndexer.new( + DataIndexer + ) + + ITEM_INDEXER = CompositeIndexer.new( + DataIndexer, + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer, + ReleasableIndexer, + WorkflowsIndexer + ) + + SET_INDEXER = CompositeIndexer.new( + DataIndexer, + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer, + WorkflowsIndexer + ) + + INDEXERS = { + Dor::WorkflowObject => WORKFLOW_INDEXER, + Dor::AdminPolicyObject => ADMIN_POLICY_INDEXER, + Dor::Etd => ETD_INDEXER, + Dor::Item => ITEM_INDEXER, + Dor::Set => SET_INDEXER + }.freeze + + def self.for(obj) + INDEXERS.fetch(obj.class).new(resource: obj) + end +end diff --git a/config/settings.yml b/config/settings.yml index e06b49f9..0fd6418a 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -47,4 +47,4 @@ MESSAGE_QUEUES: [] # URLs FEDORA_URL: 'https://user:password@fedora.example.com:1000/fedora' SOLRIZER_URL: 'https://solr.example.com/solr/collection' -WORKFLOW_URL: 'https://workflow.example.com/workflow' +WORKFLOW_URL: 'https://workflow.example.edu/' diff --git a/spec/controllers/dor_controller_spec.rb b/spec/controllers/dor_controller_spec.rb index 49820348..e9e6d9d1 100644 --- a/spec/controllers/dor_controller_spec.rb +++ b/spec/controllers/dor_controller_spec.rb @@ -8,12 +8,14 @@ expect(Logger).to receive(:new).and_return(mock_logger) allow(ActiveFedora.solr).to receive(:conn).and_return(mock_solr_conn) allow(Dor).to receive(:find).with(mock_druid).and_return(mock_af_doc) + allow(Indexer).to receive(:for).with(mock_af_doc).and_return(mock_indexer) end let(:mock_logger) { instance_double(Logger, :formatter= => true, info: true) } let(:mock_solr_conn) { instance_double(RSolr::Client, add: true, commit: true) } - let(:mock_af_doc) { instance_double(Dor::Item, to_solr: mock_solr_doc) } + let(:mock_af_doc) { Dor::Item.new } let(:mock_druid) { 'asdf:1234' } + let(:mock_indexer) { instance_double(CompositeIndexer::Instance, to_solr: mock_solr_doc) } let(:mock_solr_doc) { { id: mock_druid, text_field_tesim: 'a field to be searched' } } it 'reindexes an object with default commitWithin param and a hard commit' do diff --git a/spec/indexers/composite_indexer_spec.rb b/spec/indexers/composite_indexer_spec.rb new file mode 100644 index 00000000..4f560b28 --- /dev/null +++ b/spec/indexers/composite_indexer_spec.rb @@ -0,0 +1,81 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe CompositeIndexer do + let(:model) { Dor::Abstract } + let(:mods) do + double('mods', sw_title_display: 'foo', sw_genre: ['test genre'], + main_author_w_date: '1999', + sw_sort_author: 'baz', + sw_language_facet: 'en', + format_main: 'foofmt', + topic_facet: 'topicbar', + era_facet: ['17th century', '18th century'], + geographic_facet: %w[Europe Europe], + term_values: 'huh?', + pub_year_sort_str: '1600', + pub_year_int: 1600, + pub_year_display_str: '1600') + end + let(:obj) do + instance_double(Dor::Item, + stanford_mods: mods, + datastreams: datastreams, + label: 'obj label', + identityMetadata: identity_metadata, + versionMetadata: version_metadata, + current_version: '7', + modified_date: '1999-12-30') + end + let(:datastreams) do + { 'RELS-EXT' => double('datastream', datastream_spec_string: 'huh', new?: false, content: '') } + end + let(:identity_metadata) do + instance_double(Dor::IdentityMetadataDS, otherId: 'foo') + end + let(:version_metadata) do + instance_double(Dor::VersionMetadataDS, tag_for_version: 'tag7', description_for_version: 'desc7', current_version_id: '7') + end + + let(:indexer) do + described_class.new( + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer + ) + end + + describe 'to_solr' do + before do + allow_any_instance_of(Dor::StatusService).to receive(:milestones).and_return({}) + end + + let(:doc) { indexer.new(resource: obj).to_solr } + + it 'searchworks date-fu: temporal periods and pub_dates' do + expect(doc).to match a_hash_including( + 'sw_subject_temporal_ssim' => a_collection_containing_exactly('18th century', '17th century'), + 'sw_subject_temporal_tesim' => a_collection_containing_exactly('18th century', '17th century'), + 'sw_pub_date_sort_ssi' => '1600', + 'sw_pub_date_sort_isi' => 1600, + 'sw_pub_date_facet_ssi' => '1600' + ) + end + + it 'subject geographic fields' do + expect(doc).to match a_hash_including( + 'sw_subject_geographic_ssim' => %w[Europe Europe], + 'sw_subject_geographic_tesim' => %w[Europe Europe] + ) + end + + it 'genre fields' do + genre_list = obj.stanford_mods.sw_genre + expect(doc).to match a_hash_including( + 'sw_genre_ssim' => genre_list, + 'sw_genre_tesim' => genre_list + ) + end + end +end diff --git a/spec/indexers/data_indexer_spec.rb b/spec/indexers/data_indexer_spec.rb new file mode 100644 index 00000000..6d8cfef1 --- /dev/null +++ b/spec/indexers/data_indexer_spec.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe DataIndexer do + let(:obj) do + Dor::AdminPolicyObject.new + end + + let(:indexer) do + described_class.new(resource: obj) + end + + describe '#to_solr' do + let(:indexer) do + CompositeIndexer.new( + described_class + ).new(resource: obj) + end + let(:doc) { indexer.to_solr } + + before do + obj.use_statement = 'Rights are owned by Stanford University Libraries.' + obj.copyright_statement = 'Additional copyright info' + end + + it 'makes a solr doc' do + expect(doc).to match a_hash_including('use_statement_ssim' => + ['Rights are owned by Stanford University Libraries.']) + expect(doc).to match a_hash_including('copyright_ssim' => ['Additional copyright info']) + end + end +end diff --git a/spec/indexers/describable_indexer_spec.rb b/spec/indexers/describable_indexer_spec.rb new file mode 100644 index 00000000..fdaa9554 --- /dev/null +++ b/spec/indexers/describable_indexer_spec.rb @@ -0,0 +1,126 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe DescribableIndexer do + let(:xml) do + <<~XML + + + + The + complete works of Henry George + + + George, Henry + + 1839-1897 + + creator + + + + George, Henry + + 1862-1916 + + text + + + xx + + + + Garden City, N. Y + + Doubleday, Page + 1911 + 1911 + [Library ed.] + + monographic + + + eng + + + + print + + 10 v. fronts (v. 1-9) ports. 21 cm. + + + YNG + 731210 + 19900625062034.0 + 68184 + + 757655 + + + + electronic + preservation + reformatted digital + + + I. Progress and poverty.--II. Social problems.--III. The land question. Property in land. The condition of labor.--IV. Protection or free trade.--V. A perplexed philosopher [Herbert Spencer]--VI. The science of political economy, books I and II.--VII. The science of political economy, books III to V. "Moses": a lecture.--VIII. Our land and land policy.--IX-X. The life of Henry George, by his son Henry George, jr. + On cover: Complete works of Henry George. Fels fund. Library edition. + + Economics + 1800-1900 + + + + DOR_MARC2MODS3-3.xsl Revision 1.1 + 2011-02-25T18:20:23.132-08:00 + 36105010700545 + + druid:pz263ny9658 + + Stanford University Libraries + + http://purl.stanford.edu/pz263ny9658 + + + XML + end + let(:obj) { Dor::Abstract.new } + + let(:indexer) do + described_class.new(resource: obj) + end + + describe '#to_solr' do + let(:doc) { indexer.to_solr } + + before do + obj.datastreams['descMetadata'].content = xml + end + + it 'includes values from stanford_mods' do + expect(doc).to match a_hash_including( + 'sw_language_ssim' => ['English'], + 'sw_language_tesim' => ['English'], + 'sw_format_ssim' => ['Book'], + 'sw_format_tesim' => ['Book'], + 'sw_subject_temporal_ssim' => ['1800-1900'], + 'sw_subject_temporal_tesim' => ['1800-1900'], + 'sw_pub_date_sort_ssi' => '1911', + 'sw_pub_date_sort_isi' => 1911, + 'sw_pub_date_facet_ssi' => '1911' + ) + end + + it 'does not include empty values' do + doc.keys.sort_by(&:to_s).each do |k| + expect(doc).to include(k) + expect(doc).to match hash_excluding(k => nil) + expect(doc).to match hash_excluding(k => []) + end + end + end +end diff --git a/spec/indexers/editable_indexer_spec.rb b/spec/indexers/editable_indexer_spec.rb new file mode 100644 index 00000000..b8b07147 --- /dev/null +++ b/spec/indexers/editable_indexer_spec.rb @@ -0,0 +1,40 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe EditableIndexer do + let(:obj) do + instance_double(Dor::AdminPolicyObject, + default_rights: 'world', + use_license: 'by-nc-sa') + end + let(:indexer) do + described_class.new(resource: obj) + end + + describe '#default_rights_for_indexing' do + it 'uses the OM template if the ds is empty' do + expect(indexer.default_rights_for_indexing).to eq('World') + end + end + + describe '#to_solr' do + let(:indexer) do + CompositeIndexer.new( + described_class + ).new(resource: obj) + end + let(:doc) { indexer.to_solr } + + before do + allow(obj).to receive(:agreement).and_return('druid:agreement') + allow(obj).to receive(:agreement_object).and_return(true) + end + + it 'makes a solr doc' do + expect(doc).to match a_hash_including('default_rights_ssim' => ['World']) # note that this is capitalized, because it comes from default_rights_for_indexing + expect(doc).to match a_hash_including('agreement_ssim' => ['druid:agreement']) + expect(doc).to match a_hash_including('default_use_license_machine_ssi' => 'by-nc-sa') + end + end +end diff --git a/spec/indexers/identifiable_indexer_spec.rb b/spec/indexers/identifiable_indexer_spec.rb new file mode 100644 index 00000000..0f0ac1bb --- /dev/null +++ b/spec/indexers/identifiable_indexer_spec.rb @@ -0,0 +1,143 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe IdentifiableIndexer do + let(:xml) do + <<~XML + + druid:rt923jk342 + item + google download barcode 36105049267078 + DOR + Squirrels of North America + Eder, Tamara, 1974- + STANFORD_342837261527 + 36105049267078 + 129483625 + 7f3da130-7b02-11de-8a39-0800200c9a66 + Google Books : Phase 1 + Google Books : Scan source STANFORD + Project : Beautiful Books + Registered By : blalbrit + DPG : Beautiful Books : Octavo : newpri + Remediated By : 4.15.4 + true + true + + XML + end + + let(:obj) { Dor::Abstract.new } + + let(:indexer) do + described_class.new(resource: obj) + end + + before do + obj.identityMetadata.content = xml + described_class.reset_cache! + end + + describe '#identity_metadata_source' do + it 'depends on remove_other_Id' do + obj.identityMetadata.remove_other_Id('catkey', '129483625') + obj.identityMetadata.remove_other_Id('barcode', '36105049267078') + obj.identityMetadata.add_other_Id('catkey', '129483625') + expect(indexer.identity_metadata_source).to eq 'Symphony' + obj.identityMetadata.remove_other_Id('catkey', '129483625') + obj.identityMetadata.add_other_Id('barcode', '36105049267078') + expect(indexer.identity_metadata_source).to eq 'Symphony' + obj.identityMetadata.remove_other_Id('barcode', '36105049267078') + expect(indexer.identity_metadata_source).to eq 'DOR' + obj.identityMetadata.remove_other_Id('foo', 'bar') + expect(indexer.identity_metadata_source).to eq 'DOR' + end + + it 'indexes metadata source' do + expect(indexer.identity_metadata_source).to eq 'Symphony' + end + end + + describe '#to_solr' do + let(:doc) { indexer.to_solr } + + context 'with related objects' do + let(:mock_rel_druid) { 'druid:does_not_exist' } + let(:mock_rels_ext_xml) do + %( + + + + + + ) + end + + before do + allow(obj.datastreams['RELS-EXT']).to receive(:content).and_return(mock_rels_ext_xml) + end + + context 'when related collection and APOs are not found' do + before do + allow(Dor).to receive(:find).with(mock_rel_druid).and_raise(ActiveFedora::ObjectNotFoundError) + end + + it 'generate collections and apo title fields' do + expect(doc[Solrizer.solr_name('collection_title', :symbol)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('collection_title', :stored_searchable)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('apo_title', :symbol)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('apo_title', :stored_searchable)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('nonhydrus_apo_title', :symbol)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('nonhydrus_apo_title', :stored_searchable)].first).to eq mock_rel_druid + end + end + + context 'when related collection and APOs are found' do + let(:mock_obj) { instance_double(Dor::Item, full_title: 'Test object', tags: '') } + + before do + allow(Dor).to receive(:find).with(mock_rel_druid).and_return(mock_obj) + end + + it 'generate collections and apo title fields' do + expect(doc[Solrizer.solr_name('collection_title', :symbol)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('collection_title', :stored_searchable)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('apo_title', :symbol)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('apo_title', :stored_searchable)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('nonhydrus_apo_title', :symbol)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('nonhydrus_apo_title', :stored_searchable)].first).to eq 'Test object' + end + end + end + + it 'indexes metadata source' do + expect(doc).to match a_hash_including('metadata_source_ssi' => 'Symphony') + end + end + + describe '#related_obj_display_title' do + subject { indexer.send(:related_obj_display_title, mock_apo_obj, mock_default_title) } + + let(:mock_default_title) { 'druid:zy098xw7654' } + + context 'when the main title is available' do + let(:mock_apo_obj) { instance_double(Dor::AdminPolicyObject, full_title: 'apo title') } + + it { is_expected.to eq 'apo title' } + end + + context 'when the first descMetadata main title entry is empty string' do + let(:mock_apo_obj) { instance_double(Dor::AdminPolicyObject, full_title: nil) } + + it { is_expected.to eq mock_default_title } + end + + context 'when the related object is nil' do + let(:mock_apo_obj) { nil } + + it { is_expected.to eq mock_default_title } + end + end +end diff --git a/spec/indexers/processable_indexer_spec.rb b/spec/indexers/processable_indexer_spec.rb new file mode 100644 index 00000000..ea7d0888 --- /dev/null +++ b/spec/indexers/processable_indexer_spec.rb @@ -0,0 +1,162 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe ProcessableIndexer do + let(:indexer) { described_class.new(resource: obj) } + + describe '#simplified_status_code_disp_txt' do + let(:obj) do + instance_double(Dor::Item) + end + + it "trims off parens but doesn't harm the strings otherwise" do + expect(indexer.send(:simplified_status_code_disp_txt, 2)).to eq('In accessioning') + expect(indexer.send(:simplified_status_code_disp_txt, 3)).to eq('In accessioning') + end + end + + describe '#to_solr' do + let(:obj) do + instance_double(Dor::Item, + current_version: '4', + pid: '99', + modified_date: '1999-12-20') + end + + let(:solr_doc) { indexer.to_solr } + + context 'with rights set' do + let(:version_metadata) do + instance_double(Dor::VersionMetadataDS, tag_for_version: 'tag7', description_for_version: 'desc7', current_version_id: '7') + end + let(:obj) do + instance_double(Dor::Item, + rights: 'World', + modified_date: '1999-12-20', + current_version: '7', + versionMetadata: version_metadata) + end + + describe '#to_solr' do + let(:indexer) do + CompositeIndexer.new( + described_class + ).new(resource: obj) + end + + before do + allow_any_instance_of(Dor::StatusService).to receive(:milestones).and_return({}) + end + + it 'includes a rights facet' do + expect(solr_doc).to match a_hash_including('rights_ssim' => ['World']) + end + + it 'does not error if there is nothing in the datastream' do + allow(obj).to receive(:rightsMetadata).and_return(Dor::RightsMetadataDS.new) + expect { solr_doc }.not_to raise_error + end + end + end + + context 'with milestones' do + let(:dsxml) do + ' + + + Initial version + + + Replacing main PDF + + + Fixed title typo + + + Another typo + + + ' + end + let(:xml) do + Nokogiri::XML(' + + published + opened + submitted + published + accessioned + described + opened + submitted + described + published + published + ') + end + let(:version_metadata) { Dor::VersionMetadataDS.from_xml(dsxml) } + + before do + allow(Dor::Config.workflow.client.lifecycle_routes).to receive(:query_lifecycle).and_return(xml) + allow(obj).to receive(:versionMetadata).and_return(version_metadata) + end + + it 'includes the semicolon delimited version, an earliest published date and a status' do + # lifecycle_display should have the semicolon delimited version + expect(solr_doc['lifecycle_ssim']).to include('published:2012-01-27T05:06:54Z;2') + # published date should be the first published date + expect(solr_doc).to match a_hash_including('status_ssi' => 'v4 In accessioning (described, published)') + expect(solr_doc).to match a_hash_including('opened_dttsim' => including('2012-11-07T00:21:02Z')) + expect(solr_doc['published_earliest_dttsi']).to eq('2012-01-27T05:06:54Z') + expect(solr_doc['published_latest_dttsi']).to eq('2012-11-07T00:59:39Z') + expect(solr_doc['published_dttsim'].first).to eq(solr_doc['published_earliest_dttsi']) + expect(solr_doc['published_dttsim'].last).to eq(solr_doc['published_latest_dttsi']) + expect(solr_doc['published_dttsim'].size).to eq(3) # not 4 because 1 deduplicated value removed! + expect(solr_doc['opened_earliest_dttsi']).to eq('2012-10-29T23:30:07Z') # 2012-10-29T16:30:07-0700 + expect(solr_doc['opened_latest_dttsi']).to eq('2012-11-07T00:21:02Z') # 2012-11-06T16:21:02-0800 + end + + it 'skips the versioning related steps if a new version has not been opened' do + allow(Dor::Config.workflow.client.lifecycle_routes).to receive(:query_lifecycle).and_return(Nokogiri::XML(' + + submitted + described + published + published + ')) + expect(solr_doc['opened_dttsim']).to be_nil + end + + it 'creates a modified_latest date field' do + # the facet field should have a date in it. + expect(solr_doc['modified_latest_dttsi']).to match(/^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ$/) + end + + it 'creates a version field for each version, including the version number, tag and description' do + expect(solr_doc['versions_ssm'].length).to be > 1 + expect(solr_doc['versions_ssm']).to include('4;2.2.0;Another typo') + end + + it 'handles a missing description for a version' do + dsxml = ' + + + Initial version + + + Replacing main PDF + + + Fixed title typo + + + + + ' + allow(obj).to receive(:versionMetadata).and_return(Dor::VersionMetadataDS.from_xml(dsxml)) + expect(solr_doc['versions_ssm']).to include('4;2.2.0;') + end + end + end +end diff --git a/spec/indexers/releasable_indexer_spec.rb b/spec/indexers/releasable_indexer_spec.rb new file mode 100644 index 00000000..168d1760 --- /dev/null +++ b/spec/indexers/releasable_indexer_spec.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe ReleasableIndexer do + let(:obj) { instance_double(Dor::Abstract) } + + describe 'to_solr' do + let(:doc) { described_class.new(resource: obj).to_solr } + + let(:released_for_info) do + { + 'Project' => { 'release' => true }, + 'test_target' => { 'release' => true }, + 'test_nontarget' => { 'release' => false } + } + end + let(:service) { instance_double(Dor::ReleaseTagService, released_for: released_for_info) } + let(:released_to_field_name) { Solrizer.solr_name('released_to', :symbol) } + + before do + allow(Dor::ReleaseTagService).to receive(:for).and_return(service) + end + + it 'indexes release tags' do + expect(doc).to match a_hash_including(released_to_field_name => %w[Project test_target]) + end + end +end diff --git a/spec/indexers/workflow_indexer_spec.rb b/spec/indexers/workflow_indexer_spec.rb new file mode 100644 index 00000000..f4487da1 --- /dev/null +++ b/spec/indexers/workflow_indexer_spec.rb @@ -0,0 +1,179 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe WorkflowIndexer do + before do + stub_request(:get, 'https://workflow.example.edu/workflow_templates/accessionWF') + .to_return(status: 200, body: workflow_template_json) + end + + let(:document) { Dor::Workflow::Response::Workflow.new(xml: xml) } + let(:indexer) { described_class.new(workflow: document) } + + let(:workflow_template_json) do + '{"processes":[{"name":"hello"},{"name":"goodbye"},{"name":"technical-metadata"},{"name":"some-other-step"}]}' + end + + let(:step1) { 'hello' } + let(:step2) { 'goodbye' } + let(:step3) { 'technical-metadata' } + let(:step4) { 'some-other-step' } + + describe '#to_solr' do + subject(:solr_doc) { indexer.to_solr.to_h } + + context 'when not all of the steps are completed' do + let(:xml) do + <<-XML + + + + + + XML + end + + it 'creates the workflow_status field with the workflow repository included, and indicates that the workflow is still active' do + expect(solr_doc[Solrizer.solr_name('workflow_status', :symbol)].first).to eq('accessionWF|active|0|dor') + end + end + + context 'when the template has been changed to have new steps, but the workflow service indicates all steps are completed' do + let(:workflow_template_json) do + '{"processes":[{"name":"hello"},{"name":"goodbye"},{"name":"technical-metadata"},{"name":"some-other-step"}]}' + end + + let(:xml) do + <<-XML + + + + + + XML + end + + it 'indicates that the workflow is complete' do + expect(solr_doc[Solrizer.solr_name('workflow_status', :symbol)].first).to eq('accessionWF|completed|0|dor') + end + end + + context 'when all steps are completed or skipped' do + let(:xml) do + <<-XML + + + + + + + + XML + end + + it 'indexes the right workflow status (completed)' do + expect(solr_doc).to match a_hash_including('workflow_status_ssim' => ['accessionWF|completed|0|dor']) + end + end + + context 'when the xml has dates for completed and errored steps' do + let(:xml) do + <<-XML + + + + + + + + XML + end + + it 'indexes the iso8601 UTC dates' do + expect(solr_doc).to match a_hash_including('wf_accessionWF_hello_dttsi' => '2012-11-07T00:18:57Z') + expect(solr_doc).to match a_hash_including('wf_accessionWF_technical-metadata_dttsi' => '2012-11-07T00:18:58Z') + end + end + + context 'when the xml does not have dates for completed and errored steps' do + let(:xml) do + <<-XML + + + + + + + + XML + end + + it 'only indexes the dates on steps that include a date' do + expect(solr_doc).to match a_hash_including('wf_accessionWF_technical-metadata_dttsi') + expect(solr_doc).not_to match a_hash_including('wf_accessionWF_hello_dttsi') + expect(solr_doc).not_to match a_hash_including('wf_accessionWF_start-accession_dttsi') + expect(solr_doc).not_to match a_hash_including('wf_accessionWF_goodbye_dttsi') + end + end + + context 'when there are error messages' do + let(:xml) do + <<-XML + + + + + + XML + end + + let(:wf_error) { solr_doc[Solrizer.solr_name('wf_error', :symbol)] } + + it 'indexes the error messages' do + expect(wf_error).to eq ['accessionWF:technical-metadata:druid:gv054hp4128 - Item error; caused by 413 Request Entity Too Large:'] + end + end + + context 'when the error messages are crazy long' do + let(:error_length) { 40_000 } + let(:error) { (0...error_length).map { rand(65..90).chr }.join } + let(:xml) do + <<-XML + + + + + + XML + end + + let(:wf_error) { solr_doc[Solrizer.solr_name('wf_error', :symbol)] } + + it "truncates the error messages to below Solr's limit" do + # 31 is the leader + expect(wf_error.first.length).to be < 32_766 + end + end + end +end diff --git a/spec/indexers/workflows_indexer_spec.rb b/spec/indexers/workflows_indexer_spec.rb new file mode 100644 index 00000000..3f5402fc --- /dev/null +++ b/spec/indexers/workflows_indexer_spec.rb @@ -0,0 +1,135 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe WorkflowsIndexer do + let(:obj) { instance_double(Dor::Item, pid: 'druid:ab123cd4567') } + + let(:indexer) { described_class.new(resource: obj) } + + describe '#to_solr' do + let(:solr_doc) { indexer.to_solr } + let(:xml) do + <<~XML + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + XML + end + + let(:accession_json) do + { 'processes' => [ + { 'name' => 'start-accession' }, + { 'name' => 'descriptive-metadata' }, + { 'name' => 'rights-metadata' }, + { 'name' => 'content-metadata' }, + { 'name' => 'technical-metadata' }, + { 'name' => 'remediate-object' }, + { 'name' => 'shelve' }, + { 'name' => 'publish' }, + { 'name' => 'provenance-metadata' }, + { 'name' => 'sdr-ingest-transfer' }, + { 'name' => 'sdr-ingest-received' }, + { 'name' => 'reset-workspace' }, + { 'name' => 'end-accession' } + ] } + end + + let(:assembly_json) do + { 'processes' => [ + { 'name' => 'start-assembly' }, + { 'name' => 'content-metadata-create' }, + { 'name' => 'jp2-create' }, + { 'name' => 'checksum-compute' }, + { 'name' => 'exif-collect' }, + { 'name' => 'accessioning-initiate' } + ] } + end + + let(:dissemination_json) do + { + 'processes' => [ + { 'name' => 'cleanup' } + ] + } + end + + let(:hydrus_json) do + { 'processes' => [ + { 'name' => 'start-deposit' }, + { 'name' => 'submit' }, + { 'name' => 'approve' }, + { 'name' => 'start-assembly' } + ] } + end + + let(:versioning_json) do + { 'processes' => [ + { 'name' => 'start-version' }, + { 'name' => 'submit-version' }, + { 'name' => 'start-accession' } + ] } + end + + before do + allow(Dor::Config.workflow.client).to receive(:workflow_template).with('accessionWF').and_return(accession_json) + allow(Dor::Config.workflow.client).to receive(:workflow_template).with('assemblyWF').and_return(assembly_json) + allow(Dor::Config.workflow.client).to receive(:workflow_template).with('disseminationWF').and_return(dissemination_json) + allow(Dor::Config.workflow.client).to receive(:workflow_template).with('hydrusAssemblyWF').and_return(hydrus_json) + allow(Dor::Config.workflow.client).to receive(:workflow_template).with('versioningWF').and_return(versioning_json) + + allow(Dor::Config.workflow.client.workflow_routes).to receive(:all_workflows) + .and_return(Dor::Workflow::Response::Workflows.new(xml: xml)) + end + + describe 'workflow_status_ssim' do + subject { solr_doc['workflow_status_ssim'] } + + it { is_expected.to eq ['accessionWF|completed|0|dor', 'assemblyWF|active|1|dor', 'disseminationWF|completed|0|dor', 'hydrusAssemblyWF|completed|0|dor', 'versioningWF|completed|0|dor'] } + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 1a325387..066caa5f 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -29,6 +29,8 @@ ] SimpleCov.start 'rails' +require 'webmock/rspec' + RSpec.configure do |config| # rspec-expectations config goes here. You can use an alternate # assertion/expectation library such as wrong or the stdlib/minitest