Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor sort process into a service
- Loading branch information
Showing
11 changed files
with
670 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,244 @@ | ||
# Provides a service for sorting an array of hashes based on the values at a specified key in each hash.
module Qa | ||
module LinkedData | ||
class DeepSortService
  # Wrap every hash of the array in a comparable element so the array can be sorted with Array#sort.
  # NOTE: each hash is modified in place; its value at sort_key is replaced with the language-sorted
  # list produced by Qa::LinkedData::LanguageSortService.
  # @param the_array [Array<Hash<Symbol,Array<RDF::Literal>>>] the array of hashes to sort
  # @param sort_key [Symbol] the key in the hash on whose value the array will be sorted
  # @param preferred_language [Symbol] language that sorts ahead of all other languages (defaults to :en)
  # @return instance of this class
  # @example the_array parameter
  #   [
  #     {:uri=>[#<RDF::URI:0x3fcff54a829c URI:http://id.loc.gov/authorities/names/n2010043281>],
  #      :id=>[#<RDF::Literal:0x3fcff4a367b4("n 2010043281")>],
  #      :label=>[#<RDF::Literal:0x3fcff54a9a98("Valli, Sabrina"@en)>],
  #      :altlabel=>[],
  #      :sort=>[#<RDF::Literal:0x3fcff54b4c18("2")>]},
  #     {:uri=>[#<RDF::URI:0x3fcff54a829c URI:http://id.loc.gov/authorities/names/n201002344>],
  #      :id=>[#<RDF::Literal:0x3fcff4a367b4("n 201002344")>],
  #      :label=>[#<RDF::Literal:0x3fcff54a9a98("Cornell, Joseph"@en)>],
  #      :altlabel=>[],
  #      :sort=>[#<RDF::Literal:0x3fcff54b4c18("1")>]}
  #   ]
  def initialize(the_array, sort_key, preferred_language = :en)
    @sortable_elements = the_array.map { |element| DeepSortElement.new(element, sort_key, preferred_language) }
  end

  # Sort an array of hash on the specified sort key. The value in the hash at sort key is expected to be an array
  # with one or more values that are RDF::Literals that translate to a number (e.g. 2), a string number (e.g. "3"),
  # a string (e.g. "hello"), or a language qualified string (e.g. "hello"@en).
  # The sort occurs in the following precedence.
  # * preference for numeric sort (if only one value each and both are integers or a string that can be converted to an integer)
  # * single value sort (if only one value each and at least one is not an integer)
  # * multiple values sort (if either has multiple values)
  # @return [Array<Hash>] the sorted array
  # @example returned sorted array
  #   [
  #     {:uri=>[#<RDF::URI:0x3fcff54a829c URI:http://id.loc.gov/authorities/names/n201002344>],
  #      :id=>[#<RDF::Literal:0x3fcff4a367b4("n 201002344")>],
  #      :label=>[#<RDF::Literal:0x3fcff54a9a98("Cornell, Joseph"@en)>],
  #      :altlabel=>[],
  #      :sort=>[#<RDF::Literal:0x3fcff54b4c18("1")>]},
  #     {:uri=>[#<RDF::URI:0x3fcff54a829c URI:http://id.loc.gov/authorities/names/n2010043281>],
  #      :id=>[#<RDF::Literal:0x3fcff4a367b4("n 2010043281")>],
  #      :label=>[#<RDF::Literal:0x3fcff54a9a98("Valli, Sabrina"@en)>],
  #      :altlabel=>[],
  #      :sort=>[#<RDF::Literal:0x3fcff54b4c18("2")>]}
  #   ]
  def sort
    @sortable_elements.sort.map(&:element)
  end

  # Comparable wrapper around one hash of the array being sorted. It holds the hash itself (element) and
  # the language-sorted list of values found at the sort key (literals), and implements <=> on those values.
  class DeepSortElement
    # A whole-string, optionally signed, run of decimal digits.
    INTEGER_PATTERN = /\A[-+]?\d+\z/.freeze

    attr_reader :element, :literals, :preferred_language
    private :preferred_language

    # @param element [Hash] the hash being wrapped; its value at sort_key is replaced by the sorted list
    # @param sort_key [Object] key in element holding the values to sort on
    # @param preferred_language [Symbol, nil] language that sorts ahead of all other languages
    def initialize(element, sort_key, preferred_language)
      element[sort_key] = Qa::LinkedData::LanguageSortService.new(element[sort_key], preferred_language).sort
      @element = element
      # A hash without the sort key yields nil; treat it as "no values" so it can still be compared.
      @literals = element[sort_key] || []
      @preferred_language = preferred_language
      # Both flags are computed exactly once here so a false result is cached as reliably as a true one.
      @has_preferred_language = !filtered_literals(preferred_language).empty?
      @all_same_language = languages.size <= 1
    end

    # @return [Integer] number of values at the sort key
    def size
      literals.size
    end

    # @param other [DeepSortElement] the element to compare against
    # @return [Integer] -1, 0, or 1 following the numeric / single value / multiple value precedence
    def <=>(other)
      return numeric_comparator(other) if integer? && other.integer?
      return single_value_comparator(other) if single? && other.single?
      multiple_value_comparator(other)
    end

    # @return true if there is a single literal that is an integer or a string that can be converted to an integer; otherwise, false
    def integer?
      single? && literal.to_s.match?(INTEGER_PATTERN)
    end

    # @param idx [Integer] position of the literal to convert
    # @return [Integer] the literal at idx parsed as a decimal integer
    def integer(idx = 0)
      # Base 10 is explicit: without it "09" raises (invalid octal) and "010" is read as 8.
      Integer(literal(idx).to_s, 10)
    end

    # @return true if there is only one value; otherwise, false
    def single?
      literals.size == 1
    end

    # @param idx [Integer] position in the list of values
    # @return the value at idx (nil when out of range)
    def literal(idx = 0)
      literals[idx]
    end

    # @param idx [Integer] position in the list of values
    # @return [String] lower-cased string form of the value at idx
    def downcase_string(idx = 0)
      to_downcase(literal(idx))
    end

    # @param idx [Integer] position in list
    # @param list [Array] list to read from; defaults to this element's literals
    # @return the language of list[idx], or nil when that value does not respond to #language
    def language(idx = 0, list = literals)
      language_of(list[idx])
    end

    # @return true if at least one value is tagged with the preferred language; otherwise, false
    def includes_preferred_language?
      @has_preferred_language
    end

    # @return true if every value carries the same language (or none does, or there are no values); otherwise, false
    def all_same_language?
      @all_same_language
    end

    # @return [Array] the distinct languages of the values, in order of first appearance (nil = no language)
    def languages
      filtered_literals_by_language.keys
    end

    # @param filter_language [Symbol, nil] language to select (nil selects values without a language)
    # @return [Array] the values tagged with filter_language; empty when there are none
    def filtered_literals(filter_language)
      filtered_literals_by_language.fetch(filter_language, [])
    end

    private

    # If both test values are single value and both are integers, do a numeric sort
    def numeric_comparator(other)
      integer <=> other.integer
    end

    # If both test values are single value and at least one is not numeric, do a string sort taking language into consideration
    # * sort values if neither has a language marker or they both have the same language marker
    # * otherwise, sort language markers
    def single_value_comparator(other)
      return downcase_string <=> other.downcase_string if same_language?(literal, other.literal)
      compare_languages(language, other.language)
    end

    # Order two differing languages: preferred language first, no language last, otherwise by language marker.
    def compare_languages(lang, other_lang)
      return -1 if preferred_language? lang
      return 1 if preferred_language? other_lang
      return -1 if no_language? other_lang
      return 1 if no_language? lang
      lang <=> other_lang
    end

    # If at least one of the test values has multiple values, sort the multiple values taking language into consideration
    # * if both lists have all the same language or no language markers at all, just sort the lists and compare each element
    # * if both lists have the preferred language, sort the two lists by element after filtering for the preferred language
    # * otherwise, sort by language until there is a difference
    def multiple_value_comparator(other)
      return single_language_list_comparator(other) if all_same_language? && other.all_same_language?
      return specified_language_list_comparator(other, preferred_language) if includes_preferred_language? && other.includes_preferred_language?
      multi_language_list_comparator(other)
    end

    def single_language_list_comparator(other)
      list_comparator(literals, other.literals)
    end

    # Compare only the values tagged with the given language; an element having such values sorts first.
    def specified_language_list_comparator(other, language)
      filtered = filtered_literals(language)
      other_filtered = other.filtered_literals(language)
      return -1 if !filtered.empty? && other_filtered.empty?
      return 1 if filtered.empty? && !other_filtered.empty?
      list_comparator(filtered, other_filtered)
    end

    # Walk through language sorted lists
    # * for each language, determine how closely the list of terms matches
    # * prioritize the list that gets the most low values
    def multi_language_list_comparator(other)
      combined_languages = (languages + other.languages).uniq
      score = combined_languages.sum do |lang|
        list_comparator(filtered_literals(lang), other.filtered_literals(lang))
      end
      score <=> 0
    end

    # Case-insensitive, element-by-element comparison of two lists of values.
    def list_comparator(list, other_list)
      # two empty lists are equal; this keeps <=> symmetric for elements without values
      return 0 if list.empty? && other_list.empty?
      # if an element doesn't have any terms in a language, the other element sorts lower
      return -1 if other_list.empty?
      return 1 if list.empty?
      shorter_list_size = [list.size, other_list.size].min
      list.first(shorter_list_size).zip(other_list).each do |lit, other_lit|
        cmp = to_downcase(lit) <=> to_downcase(other_lit)
        return cmp unless cmp.zero?
      end
      list.size <=> other_list.size # didn't find any diffs, shorter list is considered lower
    end

    def same_language?(lit, other_lit)
      return false if only_one_has_language_marker?(lit, other_lit)
      return true if neither_have_language_markers?(lit, other_lit)
      lit.language == other_lit.language
    end

    def neither_have_language_markers?(lit, other_lit)
      !language?(lit) && !language?(other_lit)
    end

    def only_one_has_language_marker?(lit, other_lit)
      language?(lit) != language?(other_lit)
    end

    def language?(lit)
      language_of(lit).present?
    end

    # Language of a single value; nil when the value does not respond to #language.
    def language_of(lit)
      lit.language if lit.respond_to?(:language)
    end

    def preferred_language?(language)
      preferred_language.present? ? language == preferred_language : false
    end

    def no_language?(language)
      language.blank?
    end

    def to_downcase(lit)
      lit.to_s.downcase
    end

    # Values grouped by language (nil key = no language); built on first use and then reused.
    def filtered_literals_by_language
      @filtered_literals_by_language ||= create_all_filters
    end

    def create_all_filters
      literals.group_by { |lit| language_of(lit) }
    end
  end
  private_constant :DeepSortElement
end
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# Service to sort an array of literals by language and within language. | ||
module Qa | ||
module LinkedData | ||
class LanguageSortService
  # Bin key used for literals that carry no language marker.
  LANGUAGE_LOCALE_KEY_FOR_NO_LANGUAGE = :NO_LANGUAGE

  attr_reader :literals, :preferred_language
  attr_reader :languages, :bins
  private :literals, :preferred_language, :languages, :bins

  # @param literals [Array<RDF::Literal>] string literals to sort
  # @param preferred_language [Symbol] preferred language to appear first in the list; defaults to no preference
  # @return instance of this class
  def initialize(literals, preferred_language = nil)
    @literals = literals
    @preferred_language = preferred_language
    @languages = []
    @bins = {}
  end

  # Sort the literals stored in this instance of the service. Literals are grouped by language; the
  # preferred language comes first, the remaining languages follow in sorted order, and literals without
  # a language come last. Within a language, literals are ordered case-insensitively by string value.
  # The list passed to the constructor is not reordered.
  # @return sorted version of literals (the input itself when it is nil or empty)
  def sort
    return literals unless literals.present?
    @sorted_literals ||= build_sorted_literals
  end

  private

  # Run the three sorting phases and assemble the result.
  def build_sorted_literals
    parse_into_language_bins
    sort_languages
    sort_language_bins
    construct_sorted_literals
  end

  # Concatenate the bins following the final language order.
  def construct_sorted_literals
    languages.flat_map { |lang| bins[lang] }
  end

  # @return the literal's language, or LANGUAGE_LOCALE_KEY_FOR_NO_LANGUAGE when it has none
  def language(literal)
    language = literal.language if literal.respond_to?(:language)
    language.present? ? language : LANGUAGE_LOCALE_KEY_FOR_NO_LANGUAGE
  end

  def move_no_language_to_end
    return unless languages.include?(LANGUAGE_LOCALE_KEY_FOR_NO_LANGUAGE)
    languages << languages.delete(LANGUAGE_LOCALE_KEY_FOR_NO_LANGUAGE)
  end

  def move_preferred_language_to_front
    return unless preferred_language.present? && languages.include?(preferred_language)
    languages.unshift(languages.delete(preferred_language))
  end

  # Group the literals by language; the distinct languages are the keys of the resulting bins.
  def parse_into_language_bins
    @bins = literals.group_by { |literal| language(literal) }
    @languages = bins.keys
  end

  def sort_languages
    languages.sort!
    move_preferred_language_to_front
    move_no_language_to_end
  end

  def sort_language_bins
    bins.each_value { |bin| bin.sort_by! { |literal| literal.to_s.downcase } }
  end
end
end | ||
end |
Oops, something went wrong.