Permalink
Browse files

Move TerminologyBasedSolrizer into OM

  • Loading branch information...
1 parent ab1f2bc commit 52a36503e2c3af158cc1583d530d35d9aa2e102f @jcoyne jcoyne committed Feb 3, 2013
@@ -1,6 +1,3 @@
-require "nokogiri"
-require 'yaml'
-
module Solrizer
# Provides utilities for extracting solr fields from a variety of objects and/or creating solr documents from a given object
@@ -34,8 +34,6 @@ class UnknownIndexMacro < SolrizerError; end #nodoc#
# # t.dish_name :index_as => [:some_field_type] -maps to-> dish_name_ssim
# # t.ingredients :index_as => [:some_field_type, :edible] -maps to-> ingredients_ssim, ingredients_food
#
- # (See Solrizer::XML::TerminologyBasedSolrizer for instructions on applying a custom mapping once you have defined it.)
- #
#
# == Custom Value Converters
#
View
@@ -1,7 +1,5 @@
-require "solrizer"
-require "om"
module Solrizer::XML
end
Dir[File.join(File.dirname(__FILE__),"xml","*.rb")].each {|file| require file }
-Solrizer::Extractor.send(:include, Solrizer::XML::Extractor)
+Solrizer::Extractor.send(:include, Solrizer::XML::Extractor)
@@ -4,7 +4,7 @@ module Solrizer::XML::Extractor
#
# This method extracts solr fields from simple xml
- # If you want to do anything more nuanced with the xml, use TerminologyBasedSolrizer instead.
+ # If you want to do anything more nuanced with the xml, use OM instead.
#
# @param [xml] text xml content to index
# @param [Hash] solr_doc
@@ -1,92 +0,0 @@
-# This module is only suitable to mix into Classes that use the OM::XML::Document Module
-module Solrizer::XML::TerminologyBasedSolrizer
- def self.included(klass)
- klass.send(:include, Solrizer::Common)
- klass.send(:extend, ClassMethods)
- end
-
- # Module Methods
- module ClassMethods
-
- # Build a solr document from +doc+ based on its terminology
- # @param [OM::XML::Document] doc
- # @param [Hash] (optional) solr_doc (values hash) to populate
- def solrize(doc, solr_doc=Hash.new, field_mapper = nil)
- unless doc.class.terminology.nil?
- doc.class.terminology.terms.each_pair do |term_name,term|
- doc.solrize_term(term, solr_doc, field_mapper)
- end
- end
-
- return solr_doc
- end
-
- # Populate a solr document with fields based on nodes in +xml+
- # Values for a term are gathered according to +term_pointer+ using OM::XML::TermValueOperators.term_values
- # and are deserialized by OM according to :type, as determined in its terminology.
- # The content of the actual field in solr is each +node+ of the +nodeset+ returned by OM,
- # rendered to a string.
- # @param [OM::XML::Document] doc xml document to extract values from
- # @param [OM::XML::Term] term corresponding to desired xml values
- # @param [Hash] (optional) solr_doc (values hash) to populate
- def solrize_term(doc, term, solr_doc = Hash.new, field_mapper = nil, opts={})
- parents = opts.fetch(:parents, [])
- term_pointer = parents+[term.name]
- nodeset = doc.term_values(*term_pointer)
-
- nodeset.each do |n|
- doc.solrize_node(n, term_pointer, term, solr_doc, field_mapper)
-# FIXME: there should be no dependencies on OM in Solrizer
- unless term.kind_of? OM::XML::NamedTermProxy
- term.children.each_pair do |child_term_name, child_term|
- doc.solrize_term(child_term, solr_doc, field_mapper, opts={:parents=>parents+[{term.name=>nodeset.index(n)}]})
- end
- end
- end
- solr_doc
- end
-
- # Populate a solr document with solr fields corresponding to the given xml node
- # Field names are generated using settings from the term in the +doc+'s terminology corresponding to +term_pointer+
- # If the supplied term does not have an index_as attribute, no indexing will be performed.
- # @param [Nokogiri::XML::Node] node to solrize
- # @param [OM::XML::Document] doc document the node came from
- # @param [Array] term_pointer Array pointing to the term that should be used for solrization settings
- # @param [Term] term the term to be solrized
- # @param [Hash] (optional) solr_doc (values hash) to populate
- # @return [Hash] the solr doc
- def solrize_node(node_value, doc, term_pointer, term, solr_doc = Hash.new, field_mapper = nil, opts = {})
- return solr_doc unless term.index_as && !term.index_as.empty?
-
-# FIXME: there should be no dependencies on OM in Solrizer
- generic_field_name_base = OM::XML::Terminology.term_generic_name(*term_pointer)
- create_and_insert_terms(generic_field_name_base, node_value, term.index_as, solr_doc)
-
- if term_pointer.length > 1
-# FIXME: there should be no dependencies on OM in Solrizer
- hierarchical_field_name_base = OM::XML::Terminology.term_hierarchical_name(*term_pointer)
- create_and_insert_terms(hierarchical_field_name_base, node_value, term.index_as, solr_doc)
- end
- solr_doc
- end
-
- end
-
-
- # Instance Methods
-
- attr_accessor :field_mapper
-
- def to_solr(solr_doc = Hash.new, field_mapper = self.field_mapper) # :nodoc:
- self.class.solrize(self, solr_doc, field_mapper)
- end
-
- def solrize_term(term, solr_doc = Hash.new, field_mapper = self.field_mapper, opts={})
- self.class.solrize_term(self, term, solr_doc, field_mapper, opts)
- end
-
- def solrize_node(node, term_pointer, term, solr_doc = Hash.new, field_mapper = self.field_mapper, opts={})
- self.class.solrize_node(node, self, term_pointer, term, solr_doc, field_mapper, opts)
- end
-
-end
View
@@ -13,7 +13,6 @@ Gem::Specification.new do |s|
s.description = %q{Use solrizer to populate solr indexes. You can run solrizer from within your app, using the provided rake tasks, or as a JMS listener}
s.add_dependency "nokogiri"
- s.add_dependency "om", ">=1.5.0"
s.add_dependency "xml-simple"
s.add_dependency "mediashelf-loggable", "~>0.4.7"
s.add_dependency "stomp"
@@ -1,90 +0,0 @@
-module Samples
- class ModsArticle
-
- include OM::XML::Document
-
- set_terminology do |t|
- t.root(:path=>"mods", :xmlns=>"http://www.loc.gov/mods/v3", :schema=>"http://www.loc.gov/standards/mods/v3/mods-3-2.xsd", "xmlns:foo"=>"http://my.custom.namespace")
-
-
- t.title_info(:path=>"titleInfo") {
- t.main_title(:index_as=>[:facetable],:path=>"title", :label=>"title") {
- t.main_title_lang(:path=>{:attribute=> "xml:lang"})
- }
- t.french_title(:ref=>[:title_info,:main_title], :attributes=>{"xml:lang"=>"fre"})
-
- t.language(:index_as=>[:facetable, :stored_searchable],:path=>{:attribute=>"lang"})
- }
- t.language{
- t.lang_code(:index_as=>[:facetable], :path=>"languageTerm", :attributes=>{:type=>"code"})
- }
- t.abstract(:index_as=>[:stored_searchable])
- t.subject {
- t.topic(:index_as=>[:facetable])
- }
- t.topic_tag(:proxy=>[:subject, :topic], :index_as=>[:stored_searchable])
- # t.topic_tag(:index_as=>[:facetable],:path=>"subject", :default_content_path=>"topic")
- # This is a mods:name. The underscore is purely to avoid namespace conflicts.
- t.name_ {
- # this is a namepart
- t.namePart(:type=>:string, :label=>"generic name")
- # affiliations are great
- t.affiliation
- t.institution(:path=>"affiliation", :index_as=>[:facetable], :label=>"organization")
- t.displayForm
- t.role(:ref=>[:role])
- t.description(:index_as=>[:facetable])
- t.date(:path=>"namePart", :attributes=>{:type=>"date"})
- t.last_name(:path=>"namePart", :attributes=>{:type=>"family"}, :index_as=>[:stored_searchable])
- t.first_name(:path=>"namePart", :attributes=>{:type=>"given"}, :label=>"first name")
- t.terms_of_address(:path=>"namePart", :attributes=>{:type=>"termsOfAddress"})
- t.computing_id
- t.name_content(:path=>"text()")
- }
- # lookup :person, :first_name
- t.person(:ref=>:name, :attributes=>{:type=>"personal"}, :index_as=>[:facetable])
- t.department(:proxy=>[:person,:description],:index_as=>[:facetable])
- t.organization(:ref=>:name, :attributes=>{:type=>"corporate"}, :index_as=>[:facetable])
- t.conference(:ref=>:name, :attributes=>{:type=>"conference"}, :index_as=>[:facetable])
- t.role {
- t.text(:path=>"roleTerm",:attributes=>{:type=>"text"}, :index_as=>[:stored_searchable])
- t.code(:path=>"roleTerm",:attributes=>{:type=>"code"})
- }
- t.journal(:path=>'relatedItem', :attributes=>{:type=>"host"}) {
- t.title_info(:index_as=>[:facetable],:ref=>[:title_info])
- t.origin_info(:path=>"originInfo") {
- t.publisher
- t.date_issued(:path=>"dateIssued", :type => :date, :index_as => [:stored_searchable])
- t.issuance(:index_as=>[:facetable])
- }
- t.issn(:path=>"identifier", :attributes=>{:type=>"issn"})
- t.issue(:path=>"part") {
- t.volume(:path=>"detail", :attributes=>{:type=>"volume"}, :default_content_path=>"number")
- t.level(:path=>"detail", :attributes=>{:type=>"number"}, :default_content_path=>"number")
- t.extent
- t.pages(:path=>"extent", :attributes=>{:unit=>"pages"}) {
- t.start
- t.end
- }
- t.start_page(:proxy=>[:pages, :start])
- t.end_page(:proxy=>[:pages, :end])
- t.publication_date(:path=>"date", :type => :date, :index_as => [:stored_searchable])
- }
- }
- t.note
- t.location(:path=>"location") {
- t.url(:path=>"url")
- }
- t.publication_url(:proxy=>[:location,:url])
- t.title(:proxy=>[:title_info, :main_title])
- t.journal_title(:proxy=>[:journal, :title_info, :main_title])
- t.pub_date(:proxy=>[:journal, :issue, :publication_date])
- t.issue_date(:ref=>[:journal, :origin_info, :date_issued], :type=> :date)
- end
-
- # Changes from OM::Properties implementation
- # renamed family_name => last_name
- # start_page & end_page now accessible as [:journal, :issue, :pages, :start] (etc.)
-
- end
-end
@@ -1,109 +0,0 @@
-require 'spec_helper'
-require 'fixtures/mods_article'
-
-# TODO: there should be no dependencies on OM in Solrizer
-describe Solrizer::XML::TerminologyBasedSolrizer do
-
- before(:all) do
- Samples::ModsArticle.send(:include, Solrizer::XML::TerminologyBasedSolrizer)
- end
-
- before(:each) do
- article_xml = fixture( File.join("mods_articles", "hydrangea_article1.xml") )
- @mods_article = Samples::ModsArticle.from_xml(article_xml)
- end
-
- describe ".to_solr" do
-
- it "should provide .to_solr and return a SolrDocument" do
- @mods_article.should respond_to(:to_solr)
- @mods_article.to_solr.should be_kind_of(Hash)
- end
-
- it "should optionally allow you to provide the Hash to add fields to and return that document when done" do
- doc = Hash.new
- @mods_article.to_solr(doc).should equal(doc)
- end
-
- it "should iterate through the terminology terms, calling .solrize_term on each and passing in the solr doc" do
- solr_doc = Hash.new
- @mods_article.field_mapper = Solrizer::FieldMapper.new
- Samples::ModsArticle.terminology.terms.each_pair do |k,v|
- @mods_article.should_receive(:solrize_term).with(v, solr_doc, @mods_article.field_mapper)
- end
- @mods_article.to_solr(solr_doc)
- end
-
- it "should use Solr mappings to generate field names" do
- solr_doc = @mods_article.to_solr
- solr_doc["abstract"].should be_nil
- # NOTE: OM's old default expected stored and indexed; this is a change.
- solr_doc["abstract_tesim"].should == ["ABSTRACT"]
- solr_doc["title_info_1_language_tesim"].should == ["finnish"]
- solr_doc["person_1_role_0_text_tesim"].should == ["teacher"]
- # No index_as on the code field.
- solr_doc["person_1_role_0_code_tesim"].should be_nil
- solr_doc["person_last_name_tesim"].sort.should == ["FAMILY NAME", "Gautama"]
- solr_doc["topic_tag_tesim"].sort.should == ["CONTROLLED TERM", "TOPIC 1", "TOPIC 2"]
- # These are a holdover from an old version of OM
- solr_doc['journal_0_issue_0_publication_date_dtsim'].should == ["2007-02-01T00:00:00Z"]
- end
-
- end
-
- describe ".solrize_term" do
-
- it "should add fields to a solr document for all nodes corresponding to the given term and its children" do
- solr_doc = Hash.new
- result = @mods_article.solrize_term(Samples::ModsArticle.terminology.retrieve_term(:title_info), solr_doc)
- result.should == solr_doc
- end
-
- it "should add multiple fields based on index_as" do
- fake_solr_doc = {}
- term = Samples::ModsArticle.terminology.retrieve_term(:name)
- term.children[:namePart].index_as = [:searchable, :displayable, :facetable]
-
- @mods_article.solrize_term(term, fake_solr_doc)
-
- expected_names = ["DR.", "FAMILY NAME", "GIVEN NAMES"]
- %w(_teim _sim).each do |suffix|
- actual_names = fake_solr_doc["name_0_namePart#{suffix}"].sort
- actual_names.should == expected_names
- end
- end
-
- it "should add fields based on type using proxy" do
- unless RUBY_VERSION.match("1.8.7")
- solr_doc = Hash.new
- result = @mods_article.solrize_term(Samples::ModsArticle.terminology.retrieve_term(:pub_date), solr_doc)
- solr_doc["pub_date_dtsim"].should == ["2007-02-01T00:00:00Z"]
- end
- end
-
- it "should add fields based on type using ref" do
- solr_doc = Hash.new
- result = @mods_article.solrize_term(Samples::ModsArticle.terminology.retrieve_term(:issue_date), solr_doc)
- solr_doc["issue_date_dtsim"].should == ["2007-02-15T00:00:00Z"]
- end
-
- it "shouldn't index terms where index_as is an empty array" do
- fake_solr_doc = {}
- term = Samples::ModsArticle.terminology.retrieve_term(:name)
- term.children[:namePart].index_as = []
-
- @mods_article.solrize_term(term, fake_solr_doc)
- fake_solr_doc["name_0_namePart_teim"].should be_nil
- end
-
- it "should index terms where index_as is searchable" do
- fake_solr_doc = {}
- term = Samples::ModsArticle.terminology.retrieve_term(:name)
- term.children[:namePart].index_as = [:searchable]
-
- @mods_article.solrize_term(term, fake_solr_doc)
-
- fake_solr_doc["name_0_namePart_teim"].sort.should == ["DR.", "FAMILY NAME", "GIVEN NAMES"]
- end
- end
-end

0 comments on commit 52a3650

Please sign in to comment.