Skip to content

Commit

Permalink
basic metadata extraction from ISO, FGDC, and MODS
Browse files Browse the repository at this point in the history
  • Loading branch information
Darren Hardy committed Nov 14, 2015
1 parent a7b6029 commit 668ba71
Show file tree
Hide file tree
Showing 17 changed files with 771 additions and 4 deletions.
13 changes: 13 additions & 0 deletions app/helpers/metadata_extraction_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Mixin for works that carry external metadata files: copies the properties
# extracted from such a file onto the including object.
module MetadataExtractionHelper
  # Extracts properties from the constituent external metadata file and
  # assigns each one to this object through the matching `<name>=` writer.
  # @return [Hash] the extracted properties; empty when `metadata_files` is blank
  # TODO: Does not support multiple external metadata files
  def extract_metadata
    return {} if metadata_files.blank?

    # Only the first metadata file is consulted (see TODO above)
    metadata_files.first.extract_metadata.tap do |properties|
      properties.each { |name, value| send("#{name}=", value) }
    end
  end
end
89 changes: 88 additions & 1 deletion app/models/concerns/external_metadata_file_behavior.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ module ExternalMetadataFileBehavior
included do
# Specifies the metadata standard to which the metadata file conforms
# @see http://dublincore.org/documents/dcmi-terms/#terms-conformsTo
property :conforms_to, predicate: ::RDF::DC.conformsTo do |index|
property :conforms_to, predicate: ::RDF::DC.conformsTo, multiple: false do |index|
index.as :stored_searchable, :facetable
end

Expand Down Expand Up @@ -43,4 +43,91 @@ def vector_file?
# Identifies this object as an external metadata file.
# @return [Boolean] always true
def external_metadata_file?
true
end

# Extracts properties from the constituent external metadata file by
# dispatching to the `extract_<standard>_metadata` method whose name matches
# the lower-cased `conforms_to` value (e.g. 'ISO19139' routes to
# `extract_iso19139_metadata`), passing it the parsed `metadata_xml`.
# @return [Hash] whatever the standard-specific extractor returns
# @raise [RuntimeError] if `conforms_to` is unset or names a standard that
#   has no extractor method
def extract_metadata
  # to_s lets an unset (nil) conforms_to reach the descriptive error below
  # instead of failing with NoMethodError on nil.downcase
  extractor = "extract_#{conforms_to.to_s.downcase}_metadata"
  fail "Unsupported metadata standard: #{conforms_to}" unless respond_to?(extractor)

  send(extractor, metadata_xml)
end

# Retrieves the content of the attached original file (a PCDM::File) and
# parses it as XML.
# @return [Nokogiri::XML::Document]
def metadata_xml
  raw_xml = original_file.content
  Nokogiri::XML(raw_xml)
end

# Extracts basic descriptive properties from an ISO 19139 metadata document.
#
# Every property is optional: a key appears in the result only when the
# corresponding element is found, so incomplete records yield a partial Hash
# rather than raising. When an element occurs more than once, the last
# usable occurrence wins.
#
# TODO: Migrate this code into an XSLT? Need to support multivalued fields
# @param doc [Nokogiri::XML::Document] parsed ISO 19139 (gmd/gco) metadata
# @return [Hash] any of :title, :description, :creator, :source (each a
#   single-element Array of stripped text) and :bounding_box (a String of
#   "south west north east" decimal values)
def extract_iso19139_metadata(doc)
  ns = {
    'xmlns:gmd' => 'http://www.isotc211.org/2005/gmd',
    'xmlns:gco' => 'http://www.isotc211.org/2005/gco'
  }
  ident = '//gmd:identificationInfo/gmd:MD_DataIdentification'
  role_code = "#{ident}/gmd:citation/gmd:CI_Citation/gmd:citedResponsibleParty/gmd:CI_ResponsibleParty/gmd:role/gmd:CI_RoleCode"
  h = {}

  doc.xpath("#{ident}/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString", ns).each do |node|
    h[:title] = [node.text.strip]
  end

  doc.xpath("#{ident}/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox", ns).each do |node|
    bounds = %w(westBoundLongitude eastBoundLongitude northBoundLatitude southBoundLatitude).map do |name|
      node.at_xpath("gmd:#{name}/gco:Decimal", ns)
    end
    # An incomplete box cannot be expressed; skip it instead of calling .text on nil
    next if bounds.any?(&:nil?)

    w, e, n, s = bounds.map { |bound| bound.text.to_f }
    h[:bounding_box] = "#{s} #{w} #{n} #{e}"
  end

  doc.xpath("#{ident}/gmd:abstract/gco:CharacterString", ns).each do |node|
    h[:description] = [node.text.strip]
  end

  # TODO: Not sure if custodian is the same as source
  { 'originator' => :creator, 'custodian' => :source }.each do |role, key|
    doc.xpath("#{role_code}[@codeListValue='#{role}']", ns).each do |node|
      # The name elements hang off an ancestor of the role code; prefer the
      # individual's name and fall back to the organisation's.
      name = node.at_xpath('ancestor-or-self::*/gmd:individualName', ns) ||
             node.at_xpath('ancestor-or-self::*/gmd:organisationName', ns)
      h[key] = [name.text.strip] if name
    end
  end

  h
end

# Extracts basic descriptive properties from an FGDC metadata document.
#
# A key appears in the result only when its element exists in the document,
# so an incomplete record yields a partial Hash instead of raising on nil.
# @param doc [Nokogiri::XML::Document] parsed FGDC metadata
# @return [Hash] any of :title, :description, :creator (each a single-element
#   Array holding the element text as-is, not stripped) and :bounding_box
#   (a String of "south west north east" decimal values)
def extract_fgdc_metadata(doc)
  h = {}
  {
    title: '//idinfo/citation/citeinfo/title',
    description: '//idinfo/descript/abstract',
    creator: '//idinfo/citation/citeinfo/origin'
  }.each do |key, path|
    node = doc.at_xpath(path)
    h[key] = [node.text] if node
  end

  doc.xpath('//idinfo/spdom/bounding').each do |node|
    bounds = %w(westbc eastbc northbc southbc).map { |name| node.at_xpath(name) }
    # Skip a box that lacks any of its four bounds rather than calling .text on nil
    next if bounds.any?(&:nil?)

    w, e, n, s = bounds.map { |bound| bound.text.to_f }
    h[:bounding_box] = "#{s} #{w} #{n} #{e}"
  end

  h
end

# Extracts basic descriptive properties from a MODS v3 metadata document.
#
# Only the first matching title and the first matching abstract are used.
# A key appears in the result only when its element exists, so a record
# without a title or abstract yields a partial Hash instead of raising on nil.
# @param doc [Nokogiri::XML::Document] parsed MODS metadata
# @return [Hash] any of :title and :description, each a single-element Array
#   holding the element text as-is (not stripped)
def extract_mods_metadata(doc)
  ns = {
    'xmlns:mods' => 'http://www.loc.gov/mods/v3'
  }
  paths = {
    title: '//mods:mods/mods:titleInfo/mods:title',
    description: '//mods:mods/mods:abstract'
  }

  paths.each_with_object({}) do |(key, path), h|
    node = doc.at_xpath(path, ns)
    h[key] = [node.text] if node
  end
end
end
1 change: 1 addition & 0 deletions app/models/concerns/image_work_behavior.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Attributes and methods for image works
module ImageWorkBehavior
extend ActiveSupport::Concern
include ::MetadataExtractionHelper
included do
type [Hydra::PCDM::Vocab::PCDMTerms.Object,
Hydra::Works::Vocab::WorksTerms.GenericWork,
Expand Down
1 change: 1 addition & 0 deletions app/models/concerns/raster_work_behavior.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Attributes and methods for raster works
module RasterWorkBehavior
extend ActiveSupport::Concern
include ::MetadataExtractionHelper
included do
type [Hydra::PCDM::Vocab::PCDMTerms.Object,
Hydra::Works::Vocab::WorksTerms.GenericWork,
Expand Down
1 change: 1 addition & 0 deletions app/models/concerns/vector_work_behavior.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Attributes and methods for vector works
module VectorWorkBehavior
extend ActiveSupport::Concern
include ::MetadataExtractionHelper
included do
type [Hydra::PCDM::Vocab::PCDMTerms.Object,
Hydra::Works::Vocab::WorksTerms.GenericWork,
Expand Down
1 change: 1 addition & 0 deletions spec/factories/external_metadata_files.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
end

# Runs after each build: defaults the file's metadata standard to ISO 19139
# and passes the evaluator user's key to apply_depositor_metadata.
after(:build) do |file, evaluator|
file.conforms_to = 'ISO19139'
file.apply_depositor_metadata(evaluator.user.user_key)
end

Expand Down
66 changes: 63 additions & 3 deletions spec/models/external_metadata_file_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
end

it 'updates the metadata schema' do
subject.attributes = { conforms_to: ['ISO19139'] }
expect(subject.conforms_to).to eq(['ISO19139'])
subject.attributes = { conforms_to: 'ISO19139' }
expect(subject.conforms_to).to eq('ISO19139')
end

describe 'metadata' do
Expand Down Expand Up @@ -60,11 +60,71 @@
it 'has attached content' do
expect(subject.association(:original_file)).to be_kind_of ActiveFedora::Associations::DirectlyContainsOneAssociation
end

# metadata_xml should hand back a Nokogiri document built from original_file.
# original_file is stubbed with a new Hydra::PCDM::File, so the example does
# not depend on any stored file content.
it 'will read the PCDM::File for its XML' do
expect(subject).to receive(:original_file) { Hydra::PCDM::File.new }
expect(subject.metadata_xml).to be_kind_of Nokogiri::XML::Document
end

# Routing check: conforms_to 'ISO19139' must dispatch to extract_iso19139_metadata.
# The extractor is mocked without a return value, hence the nil result.
it 'will route the extraction request for ISO' do
expect(subject).to receive(:original_file) { Hydra::PCDM::File.new }
expect(subject).to receive(:extract_iso19139_metadata)
subject.conforms_to = 'ISO19139'
expect(subject.extract_metadata).to be_nil
end

# Routing check with a mixed-case standard name: 'Fgdc' must still dispatch to
# extract_fgdc_metadata. The extractor is mocked without a return value,
# hence the nil result.
it 'will route the extraction request for FGDC' do
expect(subject).to receive(:original_file) { Hydra::PCDM::File.new }
expect(subject).to receive(:extract_fgdc_metadata)
subject.conforms_to = 'Fgdc'
expect(subject.extract_metadata).to be_nil
end

# Routing check: conforms_to 'mods' must dispatch to extract_mods_metadata.
# The extractor is mocked without a return value, hence the nil result.
it 'will route the extraction request for MODS' do
expect(subject).to receive(:original_file) { Hydra::PCDM::File.new }
expect(subject).to receive(:extract_mods_metadata)
subject.conforms_to = 'mods'
expect(subject.extract_metadata).to be_nil
end

# A standard with no matching extract_<standard>_metadata method must raise
# rather than silently return nothing.
it 'will not route the extraction request for bogus standard' do
subject.conforms_to = 'bogus'
expect { subject.extract_metadata }.to raise_error(RuntimeError)
end

# Runs the ISO 19139 extractor against the McKay fixture. The `include`
# matcher only checks the listed keys, so other extracted keys are not asserted.
# bounding_box is expected in "south west north east" order.
it 'can extract ISO 19139 metadata' do
doc = Nokogiri::XML(read_test_data_fixture('McKay/S_566_1914_clip_iso.xml'))
expect(subject.extract_iso19139_metadata(doc)).to include({
title: ['S_566_1914_clip.tif'],
bounding_box: '56.580532 -112.47033 57.465375 -109.622454',
description: ['This raster file is the result of georeferencing using esri\'s ArcScan of Sheet 566: McKay, Alberta, 1st ed. 1st of July, 1914. This sheet is part of the 3-mile to 1-inch sectional maps of Western Canada. vectorization was undertaken to extract a measure of line work density in order to measure Cartographic Intactness. The map series is described in Dubreuil, Lorraine. 1989. Sectional maps of western Canada, 1871-1955: An early Canadian topographic map series. Occasional paper no. 2, Association of Canadian Map Libraries and Archives.'],
source: ['Larry Laliberte']
})
end

# Runs the FGDC extractor against the zipcodes fixture and checks title,
# bounding box ("south west north east"), creator and description.
it 'can extract FGDC metadata' do
doc = Nokogiri::XML(read_test_data_fixture('zipcodes_fgdc.xml'))
expect(subject.extract_fgdc_metadata(doc)).to include({
title: ['Louisiana ZIP Code Areas 2002'],
bounding_box: '28.926478 -94.043286 33.019481 -88.817478',
creator: ['Geographic Data Technology, Inc. (GDT)'],
description: ['Louisiana ZIP Code Areas represents five-digit ZIP Code areas used by the U.S. Postal Service to deliver mail more effectively. The first digit of a five-digit ZIP Code divides the country into 10 large groups of states numbered from 0 in the Northeast to 9 in the far West. Within these areas, each state is divided into an average of 10 smaller geographical areas, identified by the 2nd and 3rd digits. These digits, in conjunction with the first digit, represent a sectional center facility or a mail processing facility area. The 4th and 5th digits identify a post office, station, branch or local delivery area.']
})
end

# Runs the MODS extractor against the bb099zb1450 fixture. The expected title
# keeps its trailing space because the MODS extractor does not strip text, and
# the description is the first of the fixture's two <abstract> elements.
it 'can extract MODS metadata' do
doc = Nokogiri::XML(read_test_data_fixture('bb099zb1450_mods.xml'))
expect(subject.extract_mods_metadata(doc)).to include({
title: ['Department Boundary: Haute-Garonne, France, 2010 '],
description: ["This polygon shapefile represents the Department Boundary for the Haute-Garonne department of France for 2010. These are the level 2 administrative divisions (ADM2) of Haute-Garonne. Department is one of the three levels of government below the national level (\"territorial collectivities\"), between the region and the commune. There are 96 departments in metropolitan France and 5 overseas departments, which also are classified as regions. Departments are further subdivided into 342 arrondissements, themselves divided into cantons; the latter two have no autonomy and are used for the organisation of public services and sometimes elections."]

})
end

describe "to_solr" do
let(:solr_doc) { FactoryGirl.build(:external_metadata_file,
date_uploaded: Date.today,
conforms_to: ['ISO19139']).to_solr
conforms_to: 'ISO19139').to_solr
}

it "indexes bbox field" do
Expand Down
17 changes: 17 additions & 0 deletions spec/models/vector_work_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,21 @@
expect(subject.keys).to include 'bounding_box_tesim'
end
end

# Checks that a vector work can populate its own properties from the first
# of its attached external metadata files.
describe 'extract_metadata' do
  subject { FactoryGirl.create(:vector_work_with_metadata_files) }

  it 'has an extraction method' do
    expect(subject).to respond_to(:extract_metadata)
  end

  it 'can route extraction' do
    iso_doc = Nokogiri::XML(read_test_data_fixture('McKay/S_566_1914_clip_iso.xml'))
    metadata_file = subject.metadata_files.first
    expect(metadata_file.conforms_to.downcase).to eq('iso19139')
    # Feed the fixture document to the extractor in place of metadata_xml
    allow(metadata_file).to receive(:metadata_xml) { iso_doc }
    subject.extract_metadata
    expect(subject.title).to eq(['S_566_1914_clip.tif'])
  end
end
end
4 changes: 4 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,7 @@ def loaddb
end
end
end

# Reads a fixture from the `test-data` directory that sits one level above
# this file's directory.
# @param name [String] fixture path relative to `test-data`
# @return [String] the file's contents
def read_test_data_fixture(name)
  fixture_dir = File.join(File.dirname(__FILE__), '..', 'test-data')
  File.read(File.join(fixture_dir, name))
end
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
83 changes: 83 additions & 0 deletions test-data/bb099zb1450_mods.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<mods xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" version="3.4" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-4.xsd">
<titleInfo>
<title>Department Boundary: Haute-Garonne, France, 2010 </title>
</titleInfo>
<name type="corporate">
<namePart>Articque informatique</namePart>
<role>
<roleTerm type="text" authority="marcrelator">creator</roleTerm>
</role>
</name>
<typeOfResource>cartographic</typeOfResource>
<typeOfResource>software, multimedia</typeOfResource>
<genre authority="lcgft" valueURI="http://id.loc.gov/authorities/genreForms/gf2011026297">Geospatial data</genre>
<genre authority="rdacontent" valueURI="http://rdvocab.info/termList/RDAContentType/1001">cartographic dataset</genre>
<originInfo>
<publisher>Articque informatique</publisher>
<place>
<placeTerm type="text">Fondettes, FR</placeTerm>
</place>
<dateIssued encoding="w3cdtf" keyDate="yes">2010</dateIssued>
<dateValid encoding="w3cdtf">2010</dateValid>
<edition>1</edition>
</originInfo>
<language>
<languageTerm authority="iso639-2b">eng</languageTerm>
</language>
<physicalDescription>
<form>Shapefile</form>
<digitalOrigin>born digital</digitalOrigin>
</physicalDescription>
<subject>
<cartographics>
<scale>Scale not given.</scale>
<projection>EPSG::2154</projection>
<coordinates>(E 0°23ʹ50ʺ--E 2°3ʹ39ʺ/N 43°55ʹ37ʺ--N 42°41ʹ3ʺ)</coordinates>
</cartographics>
</subject>
<abstract displayLabel="Abstract" lang="eng">This polygon shapefile represents the Department Boundary for the Haute-Garonne department of France for 2010. These are the level 2 administrative divisions (ADM2) of Haute-Garonne. Department is one of the three levels of government below the national level ("territorial collectivities"), between the region and the commune. There are 96 departments in metropolitan France and 5 overseas departments, which also are classified as regions. Departments are further subdivided into 342 arrondissements, themselves divided into cantons; the latter two have no autonomy and are used for the organisation of public services and sometimes elections.</abstract>
<abstract displayLabel="Purpose" lang="eng">This dataset is intended for researchers, students, and policy makers for reference and mapping purposes, and may be used for basic applications such as viewing, querying, and map output production, or to provide a basemap to support graphical overlays and analysis with other spatial data.</abstract>
<note displayLabel="Preferred citation" lang="eng">Articque informatique. (2010). Department Boundary: Haute-Garonne, France, 2010. Articque informatique. Available at: http://purl.stanford.edu/bb099zb1450</note>
<subject>
<topic authority="lcsh" authorityURI="http://id.loc.gov/authorities/subjects.html" lang="eng">Administrative and political divisions</topic>
</subject>
<subject>
<topic authority="lcsh" authorityURI="http://id.loc.gov/authorities/subjects.html" lang="eng">Departments</topic>
</subject>
<subject>
<geographic lang="eng" valueURI="http://sws.geonames.org/3013767/" authority="geonames" authorityURI="http://www.geonames.org/ontology#">Haute-Garonne (France)</geographic>
</subject>
<subject>
<temporal encoding="w3cdtf">2010</temporal>
</subject>
<subject>
<topic authority="ISO19115TopicCategory" authorityURI="http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_TopicCategoryCode" valueURI="boundaries">Boundaries</topic>
</subject>
<location>
<url>http://purl.stanford.edu/bb099zb1450</url>
</location>
<recordInfo>
<recordContentSource>Stanford</recordContentSource>
<recordIdentifier>edu.stanford.purl:bb099zb1450</recordIdentifier>
<recordOrigin>This record was translated from ISO 19139 to MODS v.3 using an xsl transformation.</recordOrigin>
<languageOfCataloging>
<languageTerm authority="iso639-2b" type="code">eng</languageTerm>
</languageOfCataloging>
</recordInfo>
<extension displayLabel="geo">
<rdf:RDF xmlns:gml="http://www.opengis.net/gml/3.2/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description rdf:about="http://purl.stanford.edu/bb099zb1450">
<dc:format>application/x-esri-shapefile; format=Shapefile</dc:format>
<dc:type>Dataset#Polygon</dc:type>
<gml:boundedBy>
<gml:Envelope gml:srsName="EPSG:4326">
<gml:lowerCorner>0.441292 42.68919</gml:lowerCorner>
<gml:upperCorner>2.048281 43.921245</gml:upperCorner>
</gml:Envelope>
</gml:boundedBy>
<dc:coverage rdf:resource="http://sws.geonames.org/3013767/about.rdf" dc:language="eng" dc:title="Haute-Garonne (France)"/>
</rdf:Description>
</rdf:RDF>
</extension>
<subject authority="EPSG" valueURI="http://opengis.net/def/crs/EPSG/0/4326" displayLabel="WGS84"><cartographics><scale>Scale not given.</scale><projection>EPSG::4326</projection><coordinates>E 0°26ʹ29ʺ--E 2°2ʹ54ʺ/N 43°55ʹ16ʺ--N 42°41ʹ21ʺ</coordinates></cartographics></subject><note displayLabel="WGS84 Cartographics">This layer is presented in the WGS84 coordinate system for web display purposes. Downloadable data are provided in native coordinate system or projection.</note></mods>

0 comments on commit 668ba71

Please sign in to comment.