Permalink
Browse files

Initial import into git

  • Loading branch information...
0 parents commit 19b9f40b1b83eecf958b2600d6f3f944df6c7eaf @rsinger committed Aug 13, 2009
Showing with 531 additions and 0 deletions.
  1. +6 −0 README
  2. +9 −0 lcsh_labels.rb
  3. +277 −0 marcmodeler.rb
  4. +122 −0 ntriple_parser.rb
  5. +117 −0 rdf_resource.rb
6 README
@@ -0,0 +1,6 @@
+MARC2RDF Modeler is a set of scripts that provide an easily hackable and modifiable way to take MARC21 bibliographic records and model them as RDF/XML.
+
+Requirements:
+ * enhanced_marc (http://github.com/rsinger/enhanced-marc/tree)
+ * datamapper (for LCSH linking)
+ * sqlite
9 lcsh_labels.rb
@@ -0,0 +1,9 @@
+require 'rubygems'
+require 'dm-core'
+# Point DataMapper's default repository at the SQLite file lcsh_labels.db in
+# the current working directory.
+DataMapper.setup(:default, "sqlite3:///#{Dir.pwd}/lcsh_labels.db")
+# DataMapper model pairing a label string with a URI; both columns are indexed
+# so rows can be looked up by either value.
+class Label
+ include DataMapper::Resource
+ property :id, Serial
+ property :uri, String, :index => true
+ property :label, String, :index => true
+end
277 marcmodeler.rb
@@ -0,0 +1,277 @@
+$KCODE = 'u'
+require 'rubygems'
+#require 'marc'
+require 'jcode'
+require 'enhanced_marc'
+require 'rdf_resource'
+require 'lcsh_labels'
+reader = MARC::Reader.new('cca.utf8.mrc')
+
+i = 0
+
+class MARC::Record
+ @@base_uri = 'http://library.cca.edu/core'
+ @@missing_id_prefix = 'cca'
+ @@missing_id_counter = 0
+
+ def strip_trailing_punct(str)
+ return str.sub(/[\.:,;\/\s]\s*$/,'').strip
+ end
+
+ def slug_literal(str)
+ slug = str.gsub(/[^\w\s\-]/,"")
+ slug.gsub!(/\s/,"_")
+ slug.downcase
+ end
+
+ def subdivided?(subject)
+ subject.subfields.each do | subfield |
+ if ["k","v","x","y","z"]
+ return true
+ end
+ end
+ return false
+ end
+
+ def subject_to_string(subject)
+ literal = ''
+ subject.subfields.each do | subfield |
+ if !literal.empty?
+ if ["v","x","y","z"].index(subfield.code)
+ literal << '--'
+ else
+ literal << ' ' if subfield.value =~ /^[\w\d]/
+ end
+ end
+ literal << subfield.value
+ end
+ literal.sub(/\.\s*/,'')
+ end
+
+ def top_concept(subject)
+ field = MARC::DataField.new(subject.tag, subject.indicator1, subject.indicator2)
+ subject.subfields.each do | subfield |
+ unless ["k","v","x","y","z"].index(subfield.code)
+ sub = MARC::Subfield.new(subfield.code, subfield.value)
+ field.append(sub)
+ end
+ end
+ return field
+ end
+
+ def to_rdf_resources
+ resources = []
+ unless self['001']
+ controlnum = MARC::ControlField.new('001')
+ controlnum.value = "#{@@missing_id_prefix}#{@@missing_id_counter}"
+ @@missing_id_counter += 1
+ self << controlnum
+ end
+ id = self['001'].value
+ resources << manifestation = RDFResource.new("#{@@base_uri}/m/#{id}")
+ manifestation.relate("[rdf:type]", "[frbr:Manifestation]")
+ if self['245']
+ if self['245']['a']
+ title = strip_trailing_punct(self['245']['a'])
+ manifestation.assert("[rda:titleProper]", strip_trailing_punct(self['245']['a']))
+ else
+ puts "No 245$a: #{self['245']}"
+ end
+ if self['245']['b']
+ title << " "+strip_trailing_punct(self['245']['b'])
+ manifestation.assert("[rda:otherTitleInformation]", strip_trailing_punct(self['245']['b']))
+ end
+ if self['245']['c']
+ manifestation.assert("[rda:statementOfResponsibility]", strip_trailing_punct(self['245']['c']))
+ end
+ end
+ manifestation.assert("[dct:title]", title)
+ if self['210']
+ manifestation.assert("[bibo:shortTitle]", strip_trailing_punct(self['210']['a']))
+ end
+ if self['020'] && self['020']['a']
+ manifestation.assert("[bibo:isbn]", strip_trailing_punct(self['020']['a']))
+ end
+
+ if self['022'] && self['022']['a']
+ manifestation.assert("[bibo:issn]", strip_trailing_punct(self['022']['a']))
+ end
+ if self['250'] && self['250']['a']
+ manifestation.assert("[bibo:edition]", self['250']['a'])
+ end
+ if self['246'] && self['246']['a']
+ manifestation.assert("[rda:parallelTitleProper]", strip_trailing_punct(self['246']['a']))
+ end
+ if self['767'] && self['767']['t']
+ manifestation.assert("[rda:parallelTitleProper]", strip_trailing_punct(self['767']['t']))
+ end
+ subjects = self.find_all {|field| field.tag =~ /^6../}
+
+ subjects.each do | subject |
+ authority = false
+ authorities = []
+ literal = subject_to_string(subject)
+ manifestation.assert("[dc:subject]", literal)
+ if !["653","690","691","696","697", "698", "699"].index(subject.tag) && subject.indicator2 =~ /^(0|1)$/
+ Label.all(:label=>literal).each do | auth |
+ next if (subject.indicator2 == "0" && auth.uri =~ /http:\/\/lcsubjects\.org\/subjects\/sj/) ||
+ (subject.indicator2 == "1" && auth.uri =~ /http:\/\/lcsubjects\.org\/subjects\/sh/)
+ manifestation.relate("[dct:subject]", auth.uri)
+ authorities << auth.uri
+ authority = true
+ end
+ end
+ if ["600","610","611","630"].index(subject.tag) || !authority
+
+ slugged_id = slug_literal(literal)
+
+ if subject.tag =~ /^(600|610|696|697)$/
+ if !subdivided?(subject)
+ concept = RDFResource.new("#{@@base_uri}/i/#{slugged_id}#concept")
+ identity = RDFResource.new("#{@@base_uri}/i/#{slugged_id}")
+ else
+ concept = RDFResource.new("#{@@base_uri}/s/#{slugged_id}#concept")
+ identity_subject = top_concept(subject)
+ identity = RDFResource.new("#{@@base_uri}/i/#{slug_literal(subject_to_string(identity_subject))}")
+ end
+ if subject.tag =~ /^(600|696)$/
+ identity.relate("[rdf:type]","[foaf:Person]")
+ if subject['u']
+ identity.assert("[ov:affiliation]", subject['u'].sub)
+ end
+ concept.relate("[skos:inScheme]", "#{@@base_uri}/s#personalNames")
+ else
+ identity.relate("[rdf:type]","[foaf:Organization]")
+ identity.assert("[dct:description]", subject['u'])
+ concept.relate("[skos:inScheme]", "#{@@base_uri}/s#corporateNames")
+ end
+ concept.relate("[rdfs:seeAlso]", identity.uri)
+ identity.relate("[rdfs:seeAlso]", concept.uri)
+ name = subject['a']
+ if subject['b']
+ name << " #{subject['b']}"
+ end
+ identity.assert("[foaf:name]",name)
+ if subject['d']
+ identity.assert("[dct:date]", subject['d'])
+ end
+ resources << identity
+ elsif subject.tag =~ /^(611|698)$/
+ if !subdivided?(subject)
+ concept = RDFResource.new("#{@@base_uri}/e/#{slugged_id}#concept")
+ event = RDFResource.new("#{@@base_uri}/e/#{slugged_id}")
+ else
+ concept = RDFResource.new("#{@@base_uri}/s/#{slugged_id}#concept")
+ event_subject = top_concept(subject)
+ event = RDFResource.new("#{@@base_uri}/e/#{slug_literal(subject_to_string(identity_subject))}")
+ end
+ concept.relate("[skos:inScheme]", "#{@@base_uri}/s#meetings")
+ event.relate("[rdf:type]","[event:Event]")
+ concept.relate("[rdfs:seeAlso]", event.uri)
+ event.relate("[rdfs:seeAlso]", concept.uri)
+ event.assert("[dct:title]", subject['a'])
+ if subject['d']
+ event.assert("[dct:date]", subject['d'])
+ end
+ if subject['c']
+ event.assert("[dct:description]", subject['c'])
+ end
+ resources << event
+ elsif subject.tag =~ /^(630|699)$/
+ unless subdivided?(subject)
+ concept = RDFResource.new("#{@@base_uri}/w/#{slugged_id}#concept")
+ work = RDFResource.new("#{@@base_uri}/w/#{slugged_id}")
+ else
+ concept = RDFResource.new("#{@@base_uri}/s/#{slugged_id}#concept")
+ work_subject = top_concept(subject)
+ work = RDFResource.new("#{@@base_uri}/w/#{slug_literal(subject_to_string(identity_subject))}")
+ end
+ concept.relate("[skos:inScheme]", "#{@@base_uri}/s#uniformTitles")
+ work.relate("[rdf:type]","[frbr:Work]")
+ concept.relate("[rdfs:seeAlso]", work.uri)
+ work.relate("[rdfs:seeAlso]", concept.uri)
+ work.assert("[dct:title]", subject['a'])
+ if subject['d']
+ work.assert("[dct:date]", subject['d'])
+ end
+ if subject['f']
+ work.assert("[dct:date]", subject['f'])
+ end
+ resources << work
+ else
+ concept = RDFResource.new("#{@@base_uri}/s/#{slugged_id}#concept")
+ if subject.tag =~ /^(650|690)$/
+ concept.relate("[skos:inScheme]","#{@@base_uri}/s#topicalTerms")
+ elsif subject.tag =~ /^(651|691)$/
+ concept.relate("[skos:inScheme]","#{@@base_uri}/s#geographicNames")
+ elsif subject.tag = "655"
+ concept.relate("[skos:inScheme]","#{@@base_uri}/s#genreFormTerms")
+ elsif subject.tag = "648"
+ concept.relate("[skos:inScheme]","#{@@base_uri}/s#chronologicalTerms")
+ elsif subject.tag = "656"
+ concept.relate("[skos:inScheme]","#{@@base_uri}/s#occupations")
+ end
+ end
+ concept.assert("[skos:prefLabel]", literal)
+
+ authorities.each do | auth |
+ concept.relate("[skos:exactMatch]", auth)
+ end
+
+ subject.subfields.each do | subfield |
+ scheme = case subfield.code
+ when "v" then "#{@@base_uri}/s#formSubdivision"
+ when "x" then "#{@@base_uri}/s#generalSubdivision"
+ when "y" then "#{@@base_uri}/s#chronologicalSubdivision"
+ when "z" then "#{@@base_uri}/s#geographicSubdivision"
+ else nil
+ end
+ if scheme
+ concept.relate("[skos:inScheme]",scheme)
+ end
+ end
+ resources << concept
+ end
+ authority = false
+ end
+ if self['010'] && self['010']['a']
+ manifestation.assert("[bibo:lccn]", self['010']['a'])
+ end
+ resources
+ end
+end
+
+class MARC::BookRecord
+
+ def to_rdf_resources
+ resources = super
+ book = resources[0]
+ book.relate("[rdf:type]", "[bibo:Book]")
+ if self.nature_of_contents
+ self.nature_of_contents(true).each do | genre |
+ book.assert("[cat:genre]", genre)
+ end
+ end
+ #puts book.to_rdfxml
+ return resources
+ end
+end
+
+class MARC::DataField
+ def [](code)
+ subfield = self.find {|s| s.code == code}
+ return subfield.value.sub(/\.\s*/,'') if subfield
+ return
+ end
+end
+
+
+@resources = []
+reader.each do | record |
+ @resources += record.to_rdf_resources
+ i += 1
+ break if i > 100
+end
+@resources.each do | resource |
+ puts resource.to_rdfxml
+end
122 ntriple_parser.rb
@@ -0,0 +1,122 @@
+$KCODE = 'u'
+require 'rubygems'
+require 'strscan'
+require 'iconv'
+require 'jcode'
+require 'uri'
+require 'active_support'
+require 'lcsh_labels'
+
+class UTF8Parser < StringScanner
+ STRING = /(([\x0-\x1f]|[\\\/bfnrt]|\\u[0-9a-fA-F]{4}|[\x20-\xff])*)/nx
+ UNPARSED = Object.new
+ UNESCAPE_MAP = Hash.new { |h, k| h[k] = k.chr }
+ UNESCAPE_MAP.update({
+ ?" => '"',
+ ?\\ => '\\',
+ ?/ => '/',
+ ?b => "\b",
+ ?f => "\f",
+ ?n => "\n",
+ ?r => "\r",
+ ?t => "\t",
+ ?u => nil,
+ })
+ UTF16toUTF8 = Iconv.new('utf-8', 'utf-16be')
+ def initialize(str)
+ super(str)
+ @string = str
+ end
+ def parse_string
+ if scan(STRING)
+ return '' if self[1].empty?
+ string = self[1].gsub(%r((?:\\[\\bfnrt"/]|(?:\\u(?:[A-Fa-f\d]{4}))+|\\[\x20-\xff]))n) do |c|
+ if u = UNESCAPE_MAP[$&[1]]
+ u
+ else # \uXXXX
+ bytes = ''
+ i = 0
+ while c[6 * i] == ?\\ && c[6 * i + 1] == ?u
+ bytes << c[6 * i + 2, 2].to_i(16) << c[6 * i + 4, 2].to_i(16)
+ i += 1
+ end
+ UTF16toUTF8.iconv(bytes)
+ end
+ end
+ if string.respond_to?(:force_encoding)
+ string.force_encoding(Encoding::UTF_8)
+ end
+ string
+ else
+ UNPARSED
+ end
+ rescue Iconv::Failure => e
+ raise GeneratorError, "Caught #{e.class}: #{e}"
+ end
+end
+
+class TripleParser
+ attr_reader :ntriple, :subject, :predicate, :data_type, :language, :literal
+ attr_accessor :object
+ def initialize(line)
+ @ntriple = line
+ parse_ntriple
+ end
+
+ def parse_ntriple
+ scanner = StringScanner.new(@ntriple)
+ @subject = scanner.scan_until(/> /)
+ @subject.sub!(/^</,'')
+ @subject.sub!(/> $/,'')
+ @predicate = scanner.scan_until(/> /)
+ @predicate.sub!(/^</,'')
+ @predicate.sub!(/> $/,'')
+ if scanner.match?(/</)
+ @object = scanner.scan_until(/>\s?\.\n/)
+ @object.sub!(/^</,'')
+ @object.sub!(/>\s?\.\n/,'')
+ @literal = false
+ else
+ @literal = true
+ scanner.getch
+ @object = scanner.scan_until(/("\s?\.\n)|("@[A-z])|("\^\^)/)
+ scanner.pos=(scanner.pos-2)
+ @object.sub!(/"..$/,'')
+ uscan = UTF8Parser.new(@object)
+ @object = uscan.parse_string
+ if scanner.match?(/@/)
+ scanner.getch
+ @language = scanner.scan_until(/\s?\.\n/)
+ @language.sub!(/\s?\.\n/,'')
+ elsif scanner.match?(/\^\^/)
+ scanner.skip_until(/</)
+ @data_type = scanner.scan_until(/>/)
+ @data_type.sub!(/>$/,'')
+ end
+ end
+ end
+end
+
+
+
+file = File.open("/Users/rosssinger/Downloads/20090604_164455.nt",'r').readlines
+
+def lcsh_to_platform_uri(uri)
+ platform_uri = URI.parse(uri)
+ platform_uri.host = 'lcsubjects.org'
+ platform_uri.path.sub!(/\/authorities\//,"/subjects/")
+ return platform_uri.to_s
+end
+Label.auto_migrate!
+puts "#{file.length} total triples"
+
+file.each do | triple |
+ parser = TripleParser.new(triple)
+ new_uri = lcsh_to_platform_uri(parser.subject)
+ next unless ["http://www.w3.org/2004/02/skos/core#prefLabel", "http://www.w3.org/2004/02/skos/core#altLabel", "http://www.w3.org/2004/02/skos/core#hiddenLabel"].index(parser.predicate)
+ label = Label.new(:uri=>new_uri,:label=>parser.object)
+ label.save
+end
+
+
+
117 rdf_resource.rb
@@ -0,0 +1,117 @@
+require 'uri'
+require 'builder'
+require 'date'
+require 'curies'
+class RDFResource
+ attr_reader :uri, :namespaces, :modifiers
+ def initialize(uri)
+ Curie.add_prefixes! :frbr=>"http://vocab.org/frbr/core#", :dct=>"http://purl.org/dc/terms/", :bibo=>"http://purl.org/ontology/bibo/",
+ :skos=>"http://www.w3.org/2004/02/skos/core#", :rda=>"http://RDVocab.info/Elements/", :cat=>"http://schema.talis.com/2009/catalontology/",
+ :rdfs=>"http://www.w3.org/2000/01/rdf-schema#", :ov=>"http://open.vocab.org/terms/", :event=>"http://purl.org/NET/c4dm/event.owl#"
+ @uri = Curie.parse uri
+ @namespaces = ['http://www.w3.org/1999/02/22-rdf-syntax-ns#']
+
+ @modifiers = {}
+ end
+
+ def assert(predicate, object, type=nil, lang=nil)
+ uri = URI.parse(Curie.parse predicate)
+ ns = nil
+ elem = nil
+ if uri.fragment
+ ns, elem = uri.to_s.split('#')
+ ns << '#'
+ else
+ elem = uri.path.split('/').last
+ ns = uri.to_s.sub(/#{elem}$/, '')
+ end
+ attr_name = ''
+ if i = @namespaces.index(ns)
+ attr_name = "n#{i}_#{elem}"
+ else
+ @namespaces << ns
+ attr_name = "n#{@namespaces.index(ns)}_#{elem}"
+ end
+ unless type
+ val = object
+ else
+ @modifiers[object.object_id] ||={}
+ @modifiers[object.object_id][:type] = type
+ val = case type
+ when 'http://www.w3.org/2001/XMLSchema#dateTime' then DateTime.parse(object)
+ when 'http://www.w3.org/2001/XMLSchema#date' then Date.parse(object)
+ when 'http://www.w3.org/2001/XMLSchema#int' then object.to_i
+ when 'http://www.w3.org/2001/XMLSchema#string' then object.to_s
+ when 'http://www.w3.org/2001/XMLSchema#boolean'
+ if object.downcase == 'true' || object == '1'
+ true
+ else
+ false
+ end
+ else
+ object
+ end
+ end
+ if lang
+ @modifiers[object.object_id] ||={}
+ @modifiers[val.object_id][:language] = lang
+ end
+ if self.instance_variable_defined?("@#{attr_name}")
+ unless self.instance_variable_get("@#{attr_name}").is_a?(Array)
+ att = self.instance_variable_get("@#{attr_name}")
+ self.instance_variable_set("@#{attr_name}", [att])
+ end
+ self.instance_variable_get("@#{attr_name}") << val
+ else
+ self.instance_variable_set("@#{attr_name}", val)
+ end
+ end
+
+ def relate(predicate, resource)
+ self.assert(predicate, self.class.new(resource))
+ end
+
+ def to_rdfxml
+ doc = Builder::XmlMarkup.new
+ xmlns = {}
+ i = 1
+ @namespaces.each do | ns |
+ next if ns == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+ xmlns["xmlns:n#{i}"] = ns
+ i += 1
+ end
+ doc.rdf :Description,xmlns.merge({:about=>uri}) do | rdf |
+ self.instance_variables.each do | ivar |
+ next unless ivar =~ /^@n[0-9]*_/
+ prefix, tag = ivar.split('_',2)
+ attrs = {}
+ curr_attr = self.instance_variable_get("#{ivar}")
+ prefix.sub!(/^@/,'')
+ prefix = 'rdf' if prefix == 'n0'
+ unless curr_attr.is_a?(Array)
+ curr_attr = [curr_attr]
+ end
+ curr_attr.each do | val |
+ if val.is_a?(RDFResource)
+ attrs['rdf:resource'] = val.uri
+ end
+ if @modifiers[val.object_id]
+ if @modifiers[val.object_id][:language]
+ attrs['xml:lang'] = @modifiers[val.object_id][:language]
+ end
+ if @modifiers[val.object_id][:type]
+ attrs['rdf:datatype'] = @modifiers[val.object_id][:type]
+ end
+ end
+ unless attrs['rdf:resource']
+ rdf.tag!("#{prefix}:#{tag}", attrs, val)
+ else
+ rdf.tag!("#{prefix}:#{tag}", attrs)
+ end
+ end
+ end
+ end
+ doc.target!
+ end
+
+end

0 comments on commit 19b9f40

Please sign in to comment.