Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support mapping from classification records #52

Merged
merged 1 commit into from
Jun 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions examples/rvk-gnd-mapping.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://rvk.uni-regensburg.de/nt/AA_09900> a skos:Concept ;
dcterms:created "2012-07-05"^^xsd:date ;
dcterms:identifier "3:" ;
dcterms:modified "2018-03-16"^^xsd:date ;
skos:altLabel "Bibliografie"@de,
"Zeitschrift"@de ;
skos:broader <http://rvk.uni-regensburg.de/nt/AA> ;
skos:closeMatch <http://d-nb.info/gnd/4006432-3>,
<http://d-nb.info/gnd/4067488-5> ;
skos:editorialNote "Erläuterungen zur Notationsvergabe s. RVK-Online - Nutzunghinweise"@de ;
skos:inScheme <http://rvk.uni-regensburg.de/nt/> ;
skos:notation "AA 09900" ;
skos:prefLabel "Bibliographische Zeitschriften"@de .

43 changes: 43 additions & 0 deletions examples/rvk-gnd-mapping.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<collection
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
xmlns="http://www.loc.gov/MARC21/slim">
<record>
<leader> nw a22 o 4500</leader>
<controlfield tag="001">3:</controlfield>
<controlfield tag="003">DE-625</controlfield>
<controlfield tag="005">201803160839.2</controlfield>
<controlfield tag="008">120705an|aznnaabbn | anc |c</controlfield>
<datafield tag="040" ind1=" " ind2=" ">
<subfield code="a">DE-625</subfield>
<subfield code="b">ger</subfield>
<subfield code="c">DE-625</subfield>
<subfield code="d">DE-625</subfield>
</datafield>
<datafield tag="084" ind1="0" ind2=" ">
<subfield code="a">rvk</subfield>
</datafield>
<datafield tag="153" ind1=" " ind2=" ">
<subfield code="a">AA 09900</subfield>
<subfield code="j">Bibliographische Zeitschriften</subfield>
<subfield code="e">A</subfield>
<subfield code="h">Allgemeines</subfield>
<subfield code="e">AA</subfield>
<subfield code="h">Bibliographien der Bibliographien, Universalbibliographien, Bibliothekskataloge, Nationalbibliographien</subfield>
</datafield>
<datafield tag="684" ind1="1" ind2=" ">
<subfield code="i">Erläuterungen zur Notationsvergabe s. RVK-Online - Nutzunghinweise</subfield>
</datafield>
<datafield tag="750" ind1="1" ind2="7">
<subfield code="0">(DE-588)4006432-3</subfield>
<subfield code="a">Bibliografie</subfield>
<subfield code="2">gnd</subfield>
</datafield>
<datafield tag="750" ind1="1" ind2="7">
<subfield code="0">(DE-588)4067488-5</subfield>
<subfield code="a">Zeitschrift</subfield>
<subfield code="2">gnd</subfield>
</datafield>
</record>
</collection>
141 changes: 82 additions & 59 deletions mc2skos/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(self, scheme_code=None, **kwargs):
@python_2_unicode_compatible
class ConceptScheme(object):

def __init__(self, concept_type, code=None, edition=None, options=None):
def __init__(self, concept_type=None, code=None, edition=None, options=None):
self.type = concept_type
self.code = code # Can be None if URI template is specified in options
self.edition = edition
Expand All @@ -70,6 +70,14 @@ def get_config(self, concept_type, code, options):
'whitespace': options.get('whitespace', '-'),
}

if concept_type is None:
if code in CONFIG['subject_schemes']:
concept_type = AuthorityRecord
elif code in CONFIG['classification_schemes']:
concept_type = ClassificationRecord
else:
raise ValueError('Unknown concept scheme code "%s"' % code)

try:
cfg = CONFIG[{
AuthorityRecord: 'subject_schemes',
Expand Down Expand Up @@ -266,6 +274,63 @@ def parse(self, options):
def is_public(self):
    """Report whether this record is public; this implementation always returns True."""
    # NOTE(review): presumably other record classes override this -- confirm.
    return True

def append_relation(self, scheme_code, scheme_type, relation, **kwargs):
    """Append a relation to a concept in another vocabulary, if a URI can be built.

    A ``ConceptScheme`` is created from ``scheme_type``, ``scheme_code`` and
    the optional ``edition`` keyword, and asked for a URI through
    ``get_uri(**kwargs)``. When that URI is truthy, a dict with the keys
    ``'uri'`` and ``'relation'`` is appended to ``self.relations``.

    Args:
        scheme_code: Code of the target vocabulary.
        scheme_type: Record class of the target vocabulary, or None when the
            type should be looked up from ``scheme_code``.
        relation: Value stored under the ``'relation'`` key.
        **kwargs: Forwarded unchanged to ``get_uri``; ``edition`` is also
            handed to the ``ConceptScheme`` constructor.

    Returns:
        None. An unknown vocabulary is logged as a warning and skipped.
    """
    try:
        scheme = ConceptScheme(scheme_type, scheme_code, edition=kwargs.get('edition'))
        uri = scheme.get_uri(**kwargs)
    except (UnknownSchemeError, ValueError):
        # ValueError is what the scheme-type lookup raises for a code found in
        # neither scheme list when scheme_type is None. Without catching it, a
        # single 7XX field pointing at an unconfigured vocabulary would abort
        # the conversion instead of being skipped with a warning.
        # NOTE(review): this also hides any other ValueError raised while
        # building the URI -- confirm that is acceptable.
        logger.warning('Cannot generate URIs for unknown vocabulary "%s"', scheme_code)
        return

    if uri:
        self.relations.append({
            'uri': uri,
            'relation': relation,
        })

def get_mappings(self):
for heading in self.get_terms('7'):
relation = None
for sf in heading['node'].all('mx:subfield'):
if sf.get('code') == '4':
if is_uri(sf.text()):
relation = URIRef(sf.text())
else:
relation = {
'=EQ': SKOS.exactMatch,
'~EQ': SKOS.closeMatch,
'BM': SKOS.broadMatch,
'NM': SKOS.narrowMatch,
'RM': SKOS.relatedMatch,
}.get(sf.text()) # None if no match

elif sf.get('code') == '0':
# Note: Default value might change in the future
relation = relation if relation else SKOS.closeMatch
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realize this is just my own code moved to a new place, but it got me thinking about the order of subfields. This code currently requires $4 to come before $0, not after. Is that a valid assumption to make, or could it equally well be the other way around?

Copy link
Contributor Author

@nichtich nichtich Jun 20, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it could also be the other way - if subfields are not repeatable, the order should be irrelevant. On the other hand, I have not seen such a record in real life so far.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that they are repeatable. But I have also yet to see it in practice, and I'm not sure what it would mean.


if is_uri(sf.text()):
self.relations.append({
'uri': sf.text(),
'relation': relation,
})
else:
scheme_code = {
'0': 'a', # Library of Congress Subject Headings
'1': 'b', # LC subject headings for children's literature
'2': 'c', # Medical Subject Headings
'3': 'd', # National Agricultural Library subject authority file
'4': 'n', # Source not specified
'5': 'k', # Canadian Subject Headings
'6': 'v', # Répertoire de vedettes-matière
'7': heading['node'].text('mx:subfield[@code="2"]'), # Source specified in subfield $2
}.get(heading['node'].get('ind2'))

yield {
'scheme_code': scheme_code,
'relation': relation,
'control_number': sf.text()
}


class ClassificationRecord(Record):

Expand Down Expand Up @@ -422,6 +487,15 @@ def parse(self, options):
'term': heading['value']
})

# 7XX: Heading Linking Entries
for mapping in self.get_mappings():
self.append_relation(
mapping['scheme_code'],
None,
mapping['relation'],
control_number=mapping['control_number']
)

# 765 : Synthesized Number Components
for entry in reversed(list(self.record.all('mx:datafield[@tag="765"]'))):

Expand Down Expand Up @@ -635,23 +709,7 @@ def get_class_number(el):
else:
return number_start

def append_relation(self, scheme_code, scheme_type, relation, **kwargs):
    """Append a relation to a concept in another vocabulary, if a URI can be built.

    A ``ConceptScheme`` is created from ``scheme_type``, ``scheme_code`` and
    the optional ``edition`` keyword, and asked for a URI through
    ``get_uri(**kwargs)``. When that URI is truthy, a dict with the keys
    ``'uri'`` and ``'relation'`` is appended to ``self.relations``.

    Args:
        scheme_code: Code of the target vocabulary.
        scheme_type: Record class of the target vocabulary, or None when the
            type should be looked up from ``scheme_code``.
        relation: Value stored under the ``'relation'`` key.
        **kwargs: Forwarded unchanged to ``get_uri``; ``edition`` is also
            handed to the ``ConceptScheme`` constructor.

    Returns:
        None. An unknown vocabulary is logged as a warning and skipped.
    """
    try:
        scheme = ConceptScheme(scheme_type, scheme_code, edition=kwargs.get('edition'))
        uri = scheme.get_uri(**kwargs)
    except (UnknownSchemeError, ValueError):
        # ValueError is what the scheme-type lookup raises for a code found in
        # neither scheme list when scheme_type is None. Without catching it, a
        # single 7XX field pointing at an unconfigured vocabulary would abort
        # the conversion instead of being skipped with a warning.
        # NOTE(review): this also hides any other ValueError raised while
        # building the URI -- confirm that is acceptable.
        logger.warning('Cannot generate URIs for unknown vocabulary "%s"', scheme_code)
        return

    if uri:
        self.relations.append({
            'uri': uri,
            'relation': relation,
        })

def parse(self, options):

super(AuthorityRecord, self).parse(options)

# Now we have enough information to generate URIs
Expand Down Expand Up @@ -782,45 +840,10 @@ def parse(self, options):
self.historyNote.append(entry.stringify())

# 7XX: Heading Linking Entries
for heading in self.get_terms('7'):
relation = None
for sf in heading['node'].all('mx:subfield'):
if sf.get('code') == '4':
if is_uri(sf.text()):
relation = URIRef(sf.text())
else:
relation = {
'=EQ': SKOS.exactMatch,
'~EQ': SKOS.closeMatch,
'BM': SKOS.broadMatch,
'NM': SKOS.narrowMatch,
'RM': SKOS.relatedMatch,
}.get(sf.text()) # None if no match

elif sf.get('code') == '0':
# Note: Default value might change in the future
relation = relation if relation else SKOS.closeMatch

if is_uri(sf.text()):
self.relations.append({
'uri': sf.text(),
'relation': relation,
})
else:
scheme_code = {
'0': 'a', # Library of Congress Subject Headings
'1': 'b', # LC subject headings for children's literature
'2': 'c', # Medical Subject Headings
'3': 'd', # National Agricultural Library subject authority file
'4': 'n', # Source not specified
'5': 'k', # Canadian Subject Headings
'6': 'v', # Répertoire de vedettes-matière
'7': heading['node'].text('mx:subfield[@code="2"]'), # Source specified in subfield $2
}.get(heading['node'].get('ind2'))

self.append_relation(
scheme_code,
AuthorityRecord,
relation,
control_number=sf.text()
)
for mapping in self.get_mappings():
self.append_relation(
mapping['scheme_code'],
None,
mapping['relation'],
control_number=mapping['control_number']
)
7 changes: 7 additions & 0 deletions mc2skos/vocabularies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,10 @@ subject_schemes:
noubomr:
concept: http://data.ub.uio.no/mrtermer/c{control_number[3:]}
scheme: http://data.ub.uio.no/mrtermer/
gnd:
concept: http://d-nb.info/gnd/{control_number}
scheme: http://d-nb.info/gnd/
ddcri:
scheme: http://id.loc.gov/vocabulary/subjectSchemes/ddcri
TESA:
scheme: http://lod.nal.usda.gov/nalt/
2 changes: 1 addition & 1 deletion test/test_process_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_bk_asb_example(marc, match):
check_processing(marc, expect, include_altlabels=True)


@pytest.mark.parametrize('marc,match', examples('rvk'))
@pytest.mark.parametrize('marc,match', examples('rvk(-.*)?'))
def test_rvk_example(marc, match):

options = {
Expand Down