From 4ee54181315305f69426fe46aa2088c369c33d97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Se=CC=81bastien=20De=CC=81le=CC=80ze?= Date: Mon, 29 Mar 2021 16:42:50 +0200 Subject: [PATCH] documents: harvest records from IRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Harvests records from ArODES. * Harvests records from Zora. * Harvests records from edoc. * Closes #487. Co-Authored-by: Sébastien Délèze --- data/oai_sources.json | 24 + .../documents/dojson/arodes/__init__.py | 18 + .../modules/documents/dojson/arodes/model.py | 343 ++++++ .../modules/documents/dojson/zora/__init__.py | 18 + sonar/modules/documents/dojson/zora/model.py | 328 ++++++ .../documents/loaders/schemas/arodes.py | 45 + sonar/modules/documents/loaders/schemas/dc.py | 241 +++++ .../modules/documents/loaders/schemas/edoc.py | 24 + .../documents/loaders/schemas/factory.py | 8 +- .../modules/documents/loaders/schemas/zora.py | 43 + sonar/modules/documents/receivers.py | 6 +- .../documents/loaders/test_arodes_loader.py | 981 ++++++++++++++++++ .../documents/loaders/test_edoc_loader.py | 542 ++++++++++ .../documents/loaders/test_zora_loader.py | 712 +++++++++++++ 14 files changed, 3330 insertions(+), 3 deletions(-) create mode 100644 sonar/modules/documents/dojson/arodes/__init__.py create mode 100644 sonar/modules/documents/dojson/arodes/model.py create mode 100644 sonar/modules/documents/dojson/zora/__init__.py create mode 100644 sonar/modules/documents/dojson/zora/model.py create mode 100644 sonar/modules/documents/loaders/schemas/arodes.py create mode 100644 sonar/modules/documents/loaders/schemas/dc.py create mode 100644 sonar/modules/documents/loaders/schemas/edoc.py create mode 100644 sonar/modules/documents/loaders/schemas/zora.py create mode 100644 tests/unit/documents/loaders/test_arodes_loader.py create mode 100644 tests/unit/documents/loaders/test_edoc_loader.py create mode 100644 tests/unit/documents/loaders/test_zora_loader.py diff --git 
a/data/oai_sources.json b/data/oai_sources.json index 228fa7372..4744c5013 100644 --- a/data/oai_sources.json +++ b/data/oai_sources.json @@ -22,5 +22,29 @@ "metadataprefix": "oai_openaire", "comment": "", "setspecs": "" + }, + { + "key": "arodes", + "name": "ArODES", + "url": "https://hesso.tind.io/oai2d", + "metadataprefix": "marcxml", + "comment": "", + "setspecs": "" + }, + { + "key": "zora", + "name": "Zora", + "url": "https://www.zora.uzh.ch/cgi/oai2", + "metadataprefix": "marc21", + "comment": "", + "setspecs": "" + }, + { + "key": "edoc", + "name": "edoc", + "url": "https://edoc.unibas.ch/cgi/oai2", + "metadataprefix": "oai_dc", + "comment": "", + "setspecs": "" } ] diff --git a/sonar/modules/documents/dojson/arodes/__init__.py b/sonar/modules/documents/dojson/arodes/__init__.py new file mode 100644 index 000000000..c55a39c2e --- /dev/null +++ b/sonar/modules/documents/dojson/arodes/__init__.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +"""DOJSON transformation for ArODES.""" diff --git a/sonar/modules/documents/dojson/arodes/model.py b/sonar/modules/documents/dojson/arodes/model.py new file mode 100644 index 000000000..7bc89b158 --- /dev/null +++ b/sonar/modules/documents/dojson/arodes/model.py @@ -0,0 +1,343 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""DOJSON transformation for ArODES.""" + +import re + +from dojson import utils + +from sonar.modules.documents.dojson.overdo import Overdo + +overdo = Overdo() + +TYPE_MAPPINGS = { + 'livre': 'coar:c_2f33', + 'chapitre': 'coar:c_3248', + 'conference': 'coar:c_5794', + 'scientifique': 'coar:c_6501', + 'professionnel': 'coar:c_3e5a', + 'rapport': 'coar:c_18ws', + 'THESES': 'coar:c_db06', + 'other': 'coar:c_1843' +} + +OA_STATUS = ['green', 'gold', 'hybrid', 'bronze', 'closed'] + + +@overdo.over('identifiedBy', '001') +@utils.ignore_value +def identified_by_from_001(self, key, value): + """Get identifier from field 001.""" + identified_by = self.get('identifiedBy', []) + + identified_by.append({ + 'type': 'bf:Local', + 'source': 'ArODES', + 'value': value + }) + + return identified_by + + +@overdo.over('identifiedBy', '^0247.') +@utils.ignore_value +def identified_by_from_024(self, key, value): + """Get identifier from field 024.""" + identified_by = self.get('identifiedBy', []) + + if not value.get('a') or not 
value.get('2') in ['DOI', 'PMID']: + return None + + if value.get('2') == 'DOI': + identified_by.append({'type': 'bf:Doi', 'value': value.get('a')}) + + return identified_by + + +@overdo.over('title', '^245..') +@utils.for_each_value +@utils.ignore_value +def title_from_245(self, key, value): + """Get title from field 245.""" + main_title = value.get('a', 'No title found') + subtitle = value.get('b') + language = value.get('9', 'eng') + + title = { + 'type': 'bf:Title', + 'mainTitle': [{ + 'value': main_title, + 'language': language + }] + } + + if subtitle: + title['subtitle'] = [{'value': subtitle, 'language': language}] + + return title + + +@overdo.over('documentType', '^980') +@utils.ignore_value +def document_type_from_980(self, key, value): + """Get document type from 980 field.""" + document_type = value.get('a', None) + + if self.get('documentType') or not document_type: + return None + + if document_type not in TYPE_MAPPINGS: + document_type = 'other' + + return TYPE_MAPPINGS[document_type] + + +@overdo.over('language', '^041') +@utils.for_each_value +@utils.ignore_value +def language_from_041(self, key, value): + """Get languages.""" + if not value.get('a'): + return None + + language = self.get('language', []) + + codes = utils.force_list(value.get('a')) + + for code in codes: + language.append({'type': 'bf:Language', 'value': code}) + + self['language'] = language + + return None + + +@overdo.over('abstracts', '^520..') +@utils.for_each_value +@utils.ignore_value +def abstract_from_520(self, key, value): + """Get abstract.""" + abstract = value.get('a') + language = value.get('9', 'eng') + + if not abstract: + return None + + abstracts_data = self.get('abstracts', []) + abstracts_data.append({'value': abstract, 'language': language}) + + self['abstracts'] = abstracts_data + + return None + + +@overdo.over('oa_status', '^906..') +@utils.ignore_value +def oa_status_from_906(self, key, value): + """Get abstract.""" + oa_status = value.get('a', 
'none').lower() + + if not oa_status or oa_status not in OA_STATUS: + return None + + return oa_status + + +@overdo.over('date', '^269..') +@utils.ignore_value +def date_from_269(self, key, value): + """Get date from field 269.""" + # No date, skipping + if not value.get('a'): + return None + + # Assign start date + match = re.search(r'^[0-9]{4}-[0-9]{2}$', value.get('a')) + + # Date does not match "YYYY" or "YYYY-MM-DD" + if not match: + return None + + add_provision_activity_start_date(self, value.get('a') + '-01') + + return None + + +@overdo.over('date', '^260..') +@utils.ignore_value +def date_from_260(self, key, value): + """Get date from field 260.""" + # No date, skipping + if not value.get('c'): + return None + + # Assign start date + match = re.search(r'^[0-9]{4}-[0-9]{2}$', value.get('c')) + + # Date does not match "YYYY" or "YYYY-MM-DD" + if not match: + return None + + add_provision_activity_start_date(self, value.get('c') + '-01') + + return None + + +@overdo.over('subjects', '^653..') +@utils.for_each_value +@utils.ignore_value +def subjects_from_653(self, key, value): + """Get abstract.""" + subject = value.get('a') + language = value.get('9', 'eng') + + if not subject: + return None + + subject_data = get_subject_for_language(self, language) + subject_data['label']['value'].append(subject) + + return None + + +@overdo.over('dissertation', '^502..') +@utils.ignore_value +def dissertation_from_field_502(self, key, value): + """Extract dissertation degree.""" + if not value.get('b'): + return None + + return {'degree': value.get('b')} + + +@overdo.over('partOf', '^773..') +@utils.ignore_value +def host_document_from_field_773(self, key, value): + """Host document.""" + if not value.get('t'): + return None + + part_of = {'document': {'title': value.get('t')}} + + if not value.get('g'): + if self.get('provisionActivity'): + match = re.search(r'^(\d{4})', + self['provisionActivity'][0]['startDate']) + part_of['numberingYear'] = match.group(1) + else: + # 
Year + match = re.search(r'^(\d{4})', value.get('g')) + if match: + part_of['numberingYear'] = match.group(1) + + # Volume + match = re.search(r'vol\.\s(\d+)', value.get('g')) + if match: + part_of['numberingVolume'] = match.group(1) + + # Issue + match = re.search(r'no\.\s(\d+)', value.get('g')) + if match: + part_of['numberingIssue'] = match.group(1) + + # Pages + match = re.search(r'pp\.\s([0-9\-–]+)', value.get('g')) + if match: + part_of['numberingPages'] = match.group(1) + + if not part_of.get('numberingYear'): + return None + + return [part_of] + + +@overdo.over('contribution', '^700..') +@utils.for_each_value +@utils.ignore_value +def contribution_from_700(self, key, value): + """Get contribution.""" + name = value.get('a') + affiliation = value.get('u') + + if not name: + return None + + contribution = { + 'agent': { + 'type': 'bf:Person', + 'preferred_name': name + }, + 'role': ['ctb'] + } + + if affiliation: + contribution['affiliation'] = affiliation + + return contribution + + +def add_provision_activity_start_date(data, date): + """Add start date for provision activity. + + :param data: Data dictionary. + :param date: Date to add. + """ + provisition_activity = data.get('provisionActivity', []) + + def get_publication(): + """Get stored publication.""" + for key, item in enumerate(provisition_activity): + if item['type'] == 'bf:Publication': + return provisition_activity.pop(key) + + return {'type': 'bf:Publication', 'startDate': None} + + publication = get_publication() + + publication['startDate'] = date + + # Inject publiction into provision activity + provisition_activity.append(publication) + + # Re-assign provisionActivity + data['provisionActivity'] = provisition_activity + + +def get_subject_for_language(data, language): + """Return the subject item corresponding to language. 
+ + :param dict data: Overdo data + :param str language: Language code + :returns: Subject object + :rtype: Dict + """ + if not data.get('subjects'): + data['subjects'] = [] + + subjects = [ + subject for subject in data.get('subjects', []) + if subject['label']['language'] == language + ] + + # Create an empty subject + if not subjects: + subject = {'label': {'language': language, 'value': []}} + data['subjects'].append(subject) + return subject + + return subjects[0] diff --git a/sonar/modules/documents/dojson/zora/__init__.py b/sonar/modules/documents/dojson/zora/__init__.py new file mode 100644 index 000000000..1e768442e --- /dev/null +++ b/sonar/modules/documents/dojson/zora/__init__.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""DOJSON transformation for ZORA.""" diff --git a/sonar/modules/documents/dojson/zora/model.py b/sonar/modules/documents/dojson/zora/model.py new file mode 100644 index 000000000..082a61753 --- /dev/null +++ b/sonar/modules/documents/dojson/zora/model.py @@ -0,0 +1,328 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""DOJSON transformation for ZORA.""" + +import re + +from dojson import utils + +from sonar.modules.documents.dojson.overdo import Overdo + +overdo = Overdo() + + +@overdo.over('identifiedBy', '001') +@utils.ignore_value +def identified_by_from_001(self, key, value): + """Get identifier from field 001.""" + identified_by = self.get('identifiedBy', []) + + identified_by.append({ + 'type': 'bf:Local', + 'source': 'ZORA', + 'value': value + }) + + return identified_by + + +@overdo.over('identifiedBy', '^0247.') +@utils.ignore_value +def identified_by_from_024(self, key, value): + """Get identifier from field 024.""" + identified_by = self.get('identifiedBy', []) + + if not value.get('a'): + return None + + if value.get('2') == 'doi': + identified_by.append({'type': 'bf:Doi', 'value': value.get('a')}) + elif value.get('2') == 'pmid': + identified_by.append({ + 'type': 'bf:Local', + 'value': value.get('a'), + 'source': 'PMID' + }) + else: + identified_by.append({ + 'type': 'bf:Identifier', + 'value': value.get('a') + }) + + return identified_by + + +@overdo.over('title', '^245..') +@utils.for_each_value +@utils.ignore_value +def title_from_245(self, key, value): + """Get title from field 245.""" + main_title = value.get('a', 'No title found') + subtitle = value.get('b') + language = value.get('9', 'eng') + + title = { + 'type': 'bf:Title', + 'mainTitle': [{ + 'value': main_title, + 'language': language + }] + } + + if subtitle: + title['subtitle'] = [{'value': subtitle, 'language': language}] + + return title + + +@overdo.over('documentType', '^655') +@utils.ignore_value +def 
document_type_from_655(self, key, value): + """Get document type from 655 field.""" + type = value.get('2') + value = value.get('a') + + if self.get('documentType') or not value or not type: + return None + + record = overdo.blob_record + + # Book + if type == 'local' and value == 'Herausgegebenes wissenschaftliches Werk': + return 'coar:c_2f33' + + if type == 'local' and value == 'Monografie': + return 'coar:c_2f33' + + # Book part + if type == 'local' and value == 'Buchkapitel': + return 'coar:c_3248' + + # Conference paper + if type == 'local' and value == 'Konferenzbeitrag': + return 'coar:c_5794' + + # Journal article + if type == 'local' and value == 'Artikel': + return 'coar:c_6501' + + # Newspaper article + if type == 'local' and value == 'Zeitungsartikel': + return 'coar:c_998f' + + # Research report + if type == 'gnd-content' and value == 'Forschungsbericht': + return 'coar:c_18ws' + + # Doctoral thesis + if type == 'gnd-content' and value == 'Hochschulschrift' and record.get( + '502__', {}).get('b') == 'Dissertation': + return 'coar:c_db06' + + # Master thesis + if type == 'gnd-content' and value == 'Hochschulschrift' and record.get( + '502__', {}).get('b') == 'Masterarbeit': + return 'coar:c_bdcc' + + # Habilitation thesis + if type == 'gnd-content' and value == 'Hochschulschrift' and record.get( + '502__', {}).get('b') == 'Habilitation': + return 'habilitation_thesis' + + # Working paper + if type == 'local' and value == 'Working Paper': + return 'coar:c_8042' + + return 'coar:c_1843' + + +@overdo.over('language', '^041') +@utils.for_each_value +@utils.ignore_value +def language_from_041(self, key, value): + """Get languages.""" + if not value.get('a'): + return None + + language = self.get('language', []) + + codes = utils.force_list(value.get('a')) + + for code in codes: + language.append({'type': 'bf:Language', 'value': code}) + + self['language'] = language + + return None + + +@overdo.over('abstracts', '^520..') +@utils.for_each_value 
+@utils.ignore_value +def abstract_from_520(self, key, value): + """Get abstract.""" + abstract = value.get('a') + language = value.get('9', 'eng') + + if not abstract: + return None + + abstracts_data = self.get('abstracts', []) + abstracts_data.append({'value': abstract, 'language': language}) + + self['abstracts'] = abstracts_data + + return None + + +@overdo.over('date', '^264..') +@utils.ignore_value +def date_from_264(self, key, value): + """Get date from field 264.""" + # No date, skipping + if not value.get('c'): + return None + + # Assign start date + match = re.search(r'^[0-9]{4}$', value.get('c')) + + # Date does not match "YYYY" or "YYYY-MM-DD" + if not match: + return None + + add_provision_activity_start_date(self, value.get('c')) + + return None + + +@overdo.over('dissertation', '^502..') +@utils.ignore_value +def dissertation_from_field_502(self, key, value): + """Extract dissertation degree.""" + if not value.get('b'): + return None + + dissertation = {'degree': value.get('b')} + + if value.get('c'): + dissertation['grantingInstitution'] = value.get('c') + + if value.get('d'): + dissertation['date'] = value.get('d') + + return dissertation + + +@overdo.over('partOf', '^773..') +@utils.ignore_value +def host_document_from_field_773(self, key, value): + """Host document.""" + if not value.get('t'): + return None + + part_of = {'document': {'title': value.get('t')}} + + if not value.get('g'): + if self.get('provisionActivity'): + match = re.search(r'^(\d{4})', + self['provisionActivity'][0]['startDate']) + part_of['numberingYear'] = match.group(1) + else: + # Year + match = re.search(r'\((\d{4})\)$', value.get('g')) + if match: + part_of['numberingYear'] = match.group(1) + + # Volume + match = re.search(r'Bd\.\s(\d+)', value.get('g')) + if match: + part_of['numberingVolume'] = match.group(1) + + # Issue + match = re.search(r'Nr\.\s(\d+)', value.get('g')) + if match: + part_of['numberingIssue'] = match.group(1) + + # Pages + match = 
re.search(r'S\.\s(.+)\s\(', value.get('g')) + if match: + part_of['numberingPages'] = match.group(1) + + if not part_of.get('numberingYear'): + return None + + return [part_of] + + +@overdo.over('contribution', '^[17]00..') +@utils.ignore_value +def contribution_from_field_100_700(self, key, value): + """Extract contribution from field 100 or 700.""" + if not value.get('a'): + return None + + contribution = self.get('contribution', []) + + data = { + 'agent': { + 'type': 'bf:Person', + 'preferred_name': value.get('a') + }, + 'role': ['cre' if value.get('4') == 'aut' else value.get('4')] + } + + if value.get('0'): + match = re.search(r'^\(orcid\)(.*)$', value.get('0')) + if match: + data['agent']['identifiedBy'] = { + 'type': 'bf:Local', + 'source': 'ORCID', + 'value': match.group(1) + } + + contribution.append(data) + self['contribution'] = contribution + + return None + + +def add_provision_activity_start_date(data, date): + """Add start date for provision activity. + + :param data: Data dictionary. + :param date: Date to add. 
+ """ + provisition_activity = data.get('provisionActivity', []) + + def get_publication(): + """Get stored publication.""" + for key, item in enumerate(provisition_activity): + if item['type'] == 'bf:Publication': + return provisition_activity.pop(key) + + return {'type': 'bf:Publication', 'startDate': None} + + publication = get_publication() + + publication['startDate'] = date + + # Inject publiction into provision activity + provisition_activity.append(publication) + + # Re-assign provisionActivity + data['provisionActivity'] = provisition_activity diff --git a/sonar/modules/documents/loaders/schemas/arodes.py b/sonar/modules/documents/loaders/schemas/arodes.py new file mode 100644 index 000000000..5964fa6d3 --- /dev/null +++ b/sonar/modules/documents/loaders/schemas/arodes.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +"""Arodes schema.""" + +from marshmallow import fields, pre_dump + +from sonar.modules.documents.dojson.arodes.model import overdo + +from .marc21 import Marc21Schema + + +class ArodesSchema(Marc21Schema): + """Arodes schema.""" + + identifiedBy = fields.List(fields.Dict()) + title = fields.List(fields.Dict()) + documentType = fields.Str() + language = fields.List(fields.Dict()) + abstracts = fields.List(fields.Dict()) + oa_status = fields.Str() + provisionActivity = fields.List(fields.Dict()) + subjects = fields.List(fields.Dict()) + dissertation = fields.Dict() + partOf = fields.List(fields.Dict()) + contribution = fields.List(fields.Dict()) + + @pre_dump + def process(self, obj, **kwargs): + """All the process is done by overdo.""" + return overdo.do(obj) diff --git a/sonar/modules/documents/loaders/schemas/dc.py b/sonar/modules/documents/loaders/schemas/dc.py new file mode 100644 index 000000000..9eda82c20 --- /dev/null +++ b/sonar/modules/documents/loaders/schemas/dc.py @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +"""Dublin core schema.""" + +import re + +import xmltodict +from marshmallow import Schema, fields, post_dump, pre_dump + +from sonar.modules.pdf_extractor.utils import force_list + +TYPE_MAPPINGS = { + 'Book': 'coar:c_2f33', + 'Book Section': 'coar:c_3248', + 'Conference': 'coar:c_c94f', + 'Workshop Item': 'coar:c_c94f', + 'Research Data': 'coar:c_ddb1', + 'Article': 'coar:c_6501', + 'Newspaper': 'coar:c_998f', + 'Magazine Article': 'coar:c_998f', + 'Audiovisual Material & Event': 'non_textual_object', + 'Preprint': 'coar:c_816b', + 'Thesis': 'coar:c_db06', + 'Working Paper': 'coar:c_8042', + 'Other': 'coar:c_1843' +} + + +class DublinCoreSchema(Schema): + """Dublin Core marshmallow schema.""" + + identifiedBy = fields.Method('get_identifiers') + language = fields.Method('get_language') + title = fields.Method('get_title') + provisionActivity = fields.Method('get_provision_activity') + documentType = fields.Method('get_document_type') + abstracts = fields.Method('get_abstracts') + subjects = fields.Method('get_subjects') + contribution = fields.Method('get_contribution') + + def dump(self, obj): + """Serialize an object to native Python data types. + + :param obj: The object to serialize. 
+ :returns: Serialized data + """ + result = xmltodict.parse(obj) + + if not result.get('record', {}).get('metadata', {}).get('oai_dc:dc'): + return None + + record = result['record']['metadata']['oai_dc:dc'] + record['id'] = result['record']['header']['identifier'] + + return super().dump(record) + + @pre_dump + def store_language(self, item, **kwargs): + """Store language.""" + item['languages'] = [] + + for language in force_list(item.get('dc:language', [])): + if language == 'deu': + language = 'ger' + + if language == 'fra': + language = 'fre' + + item['languages'].append(language) + + if not item['languages']: + item['languages'] = ['eng'] + + return item + + @post_dump + def remove_empty_values(self, data, **kwargs): + """Remove empty values before dumping data.""" + return {key: value for key, value in data.items() if value} + + def get_identifiers(self, obj): + """Get identifiers.""" + identifiers = [{ + 'type': 'bf:Local', + 'source': 'edoc', + 'value': obj['id'] + }] + + if not obj.get('dc:identifier'): + return identifiers + + for identifier in force_list(obj['dc:identifier']): + # DOI + match = re.search(r'^info:doi\/(.+)$', identifier) + if match: + identifiers.append({'type': 'bf:Doi', 'value': match.group(1)}) + continue + + # PMID + match = re.search(r'^info:pmid\/(.+)$', identifier) + if match: + identifiers.append({ + 'type': 'bf:Local', + 'value': match.group(1), + 'source': 'PMID' + }) + continue + + # URN + match = re.search(r'^urn:(.+)$', identifier) + if match: + identifiers.append({'type': 'bf:Urn', 'value': match.group(1)}) + continue + + # Other identifier + identifiers.append({'type': 'bf:Identifier', 'value': identifier}) + + return identifiers + + def get_language(self, obj): + """Get language.""" + return [{ + 'type': 'bf:Language', + 'value': item + } for item in obj['languages']] + + def get_title(self, obj): + """Get title.""" + title = 'Default title' + subtitle = None + + if obj.get('dc:title'): + # Title + subtitle + match = 
re.search(r'^(.+)\s:\s(.+)$', obj['dc:title']) + if match: + title = match.group(1) + subtitle = match.group(2) + else: + title = obj.get('dc:title') + + title = { + 'type': 'bf:Title', + 'mainTitle': [{ + 'value': title, + 'language': obj['languages'][0] + }] + } + + if subtitle: + title['subtitle'] = [{ + 'value': subtitle, + 'language': obj['languages'][0] + }] + + return [title] + + def get_provision_activity(self, obj): + """Get provisition activity.""" + if not obj.get('dc:date'): + return None + + match = re.search(r'^[0-9]{4}$', obj['dc:date']) + + if not match: + return None + + return [{'type': 'bf:Publication', 'startDate': obj['dc:date']}] + + def get_document_type(self, obj): + """Get document type.""" + for type in force_list(obj.get('dc:type', [])): + if TYPE_MAPPINGS.get(type): + return TYPE_MAPPINGS[type] + + return TYPE_MAPPINGS['Other'] + + def get_abstracts(self, obj): + """Get abstracts.""" + if not obj.get('dc:description'): + return None + + return [{ + 'language': obj['languages'][0], + 'value': obj['dc:description'] + }] + + def get_subjects(self, obj): + """Get subjects.""" + if not obj.get('dc:subject'): + return [] + + subjects = [] + + for subject in force_list(obj.get('dc:subject', [])): + subjects.append(subject) + + return [{ + 'label': { + 'language': obj['languages'][0], + 'value': subjects + } + }] + + def get_contribution(self, obj): + """Get contribution.""" + contributors = [] + + for creator in force_list(obj.get('dc:creator', [])): + contributors.append({ + 'agent': { + 'type': 'bf:Person', + 'preferred_name': creator + }, + 'role': ['cre'] + }) + + for contributor in force_list(obj.get('dc:contributor', [])): + contributors.append({ + 'agent': { + 'type': 'bf:Person', + 'preferred_name': contributor + }, + 'role': ['ctb'] + }) + + return contributors diff --git a/sonar/modules/documents/loaders/schemas/edoc.py b/sonar/modules/documents/loaders/schemas/edoc.py new file mode 100644 index 000000000..f54413d24 --- /dev/null +++ 
b/sonar/modules/documents/loaders/schemas/edoc.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""Edoc schema.""" + +from .dc import DublinCoreSchema + + +class EdocSchema(DublinCoreSchema): + """Edoc marshmallow schema.""" diff --git a/sonar/modules/documents/loaders/schemas/factory.py b/sonar/modules/documents/loaders/schemas/factory.py index b9289484c..083b48d4b 100644 --- a/sonar/modules/documents/loaders/schemas/factory.py +++ b/sonar/modules/documents/loaders/schemas/factory.py @@ -18,8 +18,11 @@ """Factory for creating a loader schema.""" from .archive_ouverte_unige import ArchiveOuverteUnigeSchema +from .arodes import ArodesSchema from .boris import BorisSchema +from .edoc import EdocSchema from .rerodoc import RerodocSchema +from .zora import ZoraSchema class LoaderSchemaFactory(): @@ -28,7 +31,10 @@ class LoaderSchemaFactory(): schemas = { 'rerodoc': RerodocSchema, 'archive_ouverte_unige': ArchiveOuverteUnigeSchema, - 'boris': BorisSchema + 'boris': BorisSchema, + 'arodes': ArodesSchema, + 'zora': ZoraSchema, + 'edoc': EdocSchema } @staticmethod diff --git a/sonar/modules/documents/loaders/schemas/zora.py b/sonar/modules/documents/loaders/schemas/zora.py new file mode 100644 index 000000000..f48abfcb1 --- /dev/null +++ b/sonar/modules/documents/loaders/schemas/zora.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +# +# 
Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""ZORA schema.""" + +from marshmallow import fields, pre_dump + +from sonar.modules.documents.dojson.zora.model import overdo + +from .marc21 import Marc21Schema + + +class ZoraSchema(Marc21Schema): + """Zora schema.""" + + identifiedBy = fields.List(fields.Dict()) + title = fields.List(fields.Dict()) + documentType = fields.Str() + language = fields.List(fields.Dict()) + abstracts = fields.List(fields.Dict()) + provisionActivity = fields.List(fields.Dict()) + dissertation = fields.Dict() + partOf = fields.List(fields.Dict()) + contribution = fields.List(fields.Dict()) + + @pre_dump + def process(self, obj, **kwargs): + """All the process is done by overdo.""" + return overdo.do(obj) diff --git a/sonar/modules/documents/receivers.py b/sonar/modules/documents/receivers.py index a3f1572ad..8c689a889 100644 --- a/sonar/modules/documents/receivers.py +++ b/sonar/modules/documents/receivers.py @@ -73,8 +73,10 @@ def transform_harvested_records(sender=None, records=None, **kwargs): # Convert from Marc XML to JSON data = loader_schema.dump(str(harvested_record)) - # Add transformed data to list - records.append(data) + # Avoid to import deleted records + if data and data.get('title'): + # Add transformed data to list + records.append(data) # Chunk record list and send celery task for chunk in list(chunks(records, CHUNK_SIZE)): diff 
--git a/tests/unit/documents/loaders/test_arodes_loader.py b/tests/unit/documents/loaders/test_arodes_loader.py new file mode 100644 index 000000000..1aea5e943 --- /dev/null +++ b/tests/unit/documents/loaders/test_arodes_loader.py @@ -0,0 +1,981 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""Test ArODES record loader.""" + +import pytest + +from sonar.modules.documents.loaders.schemas.arodes import ArodesSchema + + +def test_title(): + """Test title.""" + xml = """ + + + + + + + + """ + assert ArodesSchema().dump(xml) == {} + + xml = """ + + + + + + Art and design as linked data : + the LODZ project (Linked Open Data Zurich) + + + + + + """ + assert ArodesSchema().dump(xml) == { + 'title': [{ + 'mainTitle': [{ + 'language': 'eng', + 'value': 'Art and design as linked data :' + }], + 'subtitle': [{ + 'language': 'eng', + 'value': 'the LODZ project (Linked Open Data Zurich)' + }], + 'type': + 'bf:Title' + }] + } + + +def test_identifiers(): + """Test identifiers.""" + xml = """ + + + + + 1972 + + DOI + 10.15291/libellarium.v9i2.256 + + + DOI + + + UNKNOWN + 1111 + + + + + + """ + assert ArodesSchema().dump(xml) == { + 'identifiedBy': [ + { + 'source': 'ArODES', + 'type': 'bf:Local', + 'value': '1972' + }, + { + 'type': 'bf:Doi', + 'value': '10.15291/libellarium.v9i2.256' + }, + ] + } + + +@pytest.mark.parametrize('document_type,result', + 
[(None, None), ('other', 'coar:c_1843'), + ('livre', 'coar:c_2f33'), + ('chapitre', 'coar:c_3248'), + ('conference', 'coar:c_5794'), + ('scientifique', 'coar:c_6501'), + ('professionnel', 'coar:c_3e5a'), + ('rapport', 'coar:c_18ws'), + ('THESES', 'coar:c_db06'), + ('non-existing', 'coar:c_1843')]) +def test_document_type(document_type, result): + """Test document type.""" + if not document_type: + # No 980 + xml = """ + + + + + + + + + """ + assert ArodesSchema().dump(xml) == {} + + # No 980$a + xml = """ + + + + + + + + + + + """ + assert ArodesSchema().dump(xml) == {} + + return + + xml = f""" + + + + + + {document_type} + + + + + + """ + assert ArodesSchema().dump(xml) == {'documentType': result} + + +def test_language(): + """Test language.""" + # No 041 + xml = """ + + + + + + + + """ + assert ArodesSchema().dump(xml) == {} + + # No 041$a + xml = """ + + + + + + + + + + + """ + assert ArodesSchema().dump(xml) == {} + + # One language + xml = """ + + + + + + eng + + + + + + """ + assert ArodesSchema().dump(xml) == { + 'language': [{ + 'type': 'bf:Language', + 'value': 'eng' + }] + } + + # Multiple 041 + xml = """ + + + + + + eng + + + fre + + + + + + """ + assert ArodesSchema().dump(xml) == { + 'language': [{ + 'type': 'bf:Language', + 'value': 'eng' + }, { + 'type': 'bf:Language', + 'value': 'fre' + }] + } + + # Multiple 041$a + xml = """ + + + + + + eng + fre + + + + + + """ + assert ArodesSchema().dump(xml) == { + 'language': [{ + 'type': 'bf:Language', + 'value': 'eng' + }, { + 'type': 'bf:Language', + 'value': 'fre' + }] + } + + +def test_abstracts(): + """Test abstracts.""" + # No 520 + xml = ''' + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # No 520$a + xml = ''' + + + + + + fre + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # No language + xml = ''' + + + + + + La Convention relative + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'abstracts': [{ + 'language': 'eng', + 'value': 'La Convention relative' + }] + } + 
+ # One abstracts + xml = ''' + + + + + + fre + La Convention relative + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'abstracts': [{ + 'language': 'fre', + 'value': 'La Convention relative' + }] + } + + # Multiple abstracts + xml = ''' + + + + + + fre + La Convention relative + + + eng + The Convention + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'abstracts': [{ + 'language': 'fre', + 'value': 'La Convention relative' + }, { + 'language': 'eng', + 'value': 'The Convention' + }] + } + + +def test_oa_status(): + """Test OA status.""" + # No 906 + xml = ''' + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # No 906$a + xml = ''' + + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # Value NONE + xml = ''' + + + + + + NONE + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # OK + xml = ''' + + + + + + GOLD + + + + + + ''' + assert ArodesSchema().dump(xml) == {'oa_status': 'gold'} + + +def test_date(): + """Test Date.""" + # No 269$a, no 260$c + xml = ''' + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # 269, but no $a + xml = ''' + + + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # 269$a, but wrong format. + xml = ''' + + + + + + wrong + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # 269$a OK + xml = ''' + + + + + + 2019-01 + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'provisionActivity': [{ + 'startDate': '2019-01-01', + 'type': 'bf:Publication' + }] + } + + # 260, but no $c + xml = ''' + + + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # 260$c, but wrong format. 
+ xml = ''' + + + + + + wrong + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # 260$c OK + xml = ''' + + + + + + 2019-01 + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'provisionActivity': [{ + 'startDate': '2019-01-01', + 'type': 'bf:Publication' + }] + } + + # 269$a and 260$c, 269 have priority + xml = ''' + + + + + + 2020-01 + + + 2019-01 + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'provisionActivity': [{ + 'startDate': '2019-01-01', + 'type': 'bf:Publication' + }] + } + + +def test_subjects(): + """Test subjects.""" + # No 653 + xml = ''' + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # 653 but not $a + xml = ''' + + + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # OK, but no language --> default language `eng` + xml = ''' + + + + + + subject 1 + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'subjects': [{ + 'label': { + 'language': 'eng', + 'value': ['subject 1'] + } + }] + } + + # OK + xml = ''' + + + + + + sujet 1 + fre + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'subjects': [{ + 'label': { + 'language': 'fre', + 'value': ['sujet 1'] + } + }] + } + + # Multiple subjects + xml = ''' + + + + + + sujet 1 + fre + + + sujet 2 + fre + + + subject 1 + eng + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'subjects': [{ + 'label': { + 'language': 'fre', + 'value': ['sujet 1', 'sujet 2'] + } + }, { + 'label': { + 'language': 'eng', + 'value': ['subject 1'] + } + }] + } + + +def test_dissertation(): + """Test dissertation.""" + # OK + xml = ''' + + + + + + Dissertation degree + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'dissertation': { + 'degree': 'Dissertation degree' + } + } + + # No 502 + xml = ''' + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # 502, but no $b + xml = ''' + + + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + +def test_host_document(): + """Test host document.""" + # No 773 + xml = ''' + + + + + + + 
+ + ''' + assert ArodesSchema().dump(xml) == {} + + # No 773$t + xml = ''' + + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # Not $g, no provision activity start date + xml = ''' + + + + + + Host document + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # Not $g, with provision activity start date + xml = ''' + + + + + + 2019-01 + + + Host document + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'partOf': [{ + 'document': { + 'title': 'Host document' + }, + 'numberingYear': '2019' + }], + 'provisionActivity': [{ + 'startDate': '2019-01-01', + 'type': 'bf:Publication' + }] + } + + # OK + xml = ''' + + + + + + Host document + 2015, vol. 37, no. 2, pp. 49-58 + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'partOf': [{ + 'document': { + 'title': 'Host document' + }, + 'numberingYear': '2015', + 'numberingVolume': '37', + 'numberingIssue': '2', + 'numberingPages': '49-58' + }] + } + + +def test_contribution(): + """Test contribution.""" + # No 700 + xml = ''' + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # No 700$a + xml = ''' + + + + + + + + + + ''' + assert ArodesSchema().dump(xml) == {} + + # OK + xml = ''' + + + + + + John Doe + RERO + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'contribution': [{ + 'agent': { + 'preferred_name': 'John Doe', + 'type': 'bf:Person' + }, + 'role': ['ctb'], + 'affiliation': 'RERO' + }] + } + + # Multiple + xml = ''' + + + + + + John Doe + RERO + + + Marc Landers + HES-SO Valais + + + + + + ''' + assert ArodesSchema().dump(xml) == { + 'contribution': [{ + 'agent': { + 'preferred_name': 'John Doe', + 'type': 'bf:Person' + }, + 'role': ['ctb'], + 'affiliation': 'RERO' + }, { + 'agent': { + 'preferred_name': 'Marc Landers', + 'type': 'bf:Person' + }, + 'role': ['ctb'], + 'affiliation': 'HES-SO Valais' + }] + } diff --git a/tests/unit/documents/loaders/test_edoc_loader.py b/tests/unit/documents/loaders/test_edoc_loader.py new file mode 100644 index 
000000000..a90aef338 --- /dev/null +++ b/tests/unit/documents/loaders/test_edoc_loader.py @@ -0,0 +1,542 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""Test edoc record loader.""" + +import pytest + +from sonar.modules.documents.loaders.schemas.edoc import EdocSchema + + +def test_no_record_metadata(): + """Test when no record data exists.""" + xml = """ + +
+ oai:edoc.unibas.ch:4 +
+
+ """ + assert not EdocSchema().dump(xml) + + +def test_language(): + """Test language.""" + # No language --> default + xml = """ + +
+ 123456 +
+ + + Title + + +
+ """ + assert EdocSchema().dump(xml)['language'] == [{ + 'type': 'bf:Language', + 'value': 'eng' + }] + + # One language + xml = """ + +
+ 123456 +
+ + + deu + + +
+ """ + assert EdocSchema().dump(xml)['language'] == [{ + 'type': 'bf:Language', + 'value': 'ger' + }] + + # Multiple languages + xml = """ + +
+ 123456 +
+ + + deu + fra + eng + + +
+ """ + assert EdocSchema().dump(xml)['language'] == [{ + 'type': 'bf:Language', + 'value': 'ger' + }, { + 'type': 'bf:Language', + 'value': 'fre' + }, { + 'type': 'bf:Language', + 'value': 'eng' + }] + + +def test_identifiers(): + """Test identifiers.""" + # No specific identifiers + xml = """ + +
+ 123456 +
+ + + Title + + +
+ """ + assert EdocSchema().dump(xml)['identifiedBy'] == [{ + 'type': 'bf:Local', + 'source': 'edoc', + 'value': '123456' + }] + + # All identifiers + xml = """ + +
+ 123456 +
+ + + specific-id + info:doi/10.5451/unibas-001565177 + info:pmid/1111 + urn:urn:nbn:ch:bel-bau-diss47638 + + +
+ """ + assert EdocSchema().dump(xml)['identifiedBy'] == [{ + 'source': 'edoc', + 'type': 'bf:Local', + 'value': '123456' + }, { + 'type': 'bf:Identifier', + 'value': 'specific-id' + }, { + 'type': + 'bf:Doi', + 'value': + '10.5451/unibas-001565177' + }, { + 'source': 'PMID', + 'type': 'bf:Local', + 'value': '1111' + }, { + 'type': + 'bf:Urn', + 'value': + 'urn:nbn:ch:bel-bau-diss47638' + }] + + +def test_title(): + """Test title.""" + # No title --> default one + xml = """ + +
+ 123456 +
+ + + Description + + +
+ """ + assert EdocSchema().dump(xml)['title'] == [{ + 'type': + 'bf:Title', + 'mainTitle': [{ + 'value': 'Default title', + 'language': 'eng' + }] + }] + + # Only title + xml = """ + +
+ 123456 +
+ + + Title + + +
+ """ + assert EdocSchema().dump(xml)['title'] == [{ + 'type': + 'bf:Title', + 'mainTitle': [{ + 'value': 'Title', + 'language': 'eng' + }] + }] + + # Title + subtitle + xml = """ + +
+ 123456 +
+ + + Title : Subtitle + + +
+ """ + assert EdocSchema().dump(xml)['title'] == [{ + 'type': + 'bf:Title', + 'mainTitle': [{ + 'value': 'Title', + 'language': 'eng' + }], + 'subtitle': [{ + 'value': 'Subtitle', + 'language': 'eng' + }] + }] + + +def test_provision_activity(): + """Test provision activity.""" + # No provision activity + xml = """ + +
+ 123456 +
+ + + Description + + +
+ """ + assert 'provisionActivity' not in EdocSchema().dump(xml) + + # Wrong date format + xml = """ + +
+ 123456 +
+ + + wrong + + +
+ """ + assert 'provisionActivity' not in EdocSchema().dump(xml) + + # OK + xml = """ + +
+ 123456 +
+ + + 2019 + + +
+ """ + assert EdocSchema().dump(xml)['provisionActivity'] == [{ + 'type': 'bf:Publication', + 'startDate': '2019' + }] + + +def test_document_type(): + """Test document type.""" + # No document type --> other + xml = """ + +
+ 123456 +
+ + + Title + + +
+ """ + assert EdocSchema().dump(xml)['documentType'] == 'coar:c_1843' + + # Multiple, takes only the first + xml = """ + +
+ 123456 +
+ + + Thesis + NonPeerReviewed + + +
+ """ + assert EdocSchema().dump(xml)['documentType'] == 'coar:c_db06' + + # Non-existing, takes "other" + xml = """ + +
+ 123456 +
+ + + Unknown + + +
+ """ + assert EdocSchema().dump(xml)['documentType'] == 'coar:c_1843' + + +@pytest.mark.parametrize( + 'document_type,result', + [('Book', 'coar:c_2f33'), ('Book Section', 'coar:c_3248'), + ('Conference', 'coar:c_c94f'), ('Workshop Item', 'coar:c_c94f'), + ('Research Data', 'coar:c_ddb1'), ('Article', 'coar:c_6501'), + ('Newspaper', 'coar:c_998f'), ('Magazine Article', 'coar:c_998f'), + ('Audiovisual Material & Event', 'non_textual_object'), + ('Preprint', 'coar:c_816b'), ('Thesis', 'coar:c_db06'), + ('Working Paper', 'coar:c_8042'), ('Other', 'coar:c_1843')]) +def test_document_type_mappings(document_type, result): + """Test document type mappings.""" + xml = f""" + +
+ 123456 +
+ + + {document_type} + + +
+ """ + assert EdocSchema().dump(xml)['documentType'] == result + + +def test_abstracts(): + """Test abstracts.""" + # No abstract + xml = """ + +
+ 123456 +
+ + + Title + + +
+ """ + assert 'abstracts' not in EdocSchema().dump(xml) + + # One abstract + xml = """ + +
+ 123456 +
+ + + Description + + +
+ """ + assert EdocSchema().dump(xml)['abstracts'] == [{ + 'language': 'eng', + 'value': 'Description' + }] + + +def test_subjects(): + """Test subjects.""" + # No subject + xml = """ + +
+ 123456 +
+ + + Title + + +
+ """ + assert 'subjects' not in EdocSchema().dump(xml) + + # One subject + xml = """ + +
+ 123456 +
+ + + Subject 1 + + +
+ """ + assert EdocSchema().dump(xml)['subjects'] == [{ + 'label': { + 'language': 'eng', + 'value': ['Subject 1'] + } + }] + + # Multiple subjects + xml = """ + +
+ 123456 +
+ + + Subject 1 + Subject 2 + + +
+ """ + assert EdocSchema().dump(xml)['subjects'] == [{ + 'label': { + 'language': 'eng', + 'value': ['Subject 1', 'Subject 2'] + } + }] + + +def test_contribution(): + """Test contribution.""" + # No contribution + xml = """ + +
+ 123456 +
+ + + Title + + +
+ """ + assert 'contribution' not in EdocSchema().dump(xml) + + # OK, one creator, multiple contributors + xml = """ + +
+ 123456 +
+ + + Creator + Contributor 1 + Contributor 2 + + +
+ """ + assert EdocSchema().dump(xml)['contribution'] == [{ + 'agent': { + 'type': 'bf:Person', + 'preferred_name': 'Creator' + }, + 'role': ['cre'] + }, { + 'agent': { + 'type': 'bf:Person', + 'preferred_name': 'Contributor 1' + }, + 'role': ['ctb'] + }, { + 'agent': { + 'type': 'bf:Person', + 'preferred_name': 'Contributor 2' + }, + 'role': ['ctb'] + }] + + # OK, multiple creators, one contributor + xml = """ + +
+ 123456 +
+ + + Creator 1 + Creator 2 + Contributor + + +
+ """ + assert EdocSchema().dump(xml)['contribution'] == [{ + 'agent': { + 'type': 'bf:Person', + 'preferred_name': 'Creator 1' + }, + 'role': ['cre'] + }, { + 'agent': { + 'type': 'bf:Person', + 'preferred_name': 'Creator 2' + }, + 'role': ['cre'] + }, { + 'agent': { + 'type': 'bf:Person', + 'preferred_name': 'Contributor' + }, + 'role': ['ctb'] + }] diff --git a/tests/unit/documents/loaders/test_zora_loader.py b/tests/unit/documents/loaders/test_zora_loader.py new file mode 100644 index 000000000..65f662732 --- /dev/null +++ b/tests/unit/documents/loaders/test_zora_loader.py @@ -0,0 +1,712 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +"""Test ZORA record loader.""" + +import pytest + +from sonar.modules.documents.loaders.schemas.zora import ZoraSchema + + +def test_title(): + """Test title.""" + xml = """ + + + + + + + + """ + assert ZoraSchema().dump(xml) == {} + + xml = """ + + + + + + Art and design as linked data : + the LODZ project (Linked Open Data Zurich) + + + + + + """ + assert ZoraSchema().dump(xml) == { + 'title': [{ + 'mainTitle': [{ + 'language': 'eng', + 'value': 'Art and design as linked data :' + }], + 'subtitle': [{ + 'language': 'eng', + 'value': 'the LODZ project (Linked Open Data Zurich)' + }], + 'type': + 'bf:Title' + }] + } + + +def test_identifiers(): + """Test identifiers.""" + xml = """ + + + + + 1972 + + doi + 10.15291/libellarium.v9i2.256 + + + doi + + + pmid + 2222 + + + UNKNOWN + 1111 + + + + + + """ + assert ZoraSchema().dump(xml) == { + 'identifiedBy': [{ + 'source': 'ZORA', + 'type': 'bf:Local', + 'value': '1972' + }, { + 'type': 'bf:Doi', + 'value': '10.15291/libellarium.v9i2.256' + }, { + 'type': 'bf:Local', + 'value': '2222', + 'source': 'PMID' + }, { + 'type': 'bf:Identifier', + 'value': '1111' + }] + } + + +@pytest.mark.parametrize('type, value, result, dissertation', [ + (None, None, None, None), + ('local', 'Herausgegebenes wissenschaftliches Werk', 'coar:c_2f33', None), + ('local', 'Monografie', 'coar:c_2f33', None), + ('local', 'Buchkapitel', 'coar:c_3248', None), + ('local', 'Konferenzbeitrag', 'coar:c_5794', None), + ('local', 'Artikel', 'coar:c_6501', None), + ('local', 'Zeitungsartikel', 'coar:c_998f', None), + ('gnd-content', 'Forschungsbericht', 'coar:c_18ws', None), + ('gnd-content', 'Hochschulschrift', 'coar:c_db06', 'Dissertation'), + ('gnd-content', 'Hochschulschrift', 'coar:c_bdcc', 'Masterarbeit'), + ('gnd-content', 'Hochschulschrift', 'habilitation_thesis', 'Habilitation'), + ('local', 'Working Paper', 'coar:c_8042', None), + ('local', 'non-existing', 'coar:c_1843', None) +]) +def test_document_type(type, value, result, dissertation): + 
"""Test document type.""" + if not type: + # No 655 + xml = """ + + + + + + + + + """ + assert ZoraSchema().dump(xml) == {} + + # No 655$a + xml = """ + + + + + + + + + + + """ + assert ZoraSchema().dump(xml) == {} + + # No 655$2 + xml = """ + + + + + + Doc type + + + + + + """ + assert ZoraSchema().dump(xml) == {} + + return + + xml = f""" + + + + + + {dissertation} + + + {value} + {type} + + + + + + """ + assert ZoraSchema().dump(xml)['documentType'] == result + + +def test_language(): + """Test language.""" + # No 041 + xml = """ + + + + + + + + """ + assert ZoraSchema().dump(xml) == {} + + # No 041$a + xml = """ + + + + + + + + + + + """ + assert ZoraSchema().dump(xml) == {} + + # One language + xml = """ + + + + + + eng + + + + + + """ + assert ZoraSchema().dump(xml) == { + 'language': [{ + 'type': 'bf:Language', + 'value': 'eng' + }] + } + + # Multiple 041 + xml = """ + + + + + + eng + + + fre + + + + + + """ + assert ZoraSchema().dump(xml) == { + 'language': [{ + 'type': 'bf:Language', + 'value': 'eng' + }, { + 'type': 'bf:Language', + 'value': 'fre' + }] + } + + # Multiple 041$a + xml = """ + + + + + + eng + fre + + + + + + """ + assert ZoraSchema().dump(xml) == { + 'language': [{ + 'type': 'bf:Language', + 'value': 'eng' + }, { + 'type': 'bf:Language', + 'value': 'fre' + }] + } + + +def test_abstracts(): + """Test abstracts.""" + # No 520 + xml = ''' + + + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + # No 520$a + xml = ''' + + + + + + fre + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + # No language + xml = ''' + + + + + + La Convention relative + + + + + + ''' + assert ZoraSchema().dump(xml) == { + 'abstracts': [{ + 'language': 'eng', + 'value': 'La Convention relative' + }] + } + + # One abstracts + xml = ''' + + + + + + fre + La Convention relative + + + + + + ''' + assert ZoraSchema().dump(xml) == { + 'abstracts': [{ + 'language': 'fre', + 'value': 'La Convention relative' + }] + } + + # Multiple abstracts + xml = ''' + + + + + + 
fre + La Convention relative + + + eng + The Convention + + + + + + ''' + assert ZoraSchema().dump(xml) == { + 'abstracts': [{ + 'language': 'fre', + 'value': 'La Convention relative' + }, { + 'language': 'eng', + 'value': 'The Convention' + }] + } + + +def test_date(): + """Test Date.""" + # No 264$c + xml = ''' + + + + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + # 264$c, but wrong format. + xml = ''' + + + + + + wrong + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + # 264$c OK + xml = ''' + + + + + + 2019 + + + + + + ''' + assert ZoraSchema().dump(xml) == { + 'provisionActivity': [{ + 'startDate': '2019', + 'type': 'bf:Publication' + }] + } + + +def test_dissertation(): + """Test dissertation.""" + # OK + xml = ''' + + + + + + Dissertation degree + Universität Zürich + 2007 + + + + + + ''' + assert ZoraSchema().dump(xml) == { + 'dissertation': { + 'degree': 'Dissertation degree', + 'grantingInstitution': 'Universität Zürich', + 'date': '2007' + } + } + + # No 502 + xml = ''' + + + + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + # 502, but no $b + xml = ''' + + + + + + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + +def test_host_document(): + """Test host document.""" + # No 773 + xml = ''' + + + + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + # No 773$t + xml = ''' + + + + + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + # Not $g, no provision activity start date + xml = ''' + + + + + + Host document + + + + + + ''' + assert ZoraSchema().dump(xml) == {} + + # Not $g, with provision activity start date + xml = ''' + + + + + + 2019 + + + Host document + + + + + + ''' + assert ZoraSchema().dump(xml) == { + 'partOf': [{ + 'document': { + 'title': 'Host document' + }, + 'numberingYear': '2019' + }], + 'provisionActivity': [{ + 'startDate': '2019', + 'type': 'bf:Publication' + }] + } + + # OK + xml = ''' + + + + + + Host document + Bd. 16, Nr. 3, S. 
411-413 (2002) + + + + + + ''' + assert ZoraSchema().dump(xml) == { + 'partOf': [{ + 'document': { + 'title': 'Host document' + }, + 'numberingYear': '2002', + 'numberingVolume': '16', + 'numberingIssue': '3', + 'numberingPages': '411-413' + }] + } + + +def test_contribution_from_field_100(): + """Test extracting contribution from field 100.""" + # OK + xml = """ + + + Romagnani, Andrea + VerfasserIn + aut + (orcid)0000-0003-3669-3497 + + + """ + data = ZoraSchema().dump(xml) + assert data.get('contribution') == [{ + 'agent': { + 'type': 'bf:Person', + 'preferred_name': 'Romagnani, Andrea', + 'identifiedBy': { + 'type': 'bf:Local', + 'source': 'ORCID', + 'value': '0000-0003-3669-3497' + } + }, + 'role': ['cre'] + }] + + # Not $a + xml = """ + + + + + """ + data = ZoraSchema().dump(xml) + assert not data.get('contribution') + + +def test_contribution_from_field_700(): + """Test extracting contribution from field 700.""" + # OK, with bad ORCID + xml = """ + + + Romagnani, Andrea + AkademischeR BetreuerIn + dgs + non-orcid + + + """ + data = ZoraSchema().dump(xml) + assert data.get('contribution') == [{ + 'agent': { + 'type': 'bf:Person', + 'preferred_name': 'Romagnani, Andrea' + }, + 'role': ['dgs'] + }] + + # Not $a + xml = """ + + + + + """ + data = ZoraSchema().dump(xml) + assert not data.get('contribution')