diff --git a/data/oai_sources.json b/data/oai_sources.json
index 228fa7372..4744c5013 100644
--- a/data/oai_sources.json
+++ b/data/oai_sources.json
@@ -22,5 +22,29 @@
"metadataprefix": "oai_openaire",
"comment": "",
"setspecs": ""
+ },
+ {
+ "key": "arodes",
+ "name": "ArODES",
+ "url": "https://hesso.tind.io/oai2d",
+ "metadataprefix": "marcxml",
+ "comment": "",
+ "setspecs": ""
+ },
+ {
+ "key": "zora",
+ "name": "Zora",
+ "url": "https://www.zora.uzh.ch/cgi/oai2",
+ "metadataprefix": "marc21",
+ "comment": "",
+ "setspecs": ""
+ },
+ {
+ "key": "edoc",
+ "name": "edoc",
+ "url": "https://edoc.unibas.ch/cgi/oai2",
+ "metadataprefix": "oai_dc",
+ "comment": "",
+ "setspecs": ""
}
]
diff --git a/sonar/modules/documents/dojson/arodes/__init__.py b/sonar/modules/documents/dojson/arodes/__init__.py
new file mode 100644
index 000000000..c55a39c2e
--- /dev/null
+++ b/sonar/modules/documents/dojson/arodes/__init__.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""DOJSON transformation for ArODES."""
diff --git a/sonar/modules/documents/dojson/arodes/model.py b/sonar/modules/documents/dojson/arodes/model.py
new file mode 100644
index 000000000..7bc89b158
--- /dev/null
+++ b/sonar/modules/documents/dojson/arodes/model.py
@@ -0,0 +1,343 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""DOJSON transformation for ArODES."""
+
+import re
+
+from dojson import utils
+
+from sonar.modules.documents.dojson.overdo import Overdo
+
+overdo = Overdo()
+
# Document type code for each ArODES type read from field 980 $a. Types which
# are not listed are mapped to the 'other' entry by `document_type_from_980`.
TYPE_MAPPINGS = {
    'livre': 'coar:c_2f33',
    'chapitre': 'coar:c_3248',
    'conference': 'coar:c_5794',
    'scientifique': 'coar:c_6501',
    'professionnel': 'coar:c_3e5a',
    'rapport': 'coar:c_18ws',
    'THESES': 'coar:c_db06',
    'other': 'coar:c_1843',
}

# Open access statuses accepted from field 906 $a (compared in lower case).
OA_STATUS = [
    'green',
    'gold',
    'hybrid',
    'bronze',
    'closed',
]
+
+
@overdo.over('identifiedBy', '001')
@utils.ignore_value
def identified_by_from_001(self, key, value):
    """Store the content of field 001 as a local ArODES identifier.

    :param self: Record being built; its current `identifiedBy` list is reused.
    :param key: Matched field key (unused).
    :param value: Content of field 001.
    :returns: The list of identifiers with the new entry appended.
    """
    local_identifier = {
        'type': 'bf:Local',
        'source': 'ArODES',
        'value': value
    }

    identifiers = self.get('identifiedBy', [])
    identifiers.append(local_identifier)

    return identifiers
+
+
@overdo.over('identifiedBy', '^0247.')
@utils.ignore_value
def identified_by_from_024(self, key, value):
    """Get DOI and PMID identifiers from field 024.

    :param self: Record being built; its current `identifiedBy` list is reused.
    :param key: Matched field key (unused).
    :param value: Subfields of field 024; $a holds the identifier and $2 its
        source.
    :returns: The list of identifiers with the new entry appended, or None
        when $a is empty or $2 is neither `DOI` nor `PMID`.
    """
    identifier = value.get('a')
    source = value.get('2')

    if not identifier or source not in ('DOI', 'PMID'):
        return None

    identified_by = self.get('identifiedBy', [])

    if source == 'DOI':
        identified_by.append({'type': 'bf:Doi', 'value': identifier})
    else:
        # A PMID is kept as a local identifier whose source is PMID, the same
        # structure the ZORA transformation produces.
        identified_by.append({
            'type': 'bf:Local',
            'value': identifier,
            'source': 'PMID'
        })

    return identified_by
+
+
@overdo.over('title', '^245..')
@utils.for_each_value
@utils.ignore_value
def title_from_245(self, key, value):
    """Build a title entry from field 245.

    :param self: Record being built (unused).
    :param key: Matched field key (unused).
    :param value: Subfields of field 245: $a main title, $b subtitle and
        $9 language.
    :returns: Title dictionary; the subtitle is only present when $b is set.
    """
    language = value.get('9', 'eng')

    def localized(text):
        """Wrap a text in the one-item list of language-tagged values."""
        return [{'value': text, 'language': language}]

    title = {
        'type': 'bf:Title',
        'mainTitle': localized(value.get('a', 'No title found'))
    }

    subtitle = value.get('b')
    if subtitle:
        title['subtitle'] = localized(subtitle)

    return title
+
+
@overdo.over('documentType', '^980')
@utils.ignore_value
def document_type_from_980(self, key, value):
    """Map the type found in field 980 $a to a document type.

    :param self: Record being built; an already stored document type wins.
    :param key: Matched field key (unused).
    :param value: Subfields of field 980.
    :returns: Mapped document type, the 'other' mapping for an unknown type,
        or None when a type is already stored or $a is empty.
    """
    source_type = value.get('a', None)

    # Keep the first document type found in the record.
    if self.get('documentType'):
        return None

    if not source_type:
        return None

    return TYPE_MAPPINGS.get(source_type, TYPE_MAPPINGS['other'])
+
+
@overdo.over('language', '^041')
@utils.for_each_value
@utils.ignore_value
def language_from_041(self, key, value):
    """Add the language codes of field 041 $a to the record.

    The languages are written directly into the record, which is why the
    function itself always returns None.

    :param self: Record being built; its `language` list is extended.
    :param key: Matched field key (unused).
    :param value: Subfields of field 041.
    :returns: None.
    """
    codes = value.get('a')
    if not codes:
        return None

    languages = self.get('language', [])
    languages.extend({
        'type': 'bf:Language',
        'value': code
    } for code in utils.force_list(codes))
    self['language'] = languages

    return None
+
+
@overdo.over('abstracts', '^520..')
@utils.for_each_value
@utils.ignore_value
def abstract_from_520(self, key, value):
    """Add the abstract of field 520 to the record.

    The abstract is written directly into the record, which is why the
    function itself always returns None.

    :param self: Record being built; its `abstracts` list is extended.
    :param key: Matched field key (unused).
    :param value: Subfields of field 520: $a text and $9 language.
    :returns: None.
    """
    text = value.get('a')
    if not text:
        return None

    entry = {'value': text, 'language': value.get('9', 'eng')}

    stored = self.get('abstracts', [])
    stored.append(entry)
    self['abstracts'] = stored

    return None
+
+
@overdo.over('oa_status', '^906..')
@utils.ignore_value
def oa_status_from_906(self, key, value):
    """Get the open access status from field 906 $a.

    :param self: Record being built (unused).
    :param key: Matched field key (unused).
    :param value: Subfields of field 906.
    :returns: Lower-cased status when it is one of `OA_STATUS`, else None.
    """
    status = value.get('a', 'none').lower()

    return status if status in OA_STATUS else None
+
+
@overdo.over('date', '^269..')
@utils.ignore_value
def date_from_269(self, key, value):
    """Set the publication start date from field 269 $a.

    The date is written directly into the record's provision activity, which
    is why the function itself always returns None.

    :param self: Record being built.
    :param key: Matched field key (unused).
    :param value: Subfields of field 269.
    :returns: None.
    """
    raw_date = value.get('a')

    if not raw_date:
        return None

    # Only a year followed by a month ("YYYY-MM") is accepted; any other
    # format is ignored.
    if re.match(r'^[0-9]{4}-[0-9]{2}$', raw_date) is None:
        return None

    # The first day of the month completes the date.
    add_provision_activity_start_date(self, raw_date + '-01')

    return None
+
+
@overdo.over('date', '^260..')
@utils.ignore_value
def date_from_260(self, key, value):
    """Set the publication start date from field 260 $c.

    The date is written directly into the record's provision activity, which
    is why the function itself always returns None.

    :param self: Record being built.
    :param key: Matched field key (unused).
    :param value: Subfields of field 260.
    :returns: None.
    """
    raw_date = value.get('c')

    if not raw_date:
        return None

    # Only a year followed by a month ("YYYY-MM") is accepted; any other
    # format is ignored.
    if re.match(r'^[0-9]{4}-[0-9]{2}$', raw_date) is None:
        return None

    # The first day of the month completes the date.
    add_provision_activity_start_date(self, raw_date + '-01')

    return None
+
+
@overdo.over('subjects', '^653..')
@utils.for_each_value
@utils.ignore_value
def subjects_from_653(self, key, value):
    """Add the subject of field 653 to the record.

    Subjects are grouped by language directly in the record, which is why the
    function itself always returns None.

    :param self: Record being built; its `subjects` list is updated.
    :param key: Matched field key (unused).
    :param value: Subfields of field 653: $a subject and $9 language.
    :returns: None.
    """
    label = value.get('a')
    if not label:
        return None

    # One subject entry exists per language; the label joins its values.
    entry = get_subject_for_language(self, value.get('9', 'eng'))
    entry['label']['value'].append(label)

    return None
+
+
@overdo.over('dissertation', '^502..')
@utils.ignore_value
def dissertation_from_field_502(self, key, value):
    """Get the dissertation degree from field 502 $b.

    :param self: Record being built (unused).
    :param key: Matched field key (unused).
    :param value: Subfields of field 502.
    :returns: Dictionary holding the degree, or None when $b is empty.
    """
    degree = value.get('b')

    return {'degree': degree} if degree else None
+
+
@overdo.over('partOf', '^773..')
@utils.ignore_value
def host_document_from_field_773(self, key, value):
    """Get the host document from field 773.

    The host title is read from $t and the numbering (year, volume, issue and
    pages) is parsed from $g. When $g is empty, the year is taken from the
    start date of the first provision activity stored in the record.

    :param self: Record being built.
    :param key: Matched field key (unused).
    :param value: Subfields of field 773.
    :returns: One-item list holding the host document, or None when the title
        is missing or no year can be determined.
    """
    title = value.get('t')
    if not title:
        return None

    part_of = {'document': {'title': title}}
    numbering = value.get('g')

    if numbering:
        # Each pattern captures one part of the numbering string.
        patterns = (
            ('numberingYear', r'^(\d{4})'),
            ('numberingVolume', r'vol\.\s(\d+)'),
            ('numberingIssue', r'no\.\s(\d+)'),
            ('numberingPages', r'pp\.\s([0-9\-–]+)'),
        )
        for field, pattern in patterns:
            match = re.search(pattern, numbering)
            if match:
                part_of[field] = match.group(1)
    else:
        # The start date can be unset or may not begin with a year; the host
        # document is then dropped below instead of raising an error.
        activities = self.get('provisionActivity') or [{}]
        start_date = activities[0].get('startDate')
        match = re.search(r'^(\d{4})', start_date) if start_date else None
        if match:
            part_of['numberingYear'] = match.group(1)

    # A host document without year is not kept.
    if not part_of.get('numberingYear'):
        return None

    return [part_of]
+
+
@overdo.over('contribution', '^700..')
@utils.for_each_value
@utils.ignore_value
def contribution_from_700(self, key, value):
    """Build a contributor entry from field 700.

    :param self: Record being built (unused).
    :param key: Matched field key (unused).
    :param value: Subfields of field 700: $a name and $u affiliation.
    :returns: Contribution with role `ctb`, or None when $a is empty.
    """
    person_name = value.get('a')
    if not person_name:
        return None

    contribution = {
        'agent': {
            'type': 'bf:Person',
            'preferred_name': person_name
        },
        'role': ['ctb']
    }

    institution = value.get('u')
    if institution:
        contribution['affiliation'] = institution

    return contribution
+
+
def add_provision_activity_start_date(data, date):
    """Set the start date of the publication in the provision activity.

    The first stored `bf:Publication` entry is updated and moved to the end
    of the list; a new entry is created when there is none.

    :param data: Data dictionary.
    :param date: Date to add.
    """
    activities = data.get('provisionActivity', [])

    # Position of the first stored publication, if any.
    position = next((index for index, activity in enumerate(activities)
                     if activity['type'] == 'bf:Publication'), None)

    if position is None:
        publication = {'type': 'bf:Publication', 'startDate': None}
    else:
        publication = activities.pop(position)

    publication['startDate'] = date
    activities.append(publication)

    # The list is re-assigned as it may not be stored in the data yet.
    data['provisionActivity'] = activities
+
+
def get_subject_for_language(data, language):
    """Return the subject entry of a language, creating it when missing.

    :param dict data: Overdo data
    :param str language: Language code
    :returns: Subject object stored in `data['subjects']`
    :rtype: Dict
    """
    if not data.get('subjects'):
        data['subjects'] = []

    matching = list(
        filter(lambda item: item['label']['language'] == language,
               data.get('subjects', [])))

    if matching:
        return matching[0]

    # No subject for this language yet: store an empty one.
    created = {'label': {'language': language, 'value': []}}
    data['subjects'].append(created)

    return created
diff --git a/sonar/modules/documents/dojson/zora/__init__.py b/sonar/modules/documents/dojson/zora/__init__.py
new file mode 100644
index 000000000..1e768442e
--- /dev/null
+++ b/sonar/modules/documents/dojson/zora/__init__.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""DOJSON transformation for ZORA."""
diff --git a/sonar/modules/documents/dojson/zora/model.py b/sonar/modules/documents/dojson/zora/model.py
new file mode 100644
index 000000000..082a61753
--- /dev/null
+++ b/sonar/modules/documents/dojson/zora/model.py
@@ -0,0 +1,328 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""DOJSON transformation for ZORA."""
+
+import re
+
+from dojson import utils
+
+from sonar.modules.documents.dojson.overdo import Overdo
+
+overdo = Overdo()
+
+
@overdo.over('identifiedBy', '001')
@utils.ignore_value
def identified_by_from_001(self, key, value):
    """Store the content of field 001 as a local ZORA identifier.

    :param self: Record being built; its current `identifiedBy` list is reused.
    :param key: Matched field key (unused).
    :param value: Content of field 001.
    :returns: The list of identifiers with the new entry appended.
    """
    identifiers = self.get('identifiedBy', [])
    identifiers.append(dict([
        ('type', 'bf:Local'),
        ('source', 'ZORA'),
        ('value', value),
    ]))

    return identifiers
+
+
@overdo.over('identifiedBy', '^0247.')
@utils.ignore_value
def identified_by_from_024(self, key, value):
    """Get an identifier from field 024.

    :param self: Record being built; its current `identifiedBy` list is reused.
    :param key: Matched field key (unused).
    :param value: Subfields of field 024; $a holds the identifier and $2 its
        source.
    :returns: The list of identifiers with the new entry appended, or None
        when $a is empty.
    """
    identifier = value.get('a')
    if not identifier:
        return None

    source = value.get('2')

    if source == 'doi':
        entry = {'type': 'bf:Doi', 'value': identifier}
    elif source == 'pmid':
        entry = {'type': 'bf:Local', 'value': identifier, 'source': 'PMID'}
    else:
        # Any other source is kept as a generic identifier.
        entry = {'type': 'bf:Identifier', 'value': identifier}

    identifiers = self.get('identifiedBy', [])
    identifiers.append(entry)

    return identifiers
+
+
@overdo.over('title', '^245..')
@utils.for_each_value
@utils.ignore_value
def title_from_245(self, key, value):
    """Build a title entry from field 245.

    :param self: Record being built (unused).
    :param key: Matched field key (unused).
    :param value: Subfields of field 245: $a main title, $b subtitle and
        $9 language.
    :returns: Title dictionary; the subtitle is only present when $b is set.
    """
    language = value.get('9', 'eng')
    main_entry = {
        'value': value.get('a', 'No title found'),
        'language': language
    }

    result = {'type': 'bf:Title', 'mainTitle': [main_entry]}

    secondary = value.get('b')
    if not secondary:
        return result

    result['subtitle'] = [{'value': secondary, 'language': language}]

    return result
+
+
# Document type for each value of 655 $a whose source ($2) is 'local'.
_LOCAL_DOCUMENT_TYPES = {
    'Herausgegebenes wissenschaftliches Werk': 'coar:c_2f33',  # Book
    'Monografie': 'coar:c_2f33',  # Book
    'Buchkapitel': 'coar:c_3248',  # Book part
    'Konferenzbeitrag': 'coar:c_5794',  # Conference paper
    'Artikel': 'coar:c_6501',  # Journal article
    'Zeitungsartikel': 'coar:c_998f',  # Newspaper article
    'Working Paper': 'coar:c_8042',  # Working paper
}

# Document type of a 'Hochschulschrift' for each degree found in 502 $b.
_THESIS_DOCUMENT_TYPES = {
    'Dissertation': 'coar:c_db06',  # Doctoral thesis
    'Masterarbeit': 'coar:c_bdcc',  # Master thesis
    'Habilitation': 'habilitation_thesis',  # Habilitation thesis
}

# Document type used when nothing more specific matches.
_OTHER_DOCUMENT_TYPE = 'coar:c_1843'


@overdo.over('documentType', '^655')
@utils.ignore_value
def document_type_from_655(self, key, value):
    """Get document type from 655 field.

    :param self: Record being built; an already stored document type wins.
    :param key: Matched field key (unused).
    :param value: Subfields of field 655: $a type label and $2 its source.
    :returns: Document type, or None when a type is already stored or when
        $a or $2 is empty.
    """
    source = value.get('2')
    label = value.get('a')

    if self.get('documentType') or not label or not source:
        return None

    if source == 'local':
        return _LOCAL_DOCUMENT_TYPES.get(label, _OTHER_DOCUMENT_TYPE)

    if source == 'gnd-content':
        # Research report
        if label == 'Forschungsbericht':
            return 'coar:c_18ws'

        # The kind of thesis is given by the degree stored in field 502 of
        # the source record.
        if label == 'Hochschulschrift':
            degree = overdo.blob_record.get('502__', {}).get('b')
            return _THESIS_DOCUMENT_TYPES.get(degree, _OTHER_DOCUMENT_TYPE)

    return _OTHER_DOCUMENT_TYPE
+
+
@overdo.over('language', '^041')
@utils.for_each_value
@utils.ignore_value
def language_from_041(self, key, value):
    """Add the language codes of field 041 $a to the record.

    The languages are written directly into the record, which is why the
    function itself always returns None.

    :param self: Record being built; its `language` list is extended.
    :param key: Matched field key (unused).
    :param value: Subfields of field 041.
    :returns: None.
    """
    if not value.get('a'):
        return None

    stored = self.get('language', [])

    for language_code in utils.force_list(value.get('a')):
        entry = {'type': 'bf:Language', 'value': language_code}
        stored.append(entry)

    self['language'] = stored

    return None
+
+
@overdo.over('abstracts', '^520..')
@utils.for_each_value
@utils.ignore_value
def abstract_from_520(self, key, value):
    """Add the abstract of field 520 to the record.

    The abstract is written directly into the record, which is why the
    function itself always returns None.

    :param self: Record being built; its `abstracts` list is extended.
    :param key: Matched field key (unused).
    :param value: Subfields of field 520: $a text and $9 language.
    :returns: None.
    """
    content = value.get('a')
    content_language = value.get('9', 'eng')

    if content:
        collected = self.get('abstracts', [])
        collected.append({'value': content, 'language': content_language})
        self['abstracts'] = collected

    return None
+
+
@overdo.over('date', '^264..')
@utils.ignore_value
def date_from_264(self, key, value):
    """Set the publication start date from field 264 $c.

    The date is written directly into the record's provision activity, which
    is why the function itself always returns None.

    :param self: Record being built.
    :param key: Matched field key (unused).
    :param value: Subfields of field 264.
    :returns: None.
    """
    year = value.get('c')

    if not year:
        return None

    # Only a four-digit year ("YYYY") is accepted; any other format is
    # ignored.
    if re.match(r'^[0-9]{4}$', year) is None:
        return None

    add_provision_activity_start_date(self, year)

    return None
+
+
@overdo.over('dissertation', '^502..')
@utils.ignore_value
def dissertation_from_field_502(self, key, value):
    """Get the dissertation information from field 502.

    :param self: Record being built (unused).
    :param key: Matched field key (unused).
    :param value: Subfields of field 502: $b degree, $c granting institution
        and $d date.
    :returns: Dissertation dictionary, or None when $b is empty.
    """
    degree = value.get('b')
    if not degree:
        return None

    dissertation = {'degree': degree}

    # Optional subfields, copied only when they have a value.
    optional = (('c', 'grantingInstitution'), ('d', 'date'))
    for subfield, target in optional:
        content = value.get(subfield)
        if content:
            dissertation[target] = content

    return dissertation
+
+
@overdo.over('partOf', '^773..')
@utils.ignore_value
def host_document_from_field_773(self, key, value):
    """Get the host document from field 773.

    The host title is read from $t and the numbering (year, volume, issue and
    pages) is parsed from $g. When $g is empty, the year is taken from the
    start date of the first provision activity stored in the record.

    :param self: Record being built.
    :param key: Matched field key (unused).
    :param value: Subfields of field 773.
    :returns: One-item list holding the host document, or None when the title
        is missing or no year can be determined.
    """
    title = value.get('t')
    if not title:
        return None

    part_of = {'document': {'title': title}}
    numbering = value.get('g')

    if numbering:
        # Each pattern captures one part of the numbering string; the year is
        # expected between parentheses at its very end.
        patterns = (
            ('numberingYear', r'\((\d{4})\)$'),
            ('numberingVolume', r'Bd\.\s(\d+)'),
            ('numberingIssue', r'Nr\.\s(\d+)'),
            ('numberingPages', r'S\.\s(.+)\s\('),
        )
        for field, pattern in patterns:
            match = re.search(pattern, numbering)
            if match:
                part_of[field] = match.group(1)
    else:
        # The start date can be unset or may not begin with a year; the host
        # document is then dropped below instead of raising an error.
        activities = self.get('provisionActivity') or [{}]
        start_date = activities[0].get('startDate')
        match = re.search(r'^(\d{4})', start_date) if start_date else None
        if match:
            part_of['numberingYear'] = match.group(1)

    # A host document without year is not kept.
    if not part_of.get('numberingYear'):
        return None

    return [part_of]
+
+
@overdo.over('contribution', '^[17]00..')
@utils.ignore_value
def contribution_from_field_100_700(self, key, value):
    """Extract contribution from field 100 or 700.

    The contribution is written directly into the record, which is why the
    function itself always returns None.

    :param self: Record being built; its `contribution` list is extended.
    :param key: Matched field key, used to tell field 100 from field 700.
    :param value: Subfields of the field: $a name, $4 role and $0 identifier.
    :returns: None.
    """
    name = value.get('a')
    if not name:
        return None

    role = value.get('4')
    if role == 'aut':
        role = 'cre'
    elif not role:
        # Without $4 the role list would contain None. NOTE(review): the
        # fallback assumes field 100 holds a creator and field 700 a
        # contributor — confirm with the data provider.
        role = 'cre' if key.startswith('100') else 'ctb'

    data = {
        'agent': {
            'type': 'bf:Person',
            'preferred_name': name
        },
        'role': [role]
    }

    # Only identifiers prefixed with "(orcid)" are kept.
    if value.get('0'):
        match = re.search(r'^\(orcid\)(.*)$', value.get('0'))
        if match:
            data['agent']['identifiedBy'] = {
                'type': 'bf:Local',
                'source': 'ORCID',
                'value': match.group(1)
            }

    contribution = self.get('contribution', [])
    contribution.append(data)
    self['contribution'] = contribution

    return None
+
+
def add_provision_activity_start_date(data, date):
    """Set the start date of the publication in the provision activity.

    The first stored `bf:Publication` entry is updated and moved to the end
    of the list; a new entry is created when there is none.

    :param data: Data dictionary.
    :param date: Date to add.
    """
    stored = data.get('provisionActivity', [])

    publication = None
    for index, activity in enumerate(stored):
        if activity['type'] == 'bf:Publication':
            # Detach the publication; it is appended again below.
            publication = stored.pop(index)
            break

    if publication is None:
        publication = {'type': 'bf:Publication', 'startDate': None}

    publication['startDate'] = date
    stored.append(publication)

    # The list is re-assigned as it may not be stored in the data yet.
    data['provisionActivity'] = stored
diff --git a/sonar/modules/documents/loaders/schemas/arodes.py b/sonar/modules/documents/loaders/schemas/arodes.py
new file mode 100644
index 000000000..5964fa6d3
--- /dev/null
+++ b/sonar/modules/documents/loaders/schemas/arodes.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Arodes schema."""
+
+from marshmallow import fields, pre_dump
+
+from sonar.modules.documents.dojson.arodes.model import overdo
+
+from .marc21 import Marc21Schema
+
+
class ArodesSchema(Marc21Schema):
    """Marshmallow schema for records harvested from ArODES.

    The conversion itself is delegated to the ArODES `overdo` transformation;
    the fields below describe the data which is dumped.
    """

    identifiedBy = fields.List(fields.Dict())
    title = fields.List(fields.Dict())
    documentType = fields.Str()
    language = fields.List(fields.Dict())
    abstracts = fields.List(fields.Dict())
    oa_status = fields.Str()
    provisionActivity = fields.List(fields.Dict())
    subjects = fields.List(fields.Dict())
    dissertation = fields.Dict()
    partOf = fields.List(fields.Dict())
    contribution = fields.List(fields.Dict())

    @pre_dump
    def process(self, obj, **kwargs):
        """Transform the record with the ArODES rules before dumping.

        :param obj: Record to transform.
        :returns: Result of the `overdo` transformation.
        """
        transformed = overdo.do(obj)

        return transformed
diff --git a/sonar/modules/documents/loaders/schemas/dc.py b/sonar/modules/documents/loaders/schemas/dc.py
new file mode 100644
index 000000000..9eda82c20
--- /dev/null
+++ b/sonar/modules/documents/loaders/schemas/dc.py
@@ -0,0 +1,241 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Dublin core schema."""
+
+import re
+
+import xmltodict
+from marshmallow import Schema, fields, post_dump, pre_dump
+
+from sonar.modules.pdf_extractor.utils import force_list
+
# Document type for each value of the `dc:type` element. The 'Other' entry is
# the fallback used by `DublinCoreSchema.get_document_type`.
TYPE_MAPPINGS = {
    # Books
    'Book': 'coar:c_2f33',
    'Book Section': 'coar:c_3248',
    # Conferences
    'Conference': 'coar:c_c94f',
    'Workshop Item': 'coar:c_c94f',
    # Data
    'Research Data': 'coar:c_ddb1',
    # Articles
    'Article': 'coar:c_6501',
    'Newspaper': 'coar:c_998f',
    'Magazine Article': 'coar:c_998f',
    # Non textual material
    'Audiovisual Material & Event': 'non_textual_object',
    # Other texts
    'Preprint': 'coar:c_816b',
    'Thesis': 'coar:c_db06',
    'Working Paper': 'coar:c_8042',
    # Fallback
    'Other': 'coar:c_1843',
}
+
+
class DublinCoreSchema(Schema):
    """Dublin Core marshmallow schema.

    Converts a harvested record, received as an XML string holding an
    `oai_dc:dc` element, into document data. Each output field is computed by
    the `get_*` method named in its `fields.Method` declaration.
    """

    identifiedBy = fields.Method('get_identifiers')
    language = fields.Method('get_language')
    title = fields.Method('get_title')
    provisionActivity = fields.Method('get_provision_activity')
    documentType = fields.Method('get_document_type')
    abstracts = fields.Method('get_abstracts')
    subjects = fields.Method('get_subjects')
    contribution = fields.Method('get_contribution')

    @staticmethod
    def _values(obj, key):
        """Return the non-empty values of a possibly repeated element.

        A repeated XML element is parsed as a list, a single one as a scalar
        and an empty one as None; all cases are normalized to a list here.

        :param obj: Parsed Dublin Core record.
        :param key: Element name, for example `dc:title`.
        :returns: List of values, empty when the element is missing.
        """
        return [item for item in force_list(obj.get(key) or []) if item]

    def dump(self, obj, **kwargs):
        """Serialize an object to native Python data types.

        :param obj: The object to serialize, as an XML string.
        :param kwargs: Keyword arguments forwarded to `Schema.dump`.
        :returns: Serialized data, or None when the record holds no
            `oai_dc:dc` metadata.
        """
        result = xmltodict.parse(obj)

        # Empty XML elements are parsed as None, hence the `or {}` guards.
        metadata = (result.get('record') or {}).get('metadata') or {}
        record = metadata.get('oai_dc:dc')

        if not record:
            return None

        record['id'] = result['record']['header']['identifier']

        return super().dump(record, **kwargs)

    @pre_dump
    def store_language(self, item, **kwargs):
        """Store the normalized language codes in `item['languages']`.

        The codes `deu` and `fra` are replaced by `ger` and `fre`; `eng` is
        used when the record has no language.

        :param item: Parsed Dublin Core record.
        :returns: The record, with the `languages` key added.
        """
        languages = []

        for language in self._values(item, 'dc:language'):
            if language == 'deu':
                language = 'ger'
            elif language == 'fra':
                language = 'fre'

            languages.append(language)

        item['languages'] = languages or ['eng']

        return item

    @post_dump
    def remove_empty_values(self, data, **kwargs):
        """Remove empty values before dumping data."""
        return {key: value for key, value in data.items() if value}

    def get_identifiers(self, obj):
        """Get identifiers.

        The record identifier is always returned as a local identifier; each
        `dc:identifier` is then classified as DOI, PMID, URN or generic
        identifier according to its prefix.

        :param obj: Parsed Dublin Core record.
        :returns: List of identifiers.
        """
        identifiers = [{
            'type': 'bf:Local',
            'source': 'edoc',
            'value': obj['id']
        }]

        for identifier in self._values(obj, 'dc:identifier'):
            # DOI
            match = re.search(r'^info:doi\/(.+)$', identifier)
            if match:
                identifiers.append({'type': 'bf:Doi', 'value': match.group(1)})
                continue

            # PMID
            match = re.search(r'^info:pmid\/(.+)$', identifier)
            if match:
                identifiers.append({
                    'type': 'bf:Local',
                    'value': match.group(1),
                    'source': 'PMID'
                })
                continue

            # URN
            match = re.search(r'^urn:(.+)$', identifier)
            if match:
                identifiers.append({'type': 'bf:Urn', 'value': match.group(1)})
                continue

            # Other identifier
            identifiers.append({'type': 'bf:Identifier', 'value': identifier})

        return identifiers

    def get_language(self, obj):
        """Get language.

        :param obj: Parsed Dublin Core record.
        :returns: One language entry per code stored by `store_language`.
        """
        return [{
            'type': 'bf:Language',
            'value': item
        } for item in obj['languages']]

    def get_title(self, obj):
        """Get title.

        A title containing " : " is split into main title and subtitle. When
        `dc:title` is repeated, the first value is used.

        :param obj: Parsed Dublin Core record.
        :returns: One-item list holding the title.
        """
        main_title = 'Default title'
        subtitle = None
        language = obj['languages'][0]

        titles = self._values(obj, 'dc:title')
        if titles:
            # Title + subtitle
            match = re.search(r'^(.+)\s:\s(.+)$', titles[0])
            if match:
                main_title, subtitle = match.group(1), match.group(2)
            else:
                main_title = titles[0]

        title = {
            'type': 'bf:Title',
            'mainTitle': [{
                'value': main_title,
                'language': language
            }]
        }

        if subtitle:
            title['subtitle'] = [{'value': subtitle, 'language': language}]

        return [title]

    def get_provision_activity(self, obj):
        """Get provision activity.

        The first `dc:date` made of a four-digit year is used as start date.

        :param obj: Parsed Dublin Core record.
        :returns: One-item list holding the publication, or None when no
            date has the expected format.
        """
        for date in self._values(obj, 'dc:date'):
            if isinstance(date, str) and re.search(r'^[0-9]{4}$', date):
                return [{'type': 'bf:Publication', 'startDate': date}]

        return None

    def get_document_type(self, obj):
        """Get document type.

        :param obj: Parsed Dublin Core record.
        :returns: Mapping of the first known `dc:type`, or the mapping of
            'Other' when none is known.
        """
        for document_type in force_list(obj.get('dc:type', [])):
            if TYPE_MAPPINGS.get(document_type):
                return TYPE_MAPPINGS[document_type]

        return TYPE_MAPPINGS['Other']

    def get_abstracts(self, obj):
        """Get abstracts.

        :param obj: Parsed Dublin Core record.
        :returns: One abstract per `dc:description`, in the first language of
            the record, or None when there is no description.
        """
        language = obj['languages'][0]

        abstracts = [{
            'language': language,
            'value': description
        } for description in self._values(obj, 'dc:description')]

        return abstracts or None

    def get_subjects(self, obj):
        """Get subjects.

        :param obj: Parsed Dublin Core record.
        :returns: One-item list grouping all `dc:subject` values under the
            first language of the record, or an empty list.
        """
        subjects = self._values(obj, 'dc:subject')

        if not subjects:
            return []

        return [{
            'label': {
                'language': obj['languages'][0],
                'value': subjects
            }
        }]

    def get_contribution(self, obj):
        """Get contribution.

        :param obj: Parsed Dublin Core record.
        :returns: List with the `dc:creator` values as `cre` followed by the
            `dc:contributor` values as `ctb`.
        """
        contributors = []

        for element, role in (('dc:creator', 'cre'), ('dc:contributor',
                                                      'ctb')):
            for name in self._values(obj, element):
                contributors.append({
                    'agent': {
                        'type': 'bf:Person',
                        'preferred_name': name
                    },
                    'role': [role]
                })

        return contributors
diff --git a/sonar/modules/documents/loaders/schemas/edoc.py b/sonar/modules/documents/loaders/schemas/edoc.py
new file mode 100644
index 000000000..f54413d24
--- /dev/null
+++ b/sonar/modules/documents/loaders/schemas/edoc.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Edoc schema."""
+
+from .dc import DublinCoreSchema
+
+
class EdocSchema(DublinCoreSchema):
    """Marshmallow schema for edoc records.

    Nothing is overridden: the records are handled by `DublinCoreSchema`.
    """
diff --git a/sonar/modules/documents/loaders/schemas/factory.py b/sonar/modules/documents/loaders/schemas/factory.py
index b9289484c..083b48d4b 100644
--- a/sonar/modules/documents/loaders/schemas/factory.py
+++ b/sonar/modules/documents/loaders/schemas/factory.py
@@ -18,8 +18,11 @@
"""Factory for creating a loader schema."""
from .archive_ouverte_unige import ArchiveOuverteUnigeSchema
+from .arodes import ArodesSchema
from .boris import BorisSchema
+from .edoc import EdocSchema
from .rerodoc import RerodocSchema
+from .zora import ZoraSchema
class LoaderSchemaFactory():
@@ -28,7 +31,10 @@ class LoaderSchemaFactory():
schemas = {
'rerodoc': RerodocSchema,
'archive_ouverte_unige': ArchiveOuverteUnigeSchema,
- 'boris': BorisSchema
+ 'boris': BorisSchema,
+ 'arodes': ArodesSchema,
+ 'zora': ZoraSchema,
+ 'edoc': EdocSchema
}
@staticmethod
diff --git a/sonar/modules/documents/loaders/schemas/zora.py b/sonar/modules/documents/loaders/schemas/zora.py
new file mode 100644
index 000000000..f48abfcb1
--- /dev/null
+++ b/sonar/modules/documents/loaders/schemas/zora.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""ZORA schema."""
+
+from marshmallow import fields, pre_dump
+
+from sonar.modules.documents.dojson.zora.model import overdo
+
+from .marc21 import Marc21Schema
+
+
class ZoraSchema(Marc21Schema):
    """Marshmallow schema for records harvested from ZORA.

    The conversion itself is delegated to the ZORA `overdo` transformation;
    the fields below describe the data which is dumped.
    """

    identifiedBy = fields.List(fields.Dict())
    title = fields.List(fields.Dict())
    documentType = fields.Str()
    language = fields.List(fields.Dict())
    abstracts = fields.List(fields.Dict())
    provisionActivity = fields.List(fields.Dict())
    dissertation = fields.Dict()
    partOf = fields.List(fields.Dict())
    contribution = fields.List(fields.Dict())

    @pre_dump
    def process(self, obj, **kwargs):
        """Transform the record with the ZORA rules before dumping.

        :param obj: Record to transform.
        :returns: Result of the `overdo` transformation.
        """
        transformed = overdo.do(obj)

        return transformed
diff --git a/sonar/modules/documents/receivers.py b/sonar/modules/documents/receivers.py
index a3f1572ad..8c689a889 100644
--- a/sonar/modules/documents/receivers.py
+++ b/sonar/modules/documents/receivers.py
@@ -73,8 +73,10 @@ def transform_harvested_records(sender=None, records=None, **kwargs):
# Convert from Marc XML to JSON
data = loader_schema.dump(str(harvested_record))
- # Add transformed data to list
- records.append(data)
+ # Avoid to import deleted records
+ if data and data.get('title'):
+ # Add transformed data to list
+ records.append(data)
# Chunk record list and send celery task
for chunk in list(chunks(records, CHUNK_SIZE)):
diff --git a/tests/unit/documents/loaders/test_arodes_loader.py b/tests/unit/documents/loaders/test_arodes_loader.py
new file mode 100644
index 000000000..1aea5e943
--- /dev/null
+++ b/tests/unit/documents/loaders/test_arodes_loader.py
@@ -0,0 +1,981 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Test ArODES record loader."""
+
+import pytest
+
+from sonar.modules.documents.loaders.schemas.arodes import ArodesSchema
+
+
+def test_title():
+ """Test title."""
+ xml = """
+
+
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {}
+
+ xml = """
+
+
+
+
+
+ Art and design as linked data :
+ the LODZ project (Linked Open Data Zurich)
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {
+ 'title': [{
+ 'mainTitle': [{
+ 'language': 'eng',
+ 'value': 'Art and design as linked data :'
+ }],
+ 'subtitle': [{
+ 'language': 'eng',
+ 'value': 'the LODZ project (Linked Open Data Zurich)'
+ }],
+ 'type':
+ 'bf:Title'
+ }]
+ }
+
+
+def test_identifiers():
+ """Test identifiers."""
+ xml = """
+
+
+
+
+ 1972
+
+ DOI
+ 10.15291/libellarium.v9i2.256
+
+
+ DOI
+
+
+ UNKNOWN
+ 1111
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {
+ 'identifiedBy': [
+ {
+ 'source': 'ArODES',
+ 'type': 'bf:Local',
+ 'value': '1972'
+ },
+ {
+ 'type': 'bf:Doi',
+ 'value': '10.15291/libellarium.v9i2.256'
+ },
+ ]
+ }
+
+
+@pytest.mark.parametrize('document_type,result',
+ [(None, None), ('other', 'coar:c_1843'),
+ ('livre', 'coar:c_2f33'),
+ ('chapitre', 'coar:c_3248'),
+ ('conference', 'coar:c_5794'),
+ ('scientifique', 'coar:c_6501'),
+ ('professionnel', 'coar:c_3e5a'),
+ ('rapport', 'coar:c_18ws'),
+ ('THESES', 'coar:c_db06'),
+ ('non-existing', 'coar:c_1843')])
+def test_document_type(document_type, result):
+ """Test document type."""
+ if not document_type:
+ # No 980
+ xml = """
+
+
+
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {}
+
+ # No 980$a
+ xml = """
+
+
+
+
+
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {}
+
+ return
+
+ xml = f"""
+
+
+
+
+
+ {document_type}
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {'documentType': result}
+
+
+def test_language():
+ """Test language."""
+ # No 041
+ xml = """
+
+
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {}
+
+ # No 041$a
+ xml = """
+
+
+
+
+
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {}
+
+ # One language
+ xml = """
+
+
+
+
+
+ eng
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {
+ 'language': [{
+ 'type': 'bf:Language',
+ 'value': 'eng'
+ }]
+ }
+
+ # Multiple 041
+ xml = """
+
+
+
+
+
+ eng
+
+
+ fre
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {
+ 'language': [{
+ 'type': 'bf:Language',
+ 'value': 'eng'
+ }, {
+ 'type': 'bf:Language',
+ 'value': 'fre'
+ }]
+ }
+
+ # Multiple 041$a
+ xml = """
+
+
+
+
+
+ eng
+ fre
+
+
+
+
+
+ """
+ assert ArodesSchema().dump(xml) == {
+ 'language': [{
+ 'type': 'bf:Language',
+ 'value': 'eng'
+ }, {
+ 'type': 'bf:Language',
+ 'value': 'fre'
+ }]
+ }
+
+
+def test_abstracts():
+ """Test abstracts."""
+ # No 520
+ xml = '''
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # No 520$a
+ xml = '''
+
+
+
+
+
+ fre
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # No language
+ xml = '''
+
+
+
+
+
+ La Convention relative
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'abstracts': [{
+ 'language': 'eng',
+ 'value': 'La Convention relative'
+ }]
+ }
+
+ # One abstract
+ xml = '''
+
+
+
+
+
+ fre
+ La Convention relative
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'abstracts': [{
+ 'language': 'fre',
+ 'value': 'La Convention relative'
+ }]
+ }
+
+ # Multiple abstracts
+ xml = '''
+
+
+
+
+
+ fre
+ La Convention relative
+
+
+ eng
+ The Convention
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'abstracts': [{
+ 'language': 'fre',
+ 'value': 'La Convention relative'
+ }, {
+ 'language': 'eng',
+ 'value': 'The Convention'
+ }]
+ }
+
+
+def test_oa_status():
+ """Test OA status."""
+ # No 906
+ xml = '''
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # No 906$a
+ xml = '''
+
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # Value NONE
+ xml = '''
+
+
+
+
+
+ NONE
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # OK
+ xml = '''
+
+
+
+
+
+ GOLD
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {'oa_status': 'gold'}
+
+
+def test_date():
+ """Test Date."""
+ # No 269$a, no 260$c
+ xml = '''
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # 269, but no $a
+ xml = '''
+
+
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # 269$a, but wrong format.
+ xml = '''
+
+
+
+
+
+ wrong
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # 269$a OK
+ xml = '''
+
+
+
+
+
+ 2019-01
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'provisionActivity': [{
+ 'startDate': '2019-01-01',
+ 'type': 'bf:Publication'
+ }]
+ }
+
+ # 260, but no $c
+ xml = '''
+
+
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # 260$c, but wrong format.
+ xml = '''
+
+
+
+
+
+ wrong
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # 260$c OK
+ xml = '''
+
+
+
+
+
+ 2019-01
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'provisionActivity': [{
+ 'startDate': '2019-01-01',
+ 'type': 'bf:Publication'
+ }]
+ }
+
+ # 269$a and 260$c, 269 has priority
+ xml = '''
+
+
+
+
+
+ 2020-01
+
+
+ 2019-01
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'provisionActivity': [{
+ 'startDate': '2019-01-01',
+ 'type': 'bf:Publication'
+ }]
+ }
+
+
+def test_subjects():
+ """Test subjects."""
+ # No 653
+ xml = '''
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # 653 but no $a
+ xml = '''
+
+
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # OK, but no language --> default language `eng`
+ xml = '''
+
+
+
+
+
+ subject 1
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'subjects': [{
+ 'label': {
+ 'language': 'eng',
+ 'value': ['subject 1']
+ }
+ }]
+ }
+
+ # OK
+ xml = '''
+
+
+
+
+
+ sujet 1
+ fre
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'subjects': [{
+ 'label': {
+ 'language': 'fre',
+ 'value': ['sujet 1']
+ }
+ }]
+ }
+
+ # Multiple subjects
+ xml = '''
+
+
+
+
+
+ sujet 1
+ fre
+
+
+ sujet 2
+ fre
+
+
+ subject 1
+ eng
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'subjects': [{
+ 'label': {
+ 'language': 'fre',
+ 'value': ['sujet 1', 'sujet 2']
+ }
+ }, {
+ 'label': {
+ 'language': 'eng',
+ 'value': ['subject 1']
+ }
+ }]
+ }
+
+
+def test_dissertation():
+ """Test dissertation."""
+ # OK
+ xml = '''
+
+
+
+
+
+ Dissertation degree
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'dissertation': {
+ 'degree': 'Dissertation degree'
+ }
+ }
+
+ # No 502
+ xml = '''
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # 502, but no $b
+ xml = '''
+
+
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+
+def test_host_document():
+ """Test host document."""
+ # No 773
+ xml = '''
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # No 773$t
+ xml = '''
+
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # No $g, no provision activity start date
+ xml = '''
+
+
+
+
+
+ Host document
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # No $g, with provision activity start date
+ xml = '''
+
+
+
+
+
+ 2019-01
+
+
+ Host document
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'partOf': [{
+ 'document': {
+ 'title': 'Host document'
+ },
+ 'numberingYear': '2019'
+ }],
+ 'provisionActivity': [{
+ 'startDate': '2019-01-01',
+ 'type': 'bf:Publication'
+ }]
+ }
+
+ # OK
+ xml = '''
+
+
+
+
+
+ Host document
+ 2015, vol. 37, no. 2, pp. 49-58
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'partOf': [{
+ 'document': {
+ 'title': 'Host document'
+ },
+ 'numberingYear': '2015',
+ 'numberingVolume': '37',
+ 'numberingIssue': '2',
+ 'numberingPages': '49-58'
+ }]
+ }
+
+
+def test_contribution():
+ """Test contribution."""
+ # No 700
+ xml = '''
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # No 700$a
+ xml = '''
+
+
+
+
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {}
+
+ # OK
+ xml = '''
+
+
+
+
+
+ John Doe
+ RERO
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'contribution': [{
+ 'agent': {
+ 'preferred_name': 'John Doe',
+ 'type': 'bf:Person'
+ },
+ 'role': ['ctb'],
+ 'affiliation': 'RERO'
+ }]
+ }
+
+ # Multiple
+ xml = '''
+
+
+
+
+
+ John Doe
+ RERO
+
+
+ Marc Landers
+ HES-SO Valais
+
+
+
+
+
+ '''
+ assert ArodesSchema().dump(xml) == {
+ 'contribution': [{
+ 'agent': {
+ 'preferred_name': 'John Doe',
+ 'type': 'bf:Person'
+ },
+ 'role': ['ctb'],
+ 'affiliation': 'RERO'
+ }, {
+ 'agent': {
+ 'preferred_name': 'Marc Landers',
+ 'type': 'bf:Person'
+ },
+ 'role': ['ctb'],
+ 'affiliation': 'HES-SO Valais'
+ }]
+ }
diff --git a/tests/unit/documents/loaders/test_edoc_loader.py b/tests/unit/documents/loaders/test_edoc_loader.py
new file mode 100644
index 000000000..a90aef338
--- /dev/null
+++ b/tests/unit/documents/loaders/test_edoc_loader.py
@@ -0,0 +1,542 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Test edoc record loader."""
+
+import pytest
+
+from sonar.modules.documents.loaders.schemas.edoc import EdocSchema
+
+
+def test_no_record_metadata():
+ """Test when no record data exists."""
+ xml = """
+
+
+
+ """
+ assert not EdocSchema().dump(xml)
+
+
+def test_language():
+ """Test language."""
+ # No language --> default
+ xml = """
+
+
+
+
+ Title
+
+
+
+ """
+ assert EdocSchema().dump(xml)['language'] == [{
+ 'type': 'bf:Language',
+ 'value': 'eng'
+ }]
+
+ # One language
+ xml = """
+
+
+
+
+ deu
+
+
+
+ """
+ assert EdocSchema().dump(xml)['language'] == [{
+ 'type': 'bf:Language',
+ 'value': 'ger'
+ }]
+
+ # Multiple languages
+ xml = """
+
+
+
+
+ deu
+ fra
+ eng
+
+
+
+ """
+ assert EdocSchema().dump(xml)['language'] == [{
+ 'type': 'bf:Language',
+ 'value': 'ger'
+ }, {
+ 'type': 'bf:Language',
+ 'value': 'fre'
+ }, {
+ 'type': 'bf:Language',
+ 'value': 'eng'
+ }]
+
+
+def test_identifiers():
+ """Test identifiers."""
+ # No specific identifiers
+ xml = """
+
+
+
+
+ Title
+
+
+
+ """
+ assert EdocSchema().dump(xml)['identifiedBy'] == [{
+ 'type': 'bf:Local',
+ 'source': 'edoc',
+ 'value': '123456'
+ }]
+
+ # All identifiers
+ xml = """
+
+
+
+
+ specific-id
+ info:doi/10.5451/unibas-001565177
+ info:pmid/1111
+ urn:urn:nbn:ch:bel-bau-diss47638
+
+
+
+ """
+ assert EdocSchema().dump(xml)['identifiedBy'] == [{
+ 'source': 'edoc',
+ 'type': 'bf:Local',
+ 'value': '123456'
+ }, {
+ 'type': 'bf:Identifier',
+ 'value': 'specific-id'
+ }, {
+ 'type':
+ 'bf:Doi',
+ 'value':
+ '10.5451/unibas-001565177'
+ }, {
+ 'source': 'PMID',
+ 'type': 'bf:Local',
+ 'value': '1111'
+ }, {
+ 'type':
+ 'bf:Urn',
+ 'value':
+ 'urn:nbn:ch:bel-bau-diss47638'
+ }]
+
+
+def test_title():
+ """Test title."""
+ # No title --> default one
+ xml = """
+
+
+
+
+ Description
+
+
+
+ """
+ assert EdocSchema().dump(xml)['title'] == [{
+ 'type':
+ 'bf:Title',
+ 'mainTitle': [{
+ 'value': 'Default title',
+ 'language': 'eng'
+ }]
+ }]
+
+ # Only title
+ xml = """
+
+
+
+
+ Title
+
+
+
+ """
+ assert EdocSchema().dump(xml)['title'] == [{
+ 'type':
+ 'bf:Title',
+ 'mainTitle': [{
+ 'value': 'Title',
+ 'language': 'eng'
+ }]
+ }]
+
+ # Title + subtitle
+ xml = """
+
+
+
+
+ Title : Subtitle
+
+
+
+ """
+ assert EdocSchema().dump(xml)['title'] == [{
+ 'type':
+ 'bf:Title',
+ 'mainTitle': [{
+ 'value': 'Title',
+ 'language': 'eng'
+ }],
+ 'subtitle': [{
+ 'value': 'Subtitle',
+ 'language': 'eng'
+ }]
+ }]
+
+
+def test_provision_activity():
+ """Test provision activity."""
+ # No provision activity
+ xml = """
+
+
+
+
+ Description
+
+
+
+ """
+ assert 'provisionActivity' not in EdocSchema().dump(xml)
+
+ # Wrong date format
+ xml = """
+
+
+
+
+ wrong
+
+
+
+ """
+ assert 'provisionActivity' not in EdocSchema().dump(xml)
+
+ # OK
+ xml = """
+
+
+
+
+ 2019
+
+
+
+ """
+ assert EdocSchema().dump(xml)['provisionActivity'] == [{
+ 'type': 'bf:Publication',
+ 'startDate': '2019'
+ }]
+
+
+def test_document_type():
+ """Test document type."""
+ # No document type --> other
+ xml = """
+
+
+
+
+ Title
+
+
+
+ """
+ assert EdocSchema().dump(xml)['documentType'] == 'coar:c_1843'
+
+ # Multiple, takes only the first
+ xml = """
+
+
+
+
+ Thesis
+ NonPeerReviewed
+
+
+
+ """
+ assert EdocSchema().dump(xml)['documentType'] == 'coar:c_db06'
+
+ # Non-existing, takes "other"
+ xml = """
+
+
+
+
+ Unknown
+
+
+
+ """
+ assert EdocSchema().dump(xml)['documentType'] == 'coar:c_1843'
+
+
+@pytest.mark.parametrize(
+ 'document_type,result',
+ [('Book', 'coar:c_2f33'), ('Book Section', 'coar:c_3248'),
+ ('Conference', 'coar:c_c94f'), ('Workshop Item', 'coar:c_c94f'),
+ ('Research Data', 'coar:c_ddb1'), ('Article', 'coar:c_6501'),
+ ('Newspaper', 'coar:c_998f'), ('Magazine Article', 'coar:c_998f'),
+ ('Audiovisual Material & Event', 'non_textual_object'),
+ ('Preprint', 'coar:c_816b'), ('Thesis', 'coar:c_db06'),
+ ('Working Paper', 'coar:c_8042'), ('Other', 'coar:c_1843')])
+def test_document_type_mappings(document_type, result):
+ """Test document type mappings."""
+ xml = f"""
+
+
+
+
+ {document_type}
+
+
+
+ """
+ assert EdocSchema().dump(xml)['documentType'] == result
+
+
+def test_abstracts():
+ """Test abstracts."""
+ # No abstract
+ xml = """
+
+
+
+
+ Title
+
+
+
+ """
+ assert 'abstracts' not in EdocSchema().dump(xml)
+
+ # One abstract
+ xml = """
+
+
+
+
+ Description
+
+
+
+ """
+ assert EdocSchema().dump(xml)['abstracts'] == [{
+ 'language': 'eng',
+ 'value': 'Description'
+ }]
+
+
+def test_subjects():
+ """Test subjects."""
+ # No subject
+ xml = """
+
+
+
+
+ Title
+
+
+
+ """
+ assert 'subjects' not in EdocSchema().dump(xml)
+
+ # One subject
+ xml = """
+
+
+
+
+ Subject 1
+
+
+
+ """
+ assert EdocSchema().dump(xml)['subjects'] == [{
+ 'label': {
+ 'language': 'eng',
+ 'value': ['Subject 1']
+ }
+ }]
+
+ # Multiple subjects
+ xml = """
+
+
+
+
+ Subject 1
+ Subject 2
+
+
+
+ """
+ assert EdocSchema().dump(xml)['subjects'] == [{
+ 'label': {
+ 'language': 'eng',
+ 'value': ['Subject 1', 'Subject 2']
+ }
+ }]
+
+
+def test_contribution():
+ """Test contribution."""
+ # No contribution
+ xml = """
+
+
+
+
+ Title
+
+
+
+ """
+ assert 'contribution' not in EdocSchema().dump(xml)
+
+ # OK, one creator, multiple contributors
+ xml = """
+
+
+
+
+ Creator
+ Contributor 1
+ Contributor 2
+
+
+
+ """
+ assert EdocSchema().dump(xml)['contribution'] == [{
+ 'agent': {
+ 'type': 'bf:Person',
+ 'preferred_name': 'Creator'
+ },
+ 'role': ['cre']
+ }, {
+ 'agent': {
+ 'type': 'bf:Person',
+ 'preferred_name': 'Contributor 1'
+ },
+ 'role': ['ctb']
+ }, {
+ 'agent': {
+ 'type': 'bf:Person',
+ 'preferred_name': 'Contributor 2'
+ },
+ 'role': ['ctb']
+ }]
+
+ # OK, multiple creators, one contributor
+ xml = """
+
+
+
+
+ Creator 1
+ Creator 2
+ Contributor
+
+
+
+ """
+ assert EdocSchema().dump(xml)['contribution'] == [{
+ 'agent': {
+ 'type': 'bf:Person',
+ 'preferred_name': 'Creator 1'
+ },
+ 'role': ['cre']
+ }, {
+ 'agent': {
+ 'type': 'bf:Person',
+ 'preferred_name': 'Creator 2'
+ },
+ 'role': ['cre']
+ }, {
+ 'agent': {
+ 'type': 'bf:Person',
+ 'preferred_name': 'Contributor'
+ },
+ 'role': ['ctb']
+ }]
diff --git a/tests/unit/documents/loaders/test_zora_loader.py b/tests/unit/documents/loaders/test_zora_loader.py
new file mode 100644
index 000000000..65f662732
--- /dev/null
+++ b/tests/unit/documents/loaders/test_zora_loader.py
@@ -0,0 +1,712 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Test ZORA record loader."""
+
+import pytest
+
+from sonar.modules.documents.loaders.schemas.zora import ZoraSchema
+
+
+def test_title():
+ """Test title."""
+ xml = """
+
+
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {}
+
+ xml = """
+
+
+
+
+
+ Art and design as linked data :
+ the LODZ project (Linked Open Data Zurich)
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {
+ 'title': [{
+ 'mainTitle': [{
+ 'language': 'eng',
+ 'value': 'Art and design as linked data :'
+ }],
+ 'subtitle': [{
+ 'language': 'eng',
+ 'value': 'the LODZ project (Linked Open Data Zurich)'
+ }],
+ 'type':
+ 'bf:Title'
+ }]
+ }
+
+
+def test_identifiers():
+ """Test identifiers."""
+ xml = """
+
+
+
+
+ 1972
+
+ doi
+ 10.15291/libellarium.v9i2.256
+
+
+ doi
+
+
+ pmid
+ 2222
+
+
+ UNKNOWN
+ 1111
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {
+ 'identifiedBy': [{
+ 'source': 'ZORA',
+ 'type': 'bf:Local',
+ 'value': '1972'
+ }, {
+ 'type': 'bf:Doi',
+ 'value': '10.15291/libellarium.v9i2.256'
+ }, {
+ 'type': 'bf:Local',
+ 'value': '2222',
+ 'source': 'PMID'
+ }, {
+ 'type': 'bf:Identifier',
+ 'value': '1111'
+ }]
+ }
+
+
+@pytest.mark.parametrize('type, value, result, dissertation', [
+ (None, None, None, None),
+ ('local', 'Herausgegebenes wissenschaftliches Werk', 'coar:c_2f33', None),
+ ('local', 'Monografie', 'coar:c_2f33', None),
+ ('local', 'Buchkapitel', 'coar:c_3248', None),
+ ('local', 'Konferenzbeitrag', 'coar:c_5794', None),
+ ('local', 'Artikel', 'coar:c_6501', None),
+ ('local', 'Zeitungsartikel', 'coar:c_998f', None),
+ ('gnd-content', 'Forschungsbericht', 'coar:c_18ws', None),
+ ('gnd-content', 'Hochschulschrift', 'coar:c_db06', 'Dissertation'),
+ ('gnd-content', 'Hochschulschrift', 'coar:c_bdcc', 'Masterarbeit'),
+ ('gnd-content', 'Hochschulschrift', 'habilitation_thesis', 'Habilitation'),
+ ('local', 'Working Paper', 'coar:c_8042', None),
+ ('local', 'non-existing', 'coar:c_1843', None)
+])
+def test_document_type(type, value, result, dissertation):
+ """Test document type."""
+ if not type:
+ # No 655
+ xml = """
+
+
+
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {}
+
+ # No 655$a
+ xml = """
+
+
+
+
+
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {}
+
+ # No 655$2
+ xml = """
+
+
+
+
+
+ Doc type
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {}
+
+ return
+
+ xml = f"""
+
+
+
+
+
+ {dissertation}
+
+
+ {value}
+ {type}
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml)['documentType'] == result
+
+
+def test_language():
+ """Test language."""
+ # No 041
+ xml = """
+
+
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {}
+
+ # No 041$a
+ xml = """
+
+
+
+
+
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {}
+
+ # One language
+ xml = """
+
+
+
+
+
+ eng
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {
+ 'language': [{
+ 'type': 'bf:Language',
+ 'value': 'eng'
+ }]
+ }
+
+ # Multiple 041
+ xml = """
+
+
+
+
+
+ eng
+
+
+ fre
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {
+ 'language': [{
+ 'type': 'bf:Language',
+ 'value': 'eng'
+ }, {
+ 'type': 'bf:Language',
+ 'value': 'fre'
+ }]
+ }
+
+ # Multiple 041$a
+ xml = """
+
+
+
+
+
+ eng
+ fre
+
+
+
+
+
+ """
+ assert ZoraSchema().dump(xml) == {
+ 'language': [{
+ 'type': 'bf:Language',
+ 'value': 'eng'
+ }, {
+ 'type': 'bf:Language',
+ 'value': 'fre'
+ }]
+ }
+
+
+def test_abstracts():
+ """Test abstracts."""
+ # No 520
+ xml = '''
+
+
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+ # No 520$a
+ xml = '''
+
+
+
+
+
+ fre
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+ # No language
+ xml = '''
+
+
+
+
+
+ La Convention relative
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {
+ 'abstracts': [{
+ 'language': 'eng',
+ 'value': 'La Convention relative'
+ }]
+ }
+
+ # One abstract
+ xml = '''
+
+
+
+
+
+ fre
+ La Convention relative
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {
+ 'abstracts': [{
+ 'language': 'fre',
+ 'value': 'La Convention relative'
+ }]
+ }
+
+ # Multiple abstracts
+ xml = '''
+
+
+
+
+
+ fre
+ La Convention relative
+
+
+ eng
+ The Convention
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {
+ 'abstracts': [{
+ 'language': 'fre',
+ 'value': 'La Convention relative'
+ }, {
+ 'language': 'eng',
+ 'value': 'The Convention'
+ }]
+ }
+
+
+def test_date():
+ """Test Date."""
+ # No 264$c
+ xml = '''
+
+
+
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+ # 264$c, but wrong format.
+ xml = '''
+
+
+
+
+
+ wrong
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+ # 264$c OK
+ xml = '''
+
+
+
+
+
+ 2019
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {
+ 'provisionActivity': [{
+ 'startDate': '2019',
+ 'type': 'bf:Publication'
+ }]
+ }
+
+
+def test_dissertation():
+ """Test dissertation."""
+ # OK
+ xml = '''
+
+
+
+
+
+ Dissertation degree
+ Universität Zürich
+ 2007
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {
+ 'dissertation': {
+ 'degree': 'Dissertation degree',
+ 'grantingInstitution': 'Universität Zürich',
+ 'date': '2007'
+ }
+ }
+
+ # No 502
+ xml = '''
+
+
+
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+ # 502, but no $b
+ xml = '''
+
+
+
+
+
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+
+def test_host_document():
+ """Test host document."""
+ # No 773
+ xml = '''
+
+
+
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+ # No 773$t
+ xml = '''
+
+
+
+
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+ # No $g, no provision activity start date
+ xml = '''
+
+
+
+
+
+ Host document
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {}
+
+ # No $g, with provision activity start date
+ xml = '''
+
+
+
+
+
+ 2019
+
+
+ Host document
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {
+ 'partOf': [{
+ 'document': {
+ 'title': 'Host document'
+ },
+ 'numberingYear': '2019'
+ }],
+ 'provisionActivity': [{
+ 'startDate': '2019',
+ 'type': 'bf:Publication'
+ }]
+ }
+
+ # OK
+ xml = '''
+
+
+
+
+
+ Host document
+ Bd. 16, Nr. 3, S. 411-413 (2002)
+
+
+
+
+
+ '''
+ assert ZoraSchema().dump(xml) == {
+ 'partOf': [{
+ 'document': {
+ 'title': 'Host document'
+ },
+ 'numberingYear': '2002',
+ 'numberingVolume': '16',
+ 'numberingIssue': '3',
+ 'numberingPages': '411-413'
+ }]
+ }
+
+
+def test_contribution_from_field_100():
+ """Test extracting contribution from field 100."""
+ # OK
+ xml = """
+
+
+ Romagnani, Andrea
+ VerfasserIn
+ aut
+ (orcid)0000-0003-3669-3497
+
+
+ """
+ data = ZoraSchema().dump(xml)
+ assert data.get('contribution') == [{
+ 'agent': {
+ 'type': 'bf:Person',
+ 'preferred_name': 'Romagnani, Andrea',
+ 'identifiedBy': {
+ 'type': 'bf:Local',
+ 'source': 'ORCID',
+ 'value': '0000-0003-3669-3497'
+ }
+ },
+ 'role': ['cre']
+ }]
+
+ # No $a
+ xml = """
+
+
+
+
+ """
+ data = ZoraSchema().dump(xml)
+ assert not data.get('contribution')
+
+
+def test_contribution_from_field_700():
+ """Test extracting contribution from field 700."""
+ # OK, with bad ORCID
+ xml = """
+
+
+ Romagnani, Andrea
+ AkademischeR BetreuerIn
+ dgs
+ non-orcid
+
+
+ """
+ data = ZoraSchema().dump(xml)
+ assert data.get('contribution') == [{
+ 'agent': {
+ 'type': 'bf:Person',
+ 'preferred_name': 'Romagnani, Andrea'
+ },
+ 'role': ['dgs']
+ }]
+
+ # No $a
+ xml = """
+
+
+
+
+ """
+ data = ZoraSchema().dump(xml)
+ assert not data.get('contribution')