document: dublin core improvements

* Adds file download links to the `dc:identifier` elements.
* Adds the `xml:lang` attribute to the description, subject and title fields.
* Closes #661.
* Closes #800.

Co-Authored-by: Bertrand Zuchuat <bertrand.zuchuat@rero.ch>
rero · May 5, 2022 · 396bfc7 · 396bfc7
1 parent de7b33d
commit 396bfc7
Show file tree

Hide file tree

Showing 3 changed files with 197 additions and 20 deletions.
diff --git a/sonar/modules/documents/serializers/dc.py b/sonar/modules/documents/serializers/dc.py
@@ -17,8 +17,8 @@
 
 """Dublin Core serializer."""
 
-from dcxml import simpledc
 from flask_resources.serializers import SerializerMixin
+from lxml import etree
 
 from sonar.modules.documents.serializers.schemas.dc import DublinCoreSchema
 
@@ -38,10 +38,72 @@ def transform_record(self, obj):
     def serialize_object_xml(self, obj):
         """Serialize a single record and persistent identifier to etree.
 
-        :param obj: Record instance
+        :param obj: Record instance.
+        :returns: an etree element.
         """
-        json = self.transform_record(obj["_source"])
-        return simpledc.dump_etree(json)
+        data = self.transform_record(obj["_source"])
+        return self.serialize_dict_to_etree(data)
+
+    def serialize_dict_to_etree(self, data):
+        """Serialize json to etree.
+
+        :param data: record transformed to a dict.
+        :returns: an etree element.
+        """
+        ns = {
+            'dc': 'http://purl.org/dc/elements/1.1/',
+            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
+            'xml': 'xml',
+            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+        }
+        container = '{http://www.openarchives.org/OAI/2.0/oai_dc/}dc'
+        """Default container element."""
+        attrib = {
+            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation':
+            'http://www.openarchives.org/OAI/2.0/oai_dc/ '
+            'http://www.openarchives.org/OAI/2.0/oai_dc.xsd'
+        }
+        """Default container element attributes."""
+        elements = {
+            'contributors': 'contributor',
+            'creators': 'creator',
+            'dates': 'date',
+            'descriptions': 'description',
+            'formats': 'format',
+            'identifiers': 'identifier',
+            'languages': 'language',
+            'publishers': 'publisher',
+            'relations': 'relation',
+            'rights': 'rights',
+            'sources': 'source',
+            'subjects': 'subject',
+            'titles': 'title',
+            'types': 'type'
+        }
+
+        root = etree.Element(container, nsmap=ns, attrib=attrib)
+
+        for key, values in data.items():
+            if key in elements:
+                for value in values:
+                    attrs = {}
+                    if isinstance(value, dict):
+                        val = value['value']
+                        if '@attrs' in value:
+                            for attr in value['@attrs']:
+                                prefix = attr['prefix'] \
+                                    if 'prefix' in attr else 'xml'
+                                attrs[f'{{{prefix}}}{attr["name"]}'] = \
+                                    attr['value']
+                    else:
+                        val = value
+                    field = etree.SubElement(
+                        root,
+                        f'{{http://purl.org/dc/elements/1.1/}}{elements[key]}',
+                        attrs
+                        )
+                    field.text = val
+        return root
 
 
 def sonar_dublin_core(pid, record):

diff --git a/sonar/modules/documents/serializers/schemas/dc.py b/sonar/modules/documents/serializers/schemas/dc.py
@@ -19,7 +19,7 @@
 
 import re
 
-from flask import request
+from flask import current_app, request
 from marshmallow import fields
 
 from sonar.modules.documents.api import DocumentRecord
@@ -46,6 +46,17 @@ class DublinCoreSchema(BaseSchema):
     titles = fields.Method('get_titles')
     types = fields.Method('get_types')
 
+    def translate_language(self, language):
+        """Translate language code ISO-639-3 to ISO-639-2 if possible.
+
+        :param language: language with ISO-639-3 format.
+        :returns: language code ISO-639-2 if possible or ISO-639-3.
+        """
+        langs = current_app.config['SONAR_APP_LANGUAGES_MAP']
+        if language in langs and langs[language]:
+            return langs[language]
+        return language
+
     def get_contributors(self, obj):
         """Get contributors."""
         items = []
@@ -85,7 +96,21 @@ def get_dates(self, obj):
 
     def get_descriptions(self, obj):
         """Get descriptions."""
-        return [file['value'] for file in obj['metadata'].get('abstracts', [])]
+        items = []
+        for abstract in obj['metadata'].get('abstracts', []):
+            if 'language' in abstract:
+                items.append({
+                '@attrs': [{
+                    'prefix':'xml',
+                    'name':'lang',
+                    'value': self.translate_language(abstract['language'])
+                }],
+                'value':abstract['value']
+            })
+            else:
+                items.append(abstract['value'])
+
+        return items
 
     def get_formats(self, obj):
         """Get formats."""
@@ -98,10 +123,27 @@ def get_formats(self, obj):
 
     def get_identifiers(self, obj):
         """Get identifiers."""
-        return [
+        items = [
             DocumentRecord.get_permanent_link(request.host_url,
                                               obj['metadata']['pid'])
         ]
+        # If files on the document
+        if '_files' in obj['metadata']:
+            # Extraction of files only with a type file
+            files = filter(
+                lambda f: ('type' in f and f['type'] == 'file'),
+                obj['metadata']['_files'])
+            # Files sorting
+            files = sorted(files, key=lambda file: file.get('order', 100))
+            # Remove / at the end of host_url
+            host = request.host_url[:-1]
+            # Add the file only if the link is defined in download
+            for file in files:
+                links = file.get('links', {})
+                if 'download' in links and links.get('download'):
+                    items.append(host + links.get('download'))
+
+        return items
 
     def get_languages(self, obj):
         """Get languages."""
@@ -222,7 +264,19 @@ def get_subjects(self, obj):
 
         # Subjects
         for subjects in obj['metadata'].get('subjects', []):
-            items = items + subjects['label']['value']
+            if 'language' in subjects['label']:
+                for value in subjects['label']['value']:
+                    items.append({
+                        '@attrs': [{
+                            'prefix': 'xml',
+                            'name': 'lang',
+                            'value': self.translate_language(
+                                subjects['label']['language'])
+                        }],
+                        'value': value
+                    })
+            else:
+                items = items + subjects['label']['value']
 
         # Classification
         for classification in obj['metadata'].get('classification', []):
@@ -240,12 +294,22 @@ def get_subjects(self, obj):
 
     def get_titles(self, obj):
         """Get titles."""
-        title = [obj['metadata']['title'][0]['mainTitle'][0]['value']]
-
+        title = {
+            '@attrs': [{
+                'prefix': 'xml',
+                'name': 'lang',
+                'value': self.translate_language(
+                    obj['metadata']['title'][0]['mainTitle'][0]['language'])
+            }],
+            'value': obj['metadata']['title'][0]['mainTitle'][0]['value']\
+                .strip()
+        }
         if obj['metadata']['title'][0].get('subtitle'):
-            title.append(obj['metadata']['title'][0]['subtitle'][0]['value'])
+            subtitle = obj['metadata']['title'][0]['subtitle'][0]['value']\
+                .strip()
+            title['value'] = f"{title['value']} : {subtitle}"
 
-        return [' : '.join(title)]
+        return [title]
 
     def get_types(self, obj):
         """Get types."""

diff --git a/tests/ui/documents/test_dc_schema.py b/tests/ui/documents/test_dc_schema.py
@@ -29,11 +29,9 @@
 def minimal_document(db, bucket_location, organisation):
     record = DocumentRecord.create(
         {
-            'pid':
-            '1000',
+            'pid': '1000',
             'title': [{
-                'type':
-                'bf:Title',
+                'type': 'bf:Title',
                 'mainTitle': [{
                     'language': 'eng',
                     'value': 'Title of the document'
@@ -181,6 +179,29 @@ def test_descriptions(minimal_document):
     assert result['descriptions'] == ['Description 1', 'Description 2']
 
 
+def test_descriptions_attributes(minimal_document):
+    result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
+    assert result['descriptions'] == []
+
+    minimal_document['abstracts'] = [{
+        'language': 'fre',
+        'value': 'Description 1'
+    }, {
+        'language': 'ace',
+        'value': 'Description 2'
+    }]
+    result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
+    assert result['descriptions'] == [
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'fr'}],
+            'value': 'Description 1'
+        },
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'ace'}],
+            'value': 'Description 2'
+        }
+    ]
+
 def test_formats(minimal_document):
     result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
     assert result['formats'] == []
@@ -402,7 +423,22 @@ def test_subjects(minimal_document):
     }]
     result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
     assert result['subjects'] == [
-        'Subject 1', 'Subject 2', 'Sujet 1', 'Sujet 2'
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
+            'value': 'Subject 1'
+        },
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
+            'value': 'Subject 2'
+        },
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'fr'}],
+            'value': 'Sujet 1'
+        },
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'fr'}],
+            'value': 'Sujet 2'
+        }
     ]
 
     minimal_document.pop('subjects', None)
@@ -423,7 +459,12 @@ def test_subjects(minimal_document):
 def test_titles(minimal_document):
     """Test titles serialization."""
     result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
-    assert result['titles'] == ['Title of the document']
+    assert result['titles'] == [
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
+            'value': 'Title of the document'
+        }
+    ]
 
     minimal_document['title'] = [{
         'mainTitle': [{
@@ -437,7 +478,12 @@ def test_titles(minimal_document):
         }]
     }]
     result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
-    assert result['titles'] == ['Title 1']
+    assert result['titles'] == [
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
+            'value': 'Title 1'
+        }
+    ]
 
     minimal_document['title'] = [{
         'mainTitle': [{
@@ -450,7 +496,12 @@ def test_titles(minimal_document):
         }]
     }]
     result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
-    assert result['titles'] == ['Title 1 : Subtitle 1']
+    assert result['titles'] == [
+        {
+            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
+            'value': 'Title 1 : Subtitle 1'
+        }
+    ]
 
 
 def test_types(minimal_document):