PDF extractor: Affiliation extraction

* Extracts affiliations associated with authors. * Fixes an error when the "analytic" property exists but is empty. * Fixes #148. Co-Authored-by: Sébastien Délèze <sebastien.deleze@rero.ch>
rero · Feb 21, 2020 · 1567e3e · 1567e3e
1 parent 822a521
commit 1567e3e
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 278 deletions.
diff --git a/sonar/modules/pdf_extractor/pdf_extractor.py b/sonar/modules/pdf_extractor/pdf_extractor.py
@@ -137,9 +137,7 @@ def extract_metadata(self, file):
                                            files={
                                                'input':
                                                (file, open(file, 'rb'),
-                                                'application/pdf'),
-                                               'consolidateHeader':
-                                               '1'
+                                                'application/pdf')
                                            })
 
         if status != 200:

diff --git a/sonar/modules/pdf_extractor/utils.py b/sonar/modules/pdf_extractor/utils.py
@@ -66,13 +66,15 @@ def format_extracted_data(data):
             else:
                 formatted_data['languages'] = [language.alpha_3]
 
-    authors = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'].get(
-        'analytic', {}).get('author')
+    analytic = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'].get(
+        'analytic', {})
 
-    if authors:
-        authors = force_list(authors)
+    if analytic and analytic.get('author'):
+        authors = force_list(analytic.get('author'))
 
         for author in authors:
+            author_data = {}
+
             if author.get('persName'):
                 name = []
                 surname = author['persName'].get('surname')
@@ -85,8 +87,20 @@ def format_extracted_data(data):
 
                 name = name + [forename['#text'] for forename in forenames]
 
-                formatted_data.setdefault('authors',
-                                          []).append({'name': ' '.join(name)})
+                author_data['name'] = ' '.join(name)
+
+            if author_data.get('name'):
+                affiliations = force_list(author.get('affiliation', []))
+
+                if affiliations:
+                    affiliation = affiliations[0]
+                    organisations = force_list(affiliation.get('orgName', []))
+
+                    for organisation in organisations:
+                        if organisation.get('@type') == 'institution':
+                            author_data['affiliation'] = organisation['#text']
+
+            formatted_data.setdefault('authors', []).append(author_data)
 
     imprint = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
         'monogr'].get('imprint', {})

diff --git a/tests/ui/deposits/data/extracted_data.json b/tests/ui/deposits/data/extracted_data.json
diff --git a/tests/ui/deposits/test_deposits_api.py b/tests/ui/deposits/test_deposits_api.py
@@ -26,8 +26,8 @@
 
 def test_populate_with_pdf_metadata(app):
     """Test populate deposit with metadata."""
-    json_file = os.path.dirname(
-        os.path.abspath(__file__)) + '/data/extracted_data.json'
+    json_file = os.path.dirname(os.path.abspath(
+        __file__)) + '/../pdf_extractor/data/extracted_data.json'
 
     with open(json_file, 'rb') as file:
         pdf_metadata = format_extracted_data(json.load(file))