Skip to content

Commit

Permalink
PDF extractor: Affiliation extraction
Browse files Browse the repository at this point in the history
* Extracts the affiliation associated with each author.
* Fixes an error when the "analytic" property exists but is empty.
* Fixes #148.

Co-Authored-by: Sébastien Délèze <sebastien.deleze@rero.ch>
  • Loading branch information
Sébastien Délèze committed Feb 21, 2020
1 parent 822a521 commit 1567e3e
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 278 deletions.
4 changes: 1 addition & 3 deletions sonar/modules/pdf_extractor/pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,7 @@ def extract_metadata(self, file):
files={
'input':
(file, open(file, 'rb'),
'application/pdf'),
'consolidateHeader':
'1'
'application/pdf')
})

if status != 200:
Expand Down
26 changes: 20 additions & 6 deletions sonar/modules/pdf_extractor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,15 @@ def format_extracted_data(data):
else:
formatted_data['languages'] = [language.alpha_3]

authors = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'].get(
'analytic', {}).get('author')
analytic = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'].get(
'analytic', {})

if authors:
authors = force_list(authors)
if analytic and analytic.get('author'):
authors = force_list(analytic.get('author'))

for author in authors:
author_data = {}

if author.get('persName'):
name = []
surname = author['persName'].get('surname')
Expand All @@ -85,8 +87,20 @@ def format_extracted_data(data):

name = name + [forename['#text'] for forename in forenames]

formatted_data.setdefault('authors',
[]).append({'name': ' '.join(name)})
author_data['name'] = ' '.join(name)

if author_data.get('name'):
affiliations = force_list(author.get('affiliation', []))

if affiliations:
affiliation = affiliations[0]
organisations = force_list(affiliation.get('orgName', []))

for organisation in organisations:
if organisation.get('@type') == 'institution':
author_data['affiliation'] = organisation['#text']

formatted_data.setdefault('authors', []).append(author_data)

imprint = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'monogr'].get('imprint', {})
Expand Down
205 changes: 0 additions & 205 deletions tests/ui/deposits/data/extracted_data.json

This file was deleted.

4 changes: 2 additions & 2 deletions tests/ui/deposits/test_deposits_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@

def test_populate_with_pdf_metadata(app):
"""Test populate deposit with metadata."""
json_file = os.path.dirname(
os.path.abspath(__file__)) + '/data/extracted_data.json'
json_file = os.path.dirname(os.path.abspath(
__file__)) + '/../pdf_extractor/data/extracted_data.json'

with open(json_file, 'rb') as file:
pdf_metadata = format_extracted_data(json.load(file))
Expand Down
Loading

0 comments on commit 1567e3e

Please sign in to comment.