Skip to content

Commit

Permalink
Merge f86ed63 into 1b3f1fd
Browse files Browse the repository at this point in the history
  • Loading branch information
Sébastien Délèze committed Dec 20, 2019
2 parents 1b3f1fd + f86ed63 commit 967fb32
Show file tree
Hide file tree
Showing 8 changed files with 399 additions and 13 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ python-slugify = "*"
python3-saml = "*"
xmltodict = "*"
marshmallow = "<=3.0.0b6"
pycountry = "*"

[dev-packages]
Flask-Debugtoolbar = ">=0.10.1"
Expand Down
9 changes: 8 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 31 additions & 5 deletions sonar/modules/deposits/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,7 @@ class DepositRecord(SonarRecord):
schema = 'deposit'

@classmethod
def create(cls,
data,
id_=None,
dbcommit=False,
with_bucket=True,
def create(cls, data, id_=None, dbcommit=False, with_bucket=True,
**kwargs):
"""Create deposit record."""
record = super(DepositRecord, cls).create(data,
Expand All @@ -64,3 +60,33 @@ def create(cls,
with_bucket=with_bucket,
**kwargs)
return record

def populate_with_pdf_metadata(self, pdf_metadata, default_title=None):
    """Fill the record from metadata extracted out of a PDF.

    The record's ``metadata`` entry is replaced by a fresh dict, so any
    previously stored metadata is discarded. Author names are appended to
    the record's top-level ``contributors`` list (created when absent),
    not to ``metadata``.

    :param pdf_metadata: dict which may hold the keys ``title``,
        ``languages``, ``authors`` (items with a ``name`` key),
        ``abstract`` and ``journal``.
    :param default_title: title used when ``pdf_metadata`` has no ``title``.
    :returns: the record itself.
    """
    # Start from an empty metadata dict and keep a handle on the stored one.
    self['metadata'] = {}
    metadata = self['metadata']

    metadata['title'] = pdf_metadata.get('title', default_title)

    if 'languages' in pdf_metadata:
        metadata['languages'] = pdf_metadata['languages']

    if 'authors' in pdf_metadata:
        if 'contributors' not in self:
            self['contributors'] = []

        # Only the name of each extracted author is kept.
        self['contributors'].extend(
            {'name': person['name']} for person in pdf_metadata['authors'])

    if 'abstract' in pdf_metadata:
        metadata.setdefault('abstracts', []).append(pdf_metadata['abstract'])

    if 'journal' in pdf_metadata:
        metadata['journal'] = pdf_metadata['journal']

    return self
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
},
"metadata": {
"type": "object",
"required": ["document_type", "languages", "title"],
"required": ["languages", "title"],
"propertiesOrder": [ "document_type", "languages", "title", "publication_date", "journal", "abstracts", "etc" ],
"properties": {
"document_type": {
Expand Down Expand Up @@ -125,12 +125,12 @@
"volume": {
"title": "Volume",
"description": "Volume of the journal.",
"type": "integer"
"type": "string"
},
"number": {
"title": "Number",
"description": "Number of the journal.",
"type": "integer"
"type": "string"
},
"pages": {
"title": "Pages",
Expand All @@ -154,6 +154,7 @@
},
"etc": {
"title": "ETC.",
"default": "",
"type": "string"
}
}
Expand Down Expand Up @@ -276,4 +277,4 @@
}
}
}
}
}
19 changes: 16 additions & 3 deletions sonar/modules/deposits/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
from invenio_rest import ContentNegotiatedMethodView

from sonar.modules.deposits.api import DepositRecord
from sonar.modules.pdf_extractor.pdf_extractor import PDFExtractor
from sonar.modules.pdf_extractor.utils import format_extracted_data


class FilesResource(ContentNegotiatedMethodView):
Expand Down Expand Up @@ -63,16 +65,28 @@ def post(pid=None):
# deposit.files[text_key]['file_type'] = 'full-text'
# deposit.commit()

file_content = BytesIO(request.get_data())

# Store document
deposit.files[key] = BytesIO(request.get_data())
deposit.files[key] = file_content
deposit.files[key]['label'] = re.search(r'(.*)\..*$', key).group(1)
deposit.files[key]['embargo'] = False
deposit.files[key]['embargoDate'] = None
deposit.files[key]['expect'] = False
deposit.files[key]['category'] = request.args['type']
deposit.files[key]['file_type'] = 'file'
deposit.commit()

# Extract data from pdf and populate deposit
if request.args['type'] == 'main':
pdf_extractor = PDFExtractor()
pdf_metadata = format_extracted_data(
pdf_extractor.process_raw(request.get_data()))

# deposit.populate_with_pdf_metadata(
# pdf_metadata, "Deposit #{pid}".format(pid=pid))
deposit.files[key]['pdf_metadata'] = pdf_metadata

deposit.commit()
db.session.commit()

return make_response(jsonify(deposit.files[key].dumps()))
Expand Down Expand Up @@ -106,7 +120,6 @@ def put(pid=None, key=None):
# View callables built from the two resource classes; 'files' and 'file' are
# the names handed to as_view().
files_view = FilesResource.as_view('files')
file_view = FileResource.as_view('file')


# Blueprint for the deposit file endpoints; every rule below is registered
# under the '/deposits/<pid>/' URL prefix.
blueprint = Blueprint('deposits', __name__, url_prefix='/deposits/<pid>/')
# Rule with a <key> segment is served by file_view, the bare collection rule
# by files_view.
blueprint.add_url_rule('/custom-files/<key>', view_func=file_view)
blueprint.add_url_rule('/custom-files', view_func=files_view)
73 changes: 73 additions & 0 deletions sonar/modules/pdf_extractor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import subprocess
import tempfile

import pycountry


def extract_text_from_content(content):
"""Extract full-text from content which will be stored in a temporary file.
Expand All @@ -44,3 +46,74 @@ def extract_text_from_file(file):
text = re.sub('[\r\n\f]+', ' ', text)

return text


def _dig(node, *keys):
    """Follow ``keys`` through nested dicts; return None if a step is missing."""
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def _as_list(value):
    """Return ``value`` as a list: None -> [], a lone item -> [item]."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _element_text(node):
    """Return the text of an element given as a ``'#text'`` dict or a string."""
    if isinstance(node, dict):
        return node.get('#text')
    return node if isinstance(node, str) else None


def format_extracted_data(data):
    """Format the extracted metadata from PDF.

    The TEI structure is read on a best-effort basis: any element that is
    missing from ``data`` is skipped instead of raising, and ``data`` itself
    is never modified.

    :param data: nested dict holding the parsed TEI document, with the
        ``teiHeader`` and ``text`` entries at its root.
    :returns: dict with any of the keys ``title``, ``languages`` (list with
        one language code), ``authors`` (list of ``{'name': ...}``),
        ``journal`` (``name``, ``volume``, ``number``, ``pages``) and
        ``abstract``.
    """
    formatted_data = {}

    title = _dig(data, 'teiHeader', 'fileDesc', 'titleStmt', 'title')
    if isinstance(title, dict):
        if '#text' in title:
            formatted_data['title'] = title['#text']
    elif isinstance(title, str) and title:
        # An element without attributes may be given as a bare string.
        formatted_data['title'] = title

    language_code = _dig(data, 'text', '@xml:lang')
    if language_code:
        language = pycountry.languages.get(alpha_2=language_code)
        if language:
            # Prefer the bibliographic code when the language defines one.
            code = (language.bibliographic if hasattr(
                language, 'bibliographic') else language.alpha_3)
            formatted_data['languages'] = [code]

    bibl_struct = _dig(data, 'teiHeader', 'fileDesc', 'sourceDesc',
                       'biblStruct')

    analytic = _dig(bibl_struct, 'analytic')
    if isinstance(analytic, dict) and 'author' in analytic:
        formatted_data['authors'] = []
        for author in _as_list(analytic['author']):
            pers_name = _dig(author, 'persName')
            if not isinstance(pers_name, dict):
                continue

            # Forenames keep their document order and come before the
            # surname, e.g. "John A Smith".
            name_parts = [
                _element_text(forename)
                for forename in _as_list(pers_name.get('forename'))
            ]
            name_parts.append(_element_text(pers_name.get('surname')))
            name = ' '.join(part for part in name_parts if part)

            # Authors without any usable name part are left out, so every
            # entry is guaranteed to carry a 'name'.
            if name:
                formatted_data['authors'].append({'name': name})

    imprint = _dig(bibl_struct, 'monogr', 'imprint')
    if isinstance(imprint, dict):
        if 'publisher' in imprint:
            formatted_data['journal'] = {'name': imprint['publisher']}

        unit_to_key = {'page': 'pages', 'volume': 'volume', 'number': 'number'}
        for item in _as_list(imprint.get('biblScope')):
            if not isinstance(item, dict):
                continue

            key = unit_to_key.get(item.get('@unit'))
            if key is None:
                continue

            if '#text' in item:
                value = item['#text']
            elif '@from' in item and '@to' in item:
                # Ranges are given as from/to attributes instead of text.
                value = item['@from'] + '-' + item['@to']
            else:
                continue

            # The journal entry may not exist yet when there is no publisher.
            formatted_data.setdefault('journal', {})[key] = value

    abstract = _dig(data, 'teiHeader', 'profileDesc', 'abstract')
    if isinstance(abstract, dict) and 'p' in abstract:
        formatted_data['abstract'] = abstract['p']

    return formatted_data
Loading

0 comments on commit 967fb32

Please sign in to comment.