Skip to content

Commit

Permalink
document: PDF metadata extraction
Browse files Browse the repository at this point in the history
New API for metadata and full-text extraction from a PDF document.

Signed-off-by: Sébastien Délèze <sebastien.deleze@rero.ch>
  • Loading branch information
Sébastien Délèze committed Jul 25, 2019
1 parent da40b3f commit fab3c7f
Show file tree
Hide file tree
Showing 35 changed files with 1,204 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ python:

before_install:
- sudo apt-get update
- sudo apt-get install -y libxml2 libxml2-dev libxmlsec1 libxmlsec1-dev
- sudo apt-get install -y libxml2 libxml2-dev libxmlsec1 libxmlsec1-dev xpdf
# Stop default travis services
- "sudo service mysql stop"
- "sudo service postgresql stop"
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

FROM inveniosoftware/centos7-python:3.6

RUN yum -y install libxml2-devel xmlsec1-devel xmlsec1-openssl-devel libtool-ltdl-devel
RUN yum -y install libxml2-devel xmlsec1-devel xmlsec1-openssl-devel libtool-ltdl-devel xpdf

COPY Pipfile Pipfile.lock ./
RUN pipenv install --deploy --system
6 changes: 5 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,14 @@ recursive-include docs *.txt
recursive-include docs Makefile
recursive-include sonar *.gitkeep
recursive-include sonar *.po *.pot *.mo
recursive-include sonar *.json *.html *.js *.scss
recursive-include sonar *.json *.html *.js *.scss *.css
recursive-include sonar *.png *.jpg *.svg
recursive-include docker *.cfg *.conf *.crt *.ini *.key *.pem *.sh
recursive-include tests *.py
recursive-include tests *.doc
recursive-include tests *.json
recursive-include tests *.pdf
recursive-include tests *.xml

# added by check_manifest.py
include *.html
Expand Down
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ lxml = ">=3.5.0,<4.2.6"
orcid = "*"
python-slugify = "*"
python3-saml = "*"
xmltodict = "*"

[dev-packages]
Flask-Debugtoolbar = ">=0.10.1"
Expand All @@ -33,6 +34,7 @@ pytest-mock = ">=1.6.0"
pytest-pep8 = ">=1.0.6"
pytest-random-order = ">=0.5.4"
pytest-runner = ">=3.0.0,<5"
docutils = "==0.15"

[requires]
python_version = "3.6"
Expand Down
118 changes: 63 additions & 55 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions docker-compose.full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,5 +132,9 @@ services:
extends:
file: docker-services.yml
service: es
grobid:
extends:
file: docker-services.yml
service: grobid
volumes:
static_data:
4 changes: 4 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,7 @@ services:
extends:
file: docker-services.yml
service: es
grobid:
extends:
file: docker-services.yml
service: grobid
5 changes: 5 additions & 0 deletions docker-services.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,8 @@ services:
command: --broker=amqp://guest:guest@mq:5672/ --broker_api=http://guest:guest@mq:15672/api/
ports:
- "5555:5555"
grobid:
image: lfoppiano/grobid:0.5.5
ports:
- "8070:8070"
- "8071:8071"
3 changes: 2 additions & 1 deletion docker/nginx/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y \
libxml2 \
libxml2-dev \
libxmlsec1 \
libxmlsec1-dev
libxmlsec1-dev \
xpdf

COPY nginx.conf /etc/nginx/nginx.conf
COPY conf.d/* /etc/nginx/conf.d/
Expand Down
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@
'documents = sonar.modules.documents.views:blueprint',
'shibboleth_authenticator = \
sonar.modules.shibboleth_authenticator.views.client:blueprint',
'pdf_extractor = \
sonar.modules.pdf_extractor.views.client:blueprint'
],
'invenio_base.api_blueprints': [
'pdf_extractor = sonar.modules.pdf_extractor.views.api:blueprint'
],
'invenio_assets.webpack': [
'sonar_theme = sonar.theme.webpack:theme',
Expand All @@ -65,6 +70,7 @@
'sonar_documents = sonar.modules.documents.config',
'shibboleth_authenticator = \
sonar.modules.shibboleth_authenticator.config',
'pdf_extractor = sonar.modules.pdf_extractor.config',
],
'invenio_i18n.translations': [
'messages = sonar',
Expand Down
15 changes: 15 additions & 0 deletions sonar/modules/pdf_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 CERN.
#
# SONAR is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.


"""PDF extractor extension."""

from __future__ import absolute_import, print_function

from .ext import PDFExtractor

__all__ = ('PDFExtractor', )
Loading

0 comments on commit fab3c7f

Please sign in to comment.