Skip to content

Commit

Permalink
feat: check & validate supported formats #40
Browse files: browse the repository at this point in the history
  • Loading branch information
rueedlinger committed Jun 13, 2024
1 parent 3beb581 commit a86b21b
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 19 deletions.
26 changes: 20 additions & 6 deletions teal/core/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,23 @@
class LibreOfficeAdapter:
def __init__(self, libreoffice_cmd="soffice"):
self.libreoffice_cmd = libreoffice_cmd
# at the moment restricted to these file endings
self.supported_file_extensions = [
".doc",
".docx",
# OpenDocument Text
".odt",
# OpenDocument Text Template
".ott",
# Rich Text Format
".rtf",
# Microsoft Word
".doc",
# Microsoft Word XML
".docx",
# Plain Text
".txt",
# Plain Text (alternate ".text" extension)
".text",
# PDF
".pdf",
]

Expand All @@ -45,7 +56,9 @@ async def create_pdf(
if is_feature_enabled("TEAL_FEATURE_CREATE_PDF_CHECK_FILE_EXTENSION"):
if file_ext not in self.supported_file_extensions:
return create_json_err_response(
400, f"file extension '{file_ext}' is not supported ({filename})."
400,
f"file extension '{file_ext}' is not supported, supported "
f"extensions are {sorted(self.supported_file_extensions)}.",
)

# create tmp dir for all files
Expand All @@ -69,18 +82,19 @@ async def create_pdf(
pdf_version = output_type.to_param()

pages = parse_page_ranges(page_ranges)
_logger.debug(f"using pdf version {pdf_version}")

# https://help.libreoffice.org/latest/en-US/text/shared/guide/pdf_params.html?&DbPAR=SHARED&System=UNIX
# https://help.libreoffice.org/latest/en-US/text/shared/guide/convertfilters.html?DbPAR=SHARED#bm_id541554406270299
# https://vmiklos.hu/blog/pdf-convert-to.html
if pages is None:
pdf_param = (
'pdf:draw_pdf_Export:{"SelectPdfVersion":{"type":"long","value":"'
'pdf:writer_pdf_Export:{"SelectPdfVersion":{"type":"long","value":"'
+ pdf_version
+ '"}}'
)
else:
pdf_param = (
'pdf:draw_pdf_Export:{"SelectPdfVersion":{"type":"long","value":"'
'pdf:writer_pdf_Export:{"SelectPdfVersion":{"type":"long","value":"'
+ pdf_version
+ '"},"PageRange":{"type":"string","value":"'
+ to_page_range(pages)
Expand Down
16 changes: 12 additions & 4 deletions teal/core/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ async def extract_text(
file_ext = get_file_ext(filename)
if file_ext not in self.supported_file_extensions:
return create_json_err_response(
400, f"file extension '{file_ext}' is not supported ({filename})."
400,
f"file extension '{file_ext}' is not supported, supported "
f"extensions are {sorted(self.supported_file_extensions)}.",
)

extracts = []
Expand Down Expand Up @@ -76,7 +78,9 @@ async def extract_text_with_ocr(
file_ext = get_file_ext(filename)
if file_ext not in self.supported_file_extensions:
return create_json_err_response(
400, f"file extension '{file_ext}' is not supported ({filename})."
400,
f"file extension '{file_ext}' is not supported, supported "
f"extensions are {sorted(self.supported_file_extensions)}.",
)

extracts = []
Expand Down Expand Up @@ -125,7 +129,9 @@ async def extract_table(
file_ext = get_file_ext(filename)
if file_ext not in self.supported_file_extensions:
return create_json_err_response(
400, f"file extension '{file_ext}' is not supported ({filename})."
400,
f"file extension '{file_ext}' is not supported, supported "
f"extensions are {sorted(self.supported_file_extensions)}.",
)

async with aiofiles.tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf_file:
Expand Down Expand Up @@ -193,7 +199,9 @@ async def extract_meta_data(
file_ext = get_file_ext(filename)
if file_ext not in self.supported_file_extensions:
return create_json_err_response(
400, f"file extension '{file_ext}' is not supported ({filename})."
400,
f"file extension '{file_ext}' is not supported, supported "
f"extensions are {sorted(self.supported_file_extensions)}.",
)
meta_data = {}
doc_info = {}
Expand Down
4 changes: 3 additions & 1 deletion teal/core/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ async def create_pdf(
file_ext = get_file_ext(filename)
if file_ext not in self.supported_file_extensions:
return create_json_err_response(
400, f"file extension '{file_ext}' is not supported ({filename})."
400,
f"file extension '{file_ext}' is not supported, supported "
f"extensions are {sorted(self.supported_file_extensions)}.",
)

# create tmp dir for all files
Expand Down
5 changes: 1 addition & 4 deletions tests/test_api_create_pdf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
import tempfile

import pytest
Expand Down Expand Up @@ -98,9 +97,7 @@ def test_create_pdf_wrong_file_type():
response = client.post(url="/create/pdf", files={"file": tmp})
assert response.status_code == 400
assert response.json() == {
"message": "file extension '.xyz' is not supported ("
+ os.path.basename(tmp.name)
+ ")."
"message": "file extension '.xyz' is not supported, supported extensions are ['.doc', '.docx', '.odt', '.ott', '.pdf', '.rtf', '.text', '.txt']."
}


Expand Down
2 changes: 1 addition & 1 deletion tests/test_api_extract_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_pdf_extract_meta_wrong_file_ending():
response = client.post(url="/extract/meta", files={"file": f})
assert response.status_code == 400
assert response.json() == {
"message": f"file extension '.docx' is not supported (word_document.docx)."
"message": f"file extension '.docx' is not supported, supported extensions are ['.pdf']."
}


Expand Down
2 changes: 1 addition & 1 deletion tests/test_api_extract_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_pdf_table_extract_with_wrong_file_ending():
response = client.post(url="/extract/table", files={"file": f})
assert response.status_code == 400
assert response.json() == {
"message": f"file extension '.docx' is not supported (word_document.docx)."
"message": f"file extension '.docx' is not supported, supported extensions are ['.pdf']."
}


Expand Down
2 changes: 1 addition & 1 deletion tests/test_api_extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_extract_text_with_wrong_file_ending():
response = client.post(url="/extract/text", files={"file": f})
assert response.status_code == 400
assert response.json() == {
"message": f"file extension '.docx' is not supported (word_document.docx)."
"message": f"file extension '.docx' is not supported, supported extensions are ['.pdf']."
}


Expand Down
2 changes: 1 addition & 1 deletion tests/test_api_ocr_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def test_ocr_pdf_with_wrong_file_ending():
response = client.post(url="/ocr/pdf", files={"file": f})
assert response.status_code == 400
assert response.json() == {
"message": f"file extension '.docx' is not supported (word_document.docx)."
"message": f"file extension '.docx' is not supported, supported extensions are ['.pdf']."
}


Expand Down

0 comments on commit a86b21b

Please sign in to comment.