Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions machine/corpora/paratext_project_quote_convention_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Optional, Union

from ..utils.typeshed import StrPath
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .punctuation_analysis.quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .usfm_parser import parse_usfm


class ParatextProjectQuoteConventionDetector(ABC):
    """Runs quote convention detection over the scripture book files of a Paratext project.

    Concrete subclasses supply the storage backend by implementing ``_exists`` and ``_open``.
    """

    def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None:
        """Store the project settings, parsing them first when a settings parser is supplied."""
        needs_parsing = isinstance(settings, ParatextProjectSettingsParserBase)
        self._settings = settings.parse() if needs_parsing else settings

    def get_quote_convention_analysis(
        self, handler: Optional[QuoteConventionDetector] = None
    ) -> Optional[QuoteConventionAnalysis]:
        """Parse each existing scripture book file and return the handler's detection result.

        Args:
            handler: Detector that receives the USFM parse events. A fresh
                ``QuoteConventionDetector`` is created when none is given.

        Returns:
            The value of ``handler.detect_quote_convention()`` once all books have been parsed.

        Raises:
            RuntimeError: If parsing a book's USFM fails; the original exception is chained.
        """
        if handler is None:
            handler = QuoteConventionDetector()

        for file_name in self._settings.get_all_scripture_book_file_names():
            if not self._exists(file_name):
                # Books that the project does not contain are simply skipped.
                continue

            with self._open(file_name) as sfm_file:
                raw_bytes = sfm_file.read()
            usfm: str = raw_bytes.decode(self._settings.encoding)

            try:
                parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
            except Exception as e:
                # Re-raise with the file (and the project, when it has a name) so the failing book is identifiable.
                error_message = (
                    f"An error occurred while parsing the usfm for '{file_name}'"
                    f"{f' in project {self._settings.name}' if self._settings.name else ''}"
                    f". Error: '{e}'"
                )
                raise RuntimeError(error_message) from e

        return handler.detect_quote_convention()

    @abstractmethod
    def _exists(self, file_name: StrPath) -> bool:
        """Return whether ``file_name`` is available in the project storage."""
        ...

    @abstractmethod
    def _open(self, file_name: StrPath) -> BinaryIO:
        """Return a binary stream over the contents of ``file_name``."""
        ...
8 changes: 6 additions & 2 deletions machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from dataclasses import dataclass
from typing import Optional
from typing import Iterable, Optional

from ..scripture.canon import book_id_to_number, book_number_to_id
from ..scripture.canon import book_id_to_number, book_number_to_id, get_scripture_books
from ..scripture.verse_ref import Versification
from .usfm_stylesheet import UsfmStylesheet

Expand Down Expand Up @@ -53,6 +53,10 @@ def get_book_file_name(self, book_id: str) -> str:
book_part = _get_book_file_name_digits(book_id) + book_id
return self.file_name_prefix + book_part + self.file_name_suffix

def get_all_scripture_book_file_names(self) -> Iterable[str]:
    """Lazily yield the book file name for every ID returned by ``get_scripture_books()``."""
    yield from (self.get_book_file_name(book_id) for book_id in get_scripture_books())


def _get_book_file_name_digits(book_id: str) -> str:
book_num = book_id_to_number(book_id)
Expand Down
24 changes: 24 additions & 0 deletions machine/corpora/zip_paratext_project_quote_convention_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from io import BytesIO
from typing import BinaryIO, Optional
from zipfile import ZipFile

from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector


class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector):
    """Quote convention detector that reads project files from a zip archive."""

    def __init__(self, archive: ZipFile) -> None:
        """Keep a reference to the archive that the project files are read from.

        NOTE(review): the base-class initializer is not invoked here, so ``self._settings`` is
        never assigned and ``get_quote_convention_analysis`` would raise ``AttributeError``.
        A settings object (or settings parser) for the archive needs to be passed to
        ``super().__init__``; nothing suitable is imported in this module -- confirm and fix.
        """
        self._archive = archive

    def _exists(self, file_name: str) -> bool:
        """Return whether the archive contains an entry named exactly ``file_name``."""
        # ``getinfo`` is a single dictionary lookup, whereas ``namelist()`` builds a new list
        # of every entry name on each call.
        try:
            self._archive.getinfo(file_name)
        except KeyError:
            return False
        return True

    def _find(self, extension: str) -> Optional[str]:
        """Return the first entry name ending with ``extension``, or ``None`` if there is none."""
        return next((entry for entry in self._archive.namelist() if entry.endswith(extension)), None)

    def _open(self, file_name: str) -> Optional[BinaryIO]:
        """Return an in-memory stream over the entry's bytes, or ``None`` if the entry is absent."""
        try:
            entry_info = self._archive.getinfo(file_name)
        except KeyError:
            return None
        # Reading through the ZipInfo avoids a second name lookup.
        return BytesIO(self._archive.read(entry_info))
6 changes: 5 additions & 1 deletion machine/scripture/canon.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union
from typing import Iterable, Union

ALL_BOOK_IDS = [
"GEN",
Expand Down Expand Up @@ -181,3 +181,7 @@ def is_canonical(book: Union[str, int]) -> bool:
if isinstance(book, int):
book = book_number_to_id(book)
return is_book_id_valid(book) and book not in NON_CANONICAL_IDS


def get_scripture_books() -> Iterable[str]:
    """Return the book IDs in ``BOOK_NUMBERS`` whose book number satisfies ``is_ot_nt``."""
    return [book_id for book_id, book_num in BOOK_NUMBERS.items() if is_ot_nt(book_num)]
82 changes: 82 additions & 0 deletions tests/corpora/test_paratext_project_quote_convention_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from typing import Dict, Optional

from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector

from machine.corpora import ParatextProjectSettings, UsfmStylesheet
from machine.corpora.paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector
from machine.corpora.punctuation_analysis.quote_convention_detector import QuoteConventionAnalysis
from machine.scripture import ORIGINAL_VERSIFICATION, Versification


def test_get_quote_convention() -> None:
    """A Matthew file using curly double quotes is detected as ``standard_english`` with a score above 0.8."""
    mat_usfm = r"""\id MAT
\c 1
\v 1 Someone said, “This is something I am saying!
\v 2 This is also something I am saying” (that is, “something I am speaking”).
\p
\v 3 Other text, and someone else said,
\q1
\v 4 “Things
\q2 someone else said!
\q3 and more things someone else said.”
\m That is why he said “things someone else said.”
\v 5 Then someone said, “More things someone said.”"""
    env = _TestEnvironment(files={"41MATTest.SFM": mat_usfm})

    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention()

    assert analysis is not None
    assert analysis.best_quote_convention_score > 0.8
    assert analysis.best_quote_convention.name == "standard_english"


class _TestEnvironment:
    """Test fixture wrapping an in-memory quote convention detector."""

    def __init__(
        self,
        settings: Optional[ParatextProjectSettings] = None,
        files: Optional[Dict[str, str]] = None,
    ) -> None:
        """Build the detector, using default settings and an empty file map for falsy arguments."""
        project_settings = settings or _DefaultParatextProjectSettings()
        project_files = files or {}
        self._detector: ParatextProjectQuoteConventionDetector = MemoryParatextProjectQuoteConventionDetector(
            project_settings, project_files
        )

    @property
    def detector(self) -> ParatextProjectQuoteConventionDetector:
        """The detector under test."""
        return self._detector

    def get_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
        """Run the detector's quote convention analysis and return its result."""
        detector = self.detector
        return detector.get_quote_convention_analysis()


class _DefaultParatextProjectSettings(ParatextProjectSettings):
    """Project settings pre-populated with defaults for tests."""

    def __init__(
        self,
        name: str = "Test",
        full_name: str = "TestProject",
        encoding: Optional[str] = None,
        versification: Optional[Versification] = None,
        stylesheet: Optional[UsfmStylesheet] = None,
        file_name_prefix: str = "",
        file_name_form: str = "41MAT",
        file_name_suffix: str = "Test.SFM",
        biblical_terms_list_type: str = "Project",
        biblical_terms_project_name: str = "Test",
        biblical_terms_file_name: str = "ProjectBiblicalTerms.xml",
        language_code: str = "en",
    ):
        """Forward all values to the base settings, filling in the ``None`` placeholders first."""
        # Placeholders are resolved here so that the default stylesheet is only built when needed.
        if encoding is None:
            encoding = "utf-8"
        if versification is None:
            versification = ORIGINAL_VERSIFICATION
        if stylesheet is None:
            stylesheet = UsfmStylesheet("usfm.sty")

        super().__init__(
            name,
            full_name,
            encoding,
            versification,
            stylesheet,
            file_name_prefix,
            file_name_form,
            file_name_suffix,
            biblical_terms_list_type,
            biblical_terms_project_name,
            biblical_terms_file_name,
            language_code,
        )
24 changes: 12 additions & 12 deletions tests/corpora/test_update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@ def test_update_block_verse_preserve_paras() -> None:
\v 1 verse 1 \p inner verse paragraph
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(
rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler]
)
Expand Down Expand Up @@ -872,7 +872,7 @@ def test_update_block_verse_strip_paras() -> None:
\v 1 verse 1 \p inner verse paragraph
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(
rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, update_block_handlers=[update_block_handler]
)
Expand Down Expand Up @@ -901,7 +901,7 @@ def test_update_block_verse_range() -> None:
\v 1-3 verse 1 through 3
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(
rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler]
)
Expand All @@ -928,7 +928,7 @@ def test_update_block_footnote_preserve_embeds() -> None:
\v 1 verse\f \fr 1.1 \ft Some note \f* 1
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(
rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler]
)
Expand Down Expand Up @@ -957,7 +957,7 @@ def test_update_block_footnote_strip_embeds() -> None:
\v 1 verse\f \fr 1.1 \ft Some note \f* 1
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP, update_block_handlers=[update_block_handler])

assert len(update_block_handler.blocks) == 1
Expand Down Expand Up @@ -985,7 +985,7 @@ def test_update_block_nonverse() -> None:
\v 1 verse 1
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(rows, usfm, update_block_handlers=[update_block_handler])

assert len(update_block_handler.blocks) == 2
Expand All @@ -1010,7 +1010,7 @@ def test_update_block_verse_preserve_styles() -> None:
\v 1 verse \bd 1\bd*
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(
rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler]
)
Expand Down Expand Up @@ -1041,7 +1041,7 @@ def test_update_block_verse_strip_styles() -> None:
\v 1 verse \bd 1\bd*
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.STRIP, update_block_handlers=[update_block_handler])

assert len(update_block_handler.blocks) == 1
Expand Down Expand Up @@ -1074,7 +1074,7 @@ def test_update_block_verse_section_header() -> None:
\v 2 Verse 2
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(rows, usfm, update_block_handlers=[update_block_handler])

assert len(update_block_handler.blocks) == 4
Expand Down Expand Up @@ -1114,7 +1114,7 @@ def test_update_block_verse_section_header_in_verse() -> None:
\p end of verse
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(rows, usfm, update_block_handlers=[update_block_handler])

assert len(update_block_handler.blocks) == 3
Expand Down Expand Up @@ -1148,7 +1148,7 @@ def test_update_block_nonverse_paragraph_end_of_verse() -> None:
\s Section header
"""

update_block_handler = TestUsfmUpdateBlockHandler()
update_block_handler = _TestUsfmUpdateBlockHandler()
update_usfm(rows, usfm, update_block_handlers=[update_block_handler])

assert len(update_block_handler.blocks) == 3
Expand Down Expand Up @@ -1307,7 +1307,7 @@ def assert_update_block_equals(
assert element.marked_for_removal == expected_marked_for_removal


class TestUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
class _TestUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
def __init__(self):
self.blocks: list[UsfmUpdateBlock] = []

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from io import BytesIO
from typing import BinaryIO, Dict

from machine.corpora import ParatextProjectSettings
from machine.corpora.paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector


class MemoryParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector):
    """Quote convention detector backed by an in-memory mapping of file names to file text."""

    def __init__(self, settings: ParatextProjectSettings, files: Dict[str, str]) -> None:
        """Initialise the base detector with ``settings`` and keep the file mapping.

        Args:
            settings: Project settings handed to the base class.
            files: Mapping from file name to the text content of that file.
        """
        super().__init__(settings)

        self.files = files

    def _exists(self, file_name: str) -> bool:
        """Return whether ``file_name`` is a key of the file mapping."""
        return file_name in self.files

    def _open(self, file_name: str) -> BinaryIO:
        """Return the file's text as a byte stream encoded with the project encoding.

        Raises:
            KeyError: If ``file_name`` is not in the file mapping.
        """
        # The base class decodes these bytes with ``self._settings.encoding``; encoding with the
        # same codec keeps the round trip lossless when the settings declare a non-UTF-8 encoding.
        return BytesIO(self.files[file_name].encode(self._settings.encoding))
Loading