Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
from .dbl_bundle_text_corpus import DblBundleTextCorpus
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
from .dictionary_text_corpus import DictionaryTextCorpus
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
from .file_paratext_project_versification_error_detector import FileParatextProjectVersificationErrorDetector
from .flatten import flatten
from .memory_alignment_collection import MemoryAlignmentCollection
from .memory_stream_container import MemoryStreamContainer
Expand All @@ -18,10 +20,12 @@
from .parallel_text_row import ParallelTextRow
from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus
from .paratext_backup_text_corpus import ParatextBackupTextCorpus
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
from .paratext_text_corpus import ParatextTextCorpus
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
from .scripture_element import ScriptureElement
Expand Down Expand Up @@ -70,16 +74,22 @@
from .usfm_update_block import UsfmUpdateBlock
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
from .usfm_update_block_handler import UsfmUpdateBlockHandler
from .usfm_versification_error_detector import (
UsfmVersificationError,
UsfmVersificationErrorDetector,
UsfmVersificationErrorType,
)
from .usx_file_alignment_collection import UsxFileAlignmentCollection
from .usx_file_alignment_corpus import UsxFileAlignmentCorpus
from .usx_file_text import UsxFileText
from .usx_file_text_corpus import UsxFileTextCorpus
from .usx_memory_text import UsxMemoryText
from .usx_zip_text import UsxZipText
from .zip_paratext_project_file_handler import ZipParatextProjectFileHandler
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
from .zip_paratext_project_text_updater import ZipParatextProjectTextUpdater
from .zip_paratext_project_versification_detector import ZipParatextProjectVersificationErrorDetector

__all__ = [
"AlignedWordPair",
Expand All @@ -95,8 +105,10 @@
"EMPTY_SCRIPTURE_REF",
"escape_spaces",
"extract_scripture_corpus",
"FileParatextProjectFileHandler",
"FileParatextProjectSettingsParser",
"FileParatextProjectTextUpdater",
"FileParatextProjectVersificationErrorDetector",
"flatten",
"is_scripture",
"lowercase",
Expand All @@ -113,14 +125,16 @@
"ParallelTextRow",
"ParatextBackupTermsCorpus",
"ParatextBackupTextCorpus",
"ParatextProjectFileHandler",
"ParatextProjectSettings",
"ParatextProjectSettingsParserBase",
"ParatextProjectTermsParserBase",
"ParatextProjectTextUpdaterBase",
"ParatextProjectVersificationErrorDetector",
"ParatextTextCorpus",
"parse_usfm",
"PlaceMarkersAlignmentInfo",
"PlaceMarkersUsfmUpdateBlockHandler",
"parse_usfm",
"RtlReferenceOrder",
"ScriptureElement",
"ScriptureRef",
Expand All @@ -139,8 +153,8 @@
"unescape_spaces",
"UpdateUsfmMarkerBehavior",
"UpdateUsfmParserHandler",
"UpdateUsfmTextBehavior",
"UpdateUsfmRow",
"UpdateUsfmTextBehavior",
"UsfmAttribute",
"UsfmElementType",
"UsfmFileText",
Expand All @@ -164,14 +178,18 @@
"UsfmUpdateBlockElement",
"UsfmUpdateBlockElementType",
"UsfmUpdateBlockHandler",
"UsfmVersificationError",
"UsfmVersificationErrorDetector",
"UsfmVersificationErrorType",
"UsxFileAlignmentCollection",
"UsxFileAlignmentCorpus",
"UsxFileText",
"UsxFileTextCorpus",
"UsxMemoryText",
"UsxZipText",
"ZipParatextProjectFileHandler",
"ZipParatextProjectSettingsParser",
"ZipParatextProjectSettingsParserBase",
"ZipParatextProjectTermsParser",
"ZipParatextProjectTextUpdater",
"ZipParatextProjectVersificationErrorDetector",
]
27 changes: 27 additions & 0 deletions machine/corpora/file_paratext_project_file_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pathlib import Path
from typing import BinaryIO, Optional

from ..utils.typeshed import StrPath
from .paratext_project_file_handler import ParatextProjectFileHandler
from .usfm_stylesheet import UsfmStylesheet


class FileParatextProjectFileHandler(ParatextProjectFileHandler):
    """Paratext project file handler backed by a directory on the local file system.

    File names passed to the methods are resolved against the project directory
    given at construction time.
    """

    def __init__(self, project_dir: StrPath) -> None:
        """Remember the project directory.

        Args:
            project_dir: Directory that holds the project files.
        """
        self._project_dir = Path(project_dir)

    def exists(self, file_name: str) -> bool:
        """Return True if ``file_name`` is a regular file inside the project directory."""
        return (self._project_dir / file_name).is_file()

    def open(self, file_name: str) -> BinaryIO:
        """Open ``file_name`` from the project directory for binary reading.

        The caller owns the returned stream and must close it.

        Raises:
            OSError: If the file cannot be opened (e.g. it does not exist).
        """
        return open(self._project_dir / file_name, "rb")

    def find(self, extension: str) -> Optional[Path]:
        """Find a file in the project directory whose name ends with ``extension``.

        Args:
            extension: Suffix to match, e.g. ``".ssf"``.

        Returns:
            The first match reported by ``glob``, expressed relative to the
            project directory, or None when nothing matches.
        """
        match = next(self._project_dir.glob(f"*{extension}"), None)
        if match is None:
            return None
        # glob() yields paths that already start with the project directory.
        # exists()/open() join the project directory onto the name they are
        # given, so handing back the prefixed path would resolve to
        # "<dir>/<dir>/<file>" whenever the project directory is relative.
        # Returning the path relative to the project directory round-trips
        # correctly for both relative and absolute project directories.
        return match.relative_to(self._project_dir)

    def create_stylesheet(self, file_name: str) -> UsfmStylesheet:
        """Build a stylesheet from ``file_name``, layering ``custom.sty`` on top if present.

        Args:
            file_name: Name of the base stylesheet, passed to ``UsfmStylesheet`` as is.

        Returns:
            A ``UsfmStylesheet`` created with the project's ``custom.sty`` as the
            alternate stylesheet when that file exists, otherwise with None.
        """
        custom_stylesheet_filename = self._project_dir / "custom.sty"
        return UsfmStylesheet(
            file_name,
            custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
        )
23 changes: 2 additions & 21 deletions machine/corpora/file_paratext_project_settings_parser.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,8 @@
from pathlib import Path
from typing import BinaryIO, Optional

from ..utils.typeshed import StrPath
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .usfm_stylesheet import UsfmStylesheet


class FileParatextProjectSettingsParser(ParatextProjectSettingsParserBase):
def __init__(self, project_dir: StrPath) -> None:
self._project_dir = Path(project_dir)

def _create_stylesheet(self, file_name: StrPath) -> UsfmStylesheet:
custom_stylesheet_filename = self._project_dir / "custom.sty"
return UsfmStylesheet(
file_name,
custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
)

def _exists(self, file_name: StrPath) -> bool:
return (self._project_dir / file_name).is_file()

def _find(self, extension: str) -> Optional[Path]:
return next(self._project_dir.glob(f"*{extension}"), None)

def _open(self, file_name: StrPath) -> BinaryIO:
return open(self._project_dir / file_name, "rb")
super().__init__(FileParatextProjectFileHandler(project_dir))
5 changes: 4 additions & 1 deletion machine/corpora/file_paratext_project_text_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
from typing import BinaryIO

from ..utils.typeshed import StrPath
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase


class FileParatextProjectTextUpdater(ParatextProjectTextUpdaterBase):
def __init__(self, project_dir: StrPath) -> None:
super().__init__(FileParatextProjectSettingsParser(project_dir))
super().__init__(
FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse()
)

self._project_dir = project_dir

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from ..utils.typeshed import StrPath
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector


class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector):
    """Versification error detector for a Paratext project stored in a directory."""

    def __init__(self, project_dir: StrPath) -> None:
        """Wire the detector to a project directory.

        Builds a file-system file handler for ``project_dir`` and parses the
        project settings from the same directory, then hands both to the base
        class.

        Args:
            project_dir: Directory that holds the project files.
        """
        # Keep the original evaluation order: the handler is created first,
        # then the settings are parsed.
        file_handler = FileParatextProjectFileHandler(project_dir)
        settings = FileParatextProjectSettingsParser(project_dir).parse()
        super().__init__(file_handler, settings)
18 changes: 18 additions & 0 deletions machine/corpora/paratext_project_file_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from abc import ABC, abstractmethod
from typing import BinaryIO

from .usfm_stylesheet import UsfmStylesheet


class ParatextProjectFileHandler(ABC):
    """Abstract interface for reading the files that make up a Paratext project.

    Concrete subclasses decide where the project lives; every method here is
    abstract and must be overridden.
    """

    @abstractmethod
    def exists(self, file_name: str) -> bool:
        """Return whether the project contains a file named ``file_name``."""

    @abstractmethod
    def open(self, file_name: str) -> BinaryIO:
        """Return a binary stream for reading the project file ``file_name``."""

    # NOTE(review): annotated as ``str``, but the settings parser treats a falsy
    # result as "no such file" and the file-system implementation returns
    # ``Optional[Path]`` -- confirm the intended contract and widen the
    # annotation accordingly.
    @abstractmethod
    def find(self, extension: str) -> str:
        """Return the name of a project file matching ``extension``."""

    @abstractmethod
    def create_stylesheet(self, file_name: str) -> UsfmStylesheet:
        """Return a ``UsfmStylesheet`` for the stylesheet file ``file_name``."""
36 changes: 14 additions & 22 deletions machine/corpora/paratext_project_settings_parser_base.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,24 @@
from abc import ABC, abstractmethod
from typing import BinaryIO
from abc import ABC
from xml.etree import ElementTree

from ..scripture.verse_ref import Versification
from ..utils.string_utils import parse_integer
from .corpora_utils import get_encoding
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .usfm_stylesheet import UsfmStylesheet


class ParatextProjectSettingsParserBase(ABC):

@abstractmethod
def _exists(self, file_name: str) -> bool: ...

@abstractmethod
def _find(self, extension: str) -> str: ...

@abstractmethod
def _open(self, file_name: str) -> BinaryIO: ...

@abstractmethod
def _create_stylesheet(self, file_name: str) -> UsfmStylesheet: ...
def __init__(self, paratext_project_file_handler: ParatextProjectFileHandler):
self._paratext_project_file_handler = paratext_project_file_handler

def parse(self) -> ParatextProjectSettings:
settings_file_name = "Settings.xml"
if not self._exists(settings_file_name):
settings_file_name = self._find(".ssf")
if not self._paratext_project_file_handler.exists(settings_file_name):
settings_file_name = self._paratext_project_file_handler.find(".ssf")
if not settings_file_name:
raise ValueError("The project does not contain a settings file.")
with self._open(settings_file_name) as stream:
with self._paratext_project_file_handler.open(settings_file_name) as stream:
settings_tree = ElementTree.parse(stream)

name = settings_tree.getroot().findtext("Name", "")
Expand All @@ -46,18 +35,21 @@ def parse(self) -> ParatextProjectSettings:

versification_type = int(settings_tree.getroot().findtext("Versification", "4"))
versification = Versification.get_builtin(versification_type)
if self._exists("custom.vrs"):
if self._paratext_project_file_handler.exists("custom.vrs"):
guid = settings_tree.getroot().findtext("Guid", "")
versification_name = f"{versification.name}-{guid}"
versification = Versification.load(
self._open("custom.vrs"),
self._paratext_project_file_handler.open("custom.vrs"),
versification,
versification_name,
)
stylesheet_file_name = settings_tree.getroot().findtext("StyleSheet", "usfm.sty")
if not self._exists(stylesheet_file_name) and stylesheet_file_name != "usfm_sb.sty":
if (
not self._paratext_project_file_handler.exists(stylesheet_file_name)
and stylesheet_file_name != "usfm_sb.sty"
):
stylesheet_file_name = "usfm.sty"
stylesheet = self._create_stylesheet(stylesheet_file_name)
stylesheet = self._paratext_project_file_handler.create_stylesheet(stylesheet_file_name)

prefix = ""
form = "41MAT"
Expand Down
26 changes: 13 additions & 13 deletions machine/corpora/paratext_project_terms_parser_base.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from __future__ import annotations

import re
from abc import ABC, abstractmethod
from abc import ABC
from collections import defaultdict
from importlib.resources import open_binary
from typing import BinaryIO, Dict, List, Optional, Sequence, Tuple, Union
from typing import Dict, List, Optional, Sequence, Tuple, Union
from xml.etree import ElementTree

from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase

Expand All @@ -24,7 +25,12 @@


class ParatextProjectTermsParserBase(ABC):
def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None:
def __init__(
self,
paratext_project_file_handler: ParatextProjectFileHandler,
settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase],
) -> None:
self._paratext_project_file_handler = paratext_project_file_handler
self._settings: ParatextProjectSettings
if isinstance(settings, ParatextProjectSettingsParserBase):
self._settings = settings.parse()
Expand All @@ -34,8 +40,8 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -> List[Tuple[str, List[str]]]:
biblical_terms_doc = None
if self._settings.biblical_terms_list_type == "Project":
if self._exists(self._settings.biblical_terms_file_name):
with self._open(self._settings.biblical_terms_file_name) as stream:
if self._paratext_project_file_handler.exists(self._settings.biblical_terms_file_name):
with self._paratext_project_file_handler.open(self._settings.biblical_terms_file_name) as stream:
biblical_terms_doc = ElementTree.parse(stream)
term_id_to_category_dict = _get_category_per_id(biblical_terms_doc)
elif self._settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
Expand All @@ -60,8 +66,8 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
terms_glosses_doc = ElementTree.parse(stream)

term_renderings_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
if self._exists("TermRenderings.xml"):
with self._open("TermRenderings.xml") as stream:
if self._paratext_project_file_handler.exists("TermRenderings.xml"):
with self._paratext_project_file_handler.open("TermRenderings.xml") as stream:
term_renderings_doc = ElementTree.parse(stream)

terms_renderings: Dict[str, List[str]] = defaultdict(list)
Expand Down Expand Up @@ -94,12 +100,6 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -

return []

@abstractmethod
def _exists(self, file_name: str) -> bool: ...

@abstractmethod
def _open(self, file_name: str) -> BinaryIO: ...


def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category_dict: Dict[str, str]) -> bool:
category = term_id_to_category_dict.get(id)
Expand Down
23 changes: 11 additions & 12 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union
from abc import ABC
from typing import Callable, Iterable, Optional, Sequence, Union

from ..utils.typeshed import StrPath
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .update_usfm_parser_handler import (
Expand All @@ -15,7 +15,12 @@


class ParatextProjectTextUpdaterBase(ABC):
def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None:
def __init__(
self,
paratext_project_file_handler: ParatextProjectFileHandler,
settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase],
) -> None:
self._paratext_project_file_handler = paratext_project_file_handler
if isinstance(settings, ParatextProjectSettingsParserBase):
self._settings = settings.parse()
else:
Expand All @@ -37,9 +42,9 @@ def update_usfm(
compare_segments: bool = False,
) -> Optional[str]:
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
if not self._paratext_project_file_handler.exists(file_name):
return None
with self._open(file_name) as sfm_file:
with self._paratext_project_file_handler.open(file_name) as sfm_file:
usfm: str = sfm_file.read().decode(self._settings.encoding)
handler = UpdateUsfmParserHandler(
rows,
Expand All @@ -64,9 +69,3 @@ def update_usfm(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e

@abstractmethod
def _exists(self, file_name: StrPath) -> bool: ...

@abstractmethod
def _open(self, file_name: StrPath) -> BinaryIO: ...
Loading
Loading