Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 25 additions & 13 deletions machine/corpora/paratext_project_terms_parser_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,14 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
id = term.attrib["Id"]
if _is_in_category(id, term_categories, term_id_to_category_dict):
id_ = id.replace("\n", "&#xA")
renderings = term.find("Renderings")
gloss = renderings.text if renderings is not None and renderings.text is not None else ""
glosses = _get_glosses(gloss)
terms_renderings[id_].extend(glosses)
renderings_element = term.find("Renderings")
rendering_text = (
renderings_element.text
if renderings_element is not None and renderings_element.text is not None
else ""
)
renderings = _get_renderings(rendering_text)
terms_renderings[id_].extend(renderings)

terms_glosses: Dict[str, List[str]] = defaultdict(list)
if terms_glosses_doc is not None and use_term_glosses:
Expand Down Expand Up @@ -102,25 +106,33 @@ def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category
return not term_categories or (category is not None and category in term_categories)


def _clean_term(term: str):
    """Normalize a term string.

    Trims surrounding whitespace, removes any parenthesized content via
    ``_strip_parens``, and collapses internal whitespace runs to single spaces.
    """
    stripped = _strip_parens(term.strip())
    return " ".join(stripped.split())


# NOTE(review): this span was scraped from a unified diff with the +/- markers
# stripped, so it interleaves PRE-change and POST-change lines of _get_glosses
# (e.g. both `gloss = match.group(0)` and `gloss = match.group(1)` survive, and
# the gloss is split twice by two competing pipelines). It is not runnable as
# shown; reconcile against the merged revision of
# paratext_project_terms_parser_base.py before editing any logic here.
def _get_glosses(gloss: str) -> List[str]:
# Extract bracketed content if present; group(0)/group(1) below are old/new
# diff variants of the same assignment — TODO confirm which one is current.
match = _CONTENT_IN_BRACKETS_REGEX.match(gloss)
if match:
gloss = match.group(0)
gloss = gloss.replace("?", "")
gloss = gloss.replace("*", "")
gloss = gloss.replace("/", " ")
gloss = gloss.strip()
gloss = _strip_parens(gloss)
gloss = match.group(1)
gloss = _clean_term(gloss)
gloss = _strip_parens(gloss, left="[", right="]")
gloss = gloss.strip()
# Remove numerical annotations matched elsewhere in the file.
for match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss):
gloss = gloss.replace(match.group(0), "")
# Two competing split pipelines follow (old: "||" then [,;]; new: [,;/] with
# de-duplication) — only one belongs in the merged code.
glosses = re.split(r"\|\|", gloss)
glosses = [re.split(r"[,;]", g) for g in glosses]
glosses = [item.strip() for sublist in glosses for item in sublist if item.strip()]
glosses = re.split(r"[,;/]", gloss)
glosses = list(set([gloss.strip() for gloss in glosses if gloss.strip()]))
return glosses


def _get_renderings(rendering: str) -> List[str]:
    """Split a raw Renderings field on '||' into cleaned, non-empty terms.

    Each part is normalized with ``_clean_term``, re-trimmed, and has any
    ``*`` markers removed; empty results are dropped.
    """
    parts = re.split(r"\|\|", rendering.strip())
    cleaned = (_clean_term(part).strip().replace("*", "") for part in parts)
    return [term for term in cleaned if term]


def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
parens: int = 0
end: int = -1
Expand Down
8 changes: 6 additions & 2 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Iterable, Optional, Sequence, Union
from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union

from ..utils.typeshed import StrPath
from .paratext_project_settings import ParatextProjectSettings
Expand All @@ -11,7 +11,7 @@
UpdateUsfmTextBehavior,
)
from .usfm_parser import parse_usfm
from .usfm_update_block_handler import UsfmUpdateBlockHandler
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError


class ParatextProjectTextUpdaterBase(ABC):
Expand All @@ -33,6 +33,8 @@ def update_usfm(
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
remarks: Optional[Iterable[str]] = None,
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
compare_segments: bool = False,
) -> Optional[str]:
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
Expand All @@ -49,6 +51,8 @@ def update_usfm(
preserve_paragraph_styles,
update_block_handlers=update_block_handlers,
remarks=remarks,
error_handler=error_handler,
compare_segments=compare_segments,
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
Expand Down
15 changes: 13 additions & 2 deletions machine/corpora/place_markers_usfm_update_block_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .usfm_token import UsfmToken, UsfmTokenType
from .usfm_update_block import UsfmUpdateBlock
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
from .usfm_update_block_handler import UsfmUpdateBlockHandler
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError

PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"

Expand Down Expand Up @@ -118,7 +118,18 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
trg_tok_starts = []
prev_len = 0
for tok in trg_toks:
trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0))
try:
index_of_trg_tok_in_sent = trg_sent.index(
tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0
)
except ValueError:
raise UsfmUpdateBlockHandlerError(
block,
f'No token "{tok}" found in text "{trg_sent}" at or beyond index'
f"{trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}."
"Is the versification correctly specified?",
)
trg_tok_starts.append(index_of_trg_tok_in_sent)
prev_len = len(tok)

# Predict marker placements and get insertion order
Expand Down
133 changes: 108 additions & 25 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from enum import Enum, auto
from typing import Iterable, List, Optional, Sequence, Tuple, Union
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union

from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification
from .scripture_ref import ScriptureRef
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
from .usfm_parser_state import UsfmParserState
Expand All @@ -10,7 +11,7 @@
from .usfm_tokenizer import UsfmTokenizer
from .usfm_update_block import UsfmUpdateBlock
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
from .usfm_update_block_handler import UsfmUpdateBlockHandler
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError


class UpdateUsfmTextBehavior(Enum):
Expand All @@ -24,6 +25,12 @@ class UpdateUsfmMarkerBehavior(Enum):
STRIP = auto()


class _RowInfo:
def __init__(self, row_index: int):
self.row_index = row_index
self.is_consumed = False


class UpdateUsfmRow:
def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[dict[str, object]] = None):
self.refs = refs
Expand All @@ -43,9 +50,19 @@ def __init__(
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
remarks: Optional[Iterable[str]] = None,
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
compare_segments: bool = False,
) -> None:
super().__init__()
self._rows = rows or []
self._verse_rows: List[int] = []
self._verse_row_index = 0
self._verse_rows_map: Dict[VerseRef, List[_RowInfo]] = {}
self._verse_rows_ref = VerseRef()
if len(self._rows) > 0:
self._update_rows_versification: Versification = self._rows[0].refs[0].versification
else:
self._update_rows_versification = Versification.get_builtin("English")
self._tokens: List[UsfmToken] = []
self._updated_text: List[UsfmToken] = []
self._update_block_stack: list[UsfmUpdateBlock] = []
Expand All @@ -65,6 +82,11 @@ def __init__(
self._remarks = []
else:
self._remarks = list(remarks)
if error_handler is None:
self._error_handler = lambda _: False
else:
self._error_handler = error_handler
self._compare_segments = compare_segments
self._text_behavior = text_behavior
self._paragraph_behavior = paragraph_behavior
self._embed_behavior = embed_behavior
Expand All @@ -82,6 +104,10 @@ def end_usfm(self, state: UsfmParserState) -> None:
super().end_usfm(state)

def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
self._verse_rows_ref = state.verse_ref.copy()
self._update_verse_rows_map()
self._update_verse_rows()

self._collect_readonly_tokens(state)
self._update_block_stack.append(UsfmUpdateBlock())
start_book_tokens: List[UsfmToken] = []
Expand All @@ -108,7 +134,7 @@ def start_para(
) -> None:
if state.is_verse_text:
# Only strip paragraph markers in a verse
if self._paragraph_behavior == UpdateUsfmMarkerBehavior.PRESERVE:
if self._paragraph_behavior == UpdateUsfmMarkerBehavior.PRESERVE and not self._duplicate_verse:
self._collect_updatable_tokens(state)
else:
self._skip_updatable_tokens(state)
Expand Down Expand Up @@ -148,6 +174,11 @@ def chapter(
) -> None:
self._use_updated_text()

if self._verse_rows_ref != state.verse_ref:
self._verse_rows_ref = state.verse_ref.copy()
self._update_verse_rows_map()
self._update_verse_rows()

super().chapter(state, number, marker, alt_number, pub_number)

self._collect_readonly_tokens(state)
Expand Down Expand Up @@ -179,14 +210,23 @@ def verse(
if last_paragraph is not None:
last_paragraph.marked_for_removal = False

super().verse(state, number, marker, alt_number, pub_number)
if self._verse_rows_ref != state.verse_ref:
self._verse_rows_ref = state.verse_ref.copy()
self._update_verse_rows()

self._collect_readonly_tokens(state)
super().verse(state, number, marker, alt_number, pub_number)
if self._duplicate_verse:
self._skip_updatable_tokens(state)
else:
self._collect_readonly_tokens(state)

def start_note(self, state: UsfmParserState, marker: str, caller: str, category: str) -> None:
super().start_note(state, marker, caller, category)

self._collect_updatable_tokens(state)
if not self._duplicate_verse:
self._collect_updatable_tokens(state)
else:
self._skip_updatable_tokens(state)

def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
if closed:
Expand Down Expand Up @@ -219,15 +259,14 @@ def end_char(
attributes: Sequence[UsfmAttribute],
closed: bool,
) -> None:
if closed:
if self._current_text_type == ScriptureTextType.EMBED:
self._collect_updatable_tokens(state)
if self._current_text_type == ScriptureTextType.EMBED:
self._collect_updatable_tokens(state)
else:
self._replace_with_new_tokens(state)
if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP:
self._skip_updatable_tokens(state)
else:
self._replace_with_new_tokens(state)
if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP:
self._skip_updatable_tokens(state)
else:
self._collect_updatable_tokens(state)
self._collect_updatable_tokens(state)

super().end_char(state, marker, attributes, closed)

Expand All @@ -242,7 +281,9 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) ->
def text(self, state: UsfmParserState, text: str) -> None:
super().text(state, text)

if self._replace_with_new_tokens(state):
if self._replace_with_new_tokens(state) or (
self._duplicate_verse and self._current_text_type == ScriptureTextType.VERSE
):
self._skip_updatable_tokens(state)
else:
self._collect_updatable_tokens(state)
Expand Down Expand Up @@ -292,11 +333,10 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
for remark in self._remarks:
remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
if len(tokens) > 0 and tokens[0].marker == "id":
index = 1
if len(tokens) > 1 and tokens[1].type == UsfmTokenType.TEXT:
index = 2
while tokens[index].marker == "rem":
if len(tokens) > 0:
index = 0
markers_to_skip = {"id", "ide", "rem"}
while tokens[index].marker in markers_to_skip:
index += 1
if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
index += 1
Expand All @@ -308,13 +348,15 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
row_texts: List[str] = []
row_metadata = None
source_index: int = 0
while self._row_index < len(self._rows) and source_index < len(seg_scr_refs):
while self._verse_row_index < len(self._verse_rows) and source_index < len(seg_scr_refs):
compare: int = 0
row = self._rows[self._row_index]
row = self._rows[self._verse_rows[self._verse_row_index]]
row_scr_refs, text, metadata = row.refs, row.text, row.metadata
for row_scr_ref in row_scr_refs:
while source_index < len(seg_scr_refs):
compare = row_scr_ref.compare_to(seg_scr_refs[source_index], compare_segments=False)
compare = row_scr_ref.compare_to(
seg_scr_refs[source_index], compare_segments=self._compare_segments
)
if compare > 0:
# row is ahead of source, increment source
source_index += 1
Expand All @@ -328,7 +370,7 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
break
if compare <= 0:
# source is ahead of row, increment row
self._row_index += 1
self._verse_row_index += 1
return row_texts, row_metadata

def _collect_updatable_tokens(self, state: UsfmParserState) -> None:
Expand Down Expand Up @@ -418,7 +460,13 @@ def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[Scr
para_elems.append(update_block.pop())

for handler in self._update_block_handlers:
update_block = handler.process_block(update_block)
try:
update_block = handler.process_block(update_block)
except UsfmUpdateBlockHandlerError as e:
should_continue = self._error_handler(e)
if not should_continue:
raise

tokens = update_block.get_tokens()
for elem in reversed(para_elems):
tokens.extend(elem.get_tokens())
Expand Down Expand Up @@ -449,6 +497,41 @@ def _pop_new_tokens(self) -> None:
def _is_in_preserved_paragraph(self, state: UsfmParserState) -> bool:
return state.para_tag is not None and state.para_tag.marker in self._preserve_paragraph_styles

def _update_verse_rows_map(self) -> None:
self._verse_rows_map.clear()
while (
self._row_index < len(self._rows)
and self._rows[self._row_index].refs[0].chapter_num == self._verse_rows_ref.chapter_num
):
row = self._rows[self._row_index]
ri = _RowInfo(self._row_index)
for sr in row.refs:
vr = sr.verse_ref if self._compare_segments else IgnoreSegmentsVerseRef(sr.verse_ref)
if vr in self._verse_rows_map:
self._verse_rows_map[vr].append(ri)
else:
self._verse_rows_map[vr] = [ri]
self._row_index += 1

def _update_verse_rows(self) -> None:
vref = self._verse_rows_ref.copy()
# We are using a dictionary, which uses an equality comparer. As a result, we need to change the
# source verse ref to use the row versification. If we used a SortedList, it wouldn't be necessary, but it
# would be less efficient.
vref.change_versification(self._update_rows_versification)

self._verse_rows.clear()
self._verse_row_index = 0

for vr in vref.all_verses():
if not self._compare_segments:
vr = IgnoreSegmentsVerseRef(vr)
if rows := self._verse_rows_map.get(vr):
for row in rows:
if not row.is_consumed:
self._verse_rows.append(row.row_index)
row.is_consumed = True


def _is_nonverse_paragraph(state: UsfmParserState, element: UsfmUpdateBlockElement) -> bool:
if element.type != UsfmUpdateBlockElementType.PARAGRAPH:
Expand Down
10 changes: 10 additions & 0 deletions machine/corpora/usfm_update_block_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,13 @@
class UsfmUpdateBlockHandler(ABC):
    """Abstract interface for processors that transform a USFM update block."""

    @abstractmethod
    def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ...


class UsfmUpdateBlockHandlerError(Exception):
    """Raised by a block handler that fails while processing an update block.

    Carries the :class:`UsfmUpdateBlock` that was being processed so callers
    (e.g. an error handler callback) can inspect it.
    """

    def __init__(self, block: UsfmUpdateBlock, *args):
        super().__init__(*args)
        self._block = block

    @property
    def block(self):
        """The update block that was being processed when the error occurred."""
        return self._block
Loading