From 65d0d15e3df75d30454049d41462c5a29e5edf73 Mon Sep 17 00:00:00 2001 From: Ben King Date: Tue, 28 Oct 2025 14:41:23 -0400 Subject: [PATCH 1/7] Use a weighted average of books for Paratext project quote convention detection --- machine/punctuation_analysis/__init__.py | 3 +- ...atext_project_quote_convention_detector.py | 58 +++++++++++++++++-- .../quotation_mark_tabulator.py | 17 ++++++ .../punctuation_analysis/quote_convention.py | 6 ++ .../quote_convention_analysis.py | 51 ++++++++++++++++ .../quote_convention_detector.py | 30 ++++------ .../quote_convention_set.py | 11 ++++ ...atext_project_quote_convention_detector.py | 2 + ...atext_project_quote_convention_detector.py | 21 +++++++ 9 files changed, 176 insertions(+), 23 deletions(-) create mode 100644 machine/punctuation_analysis/quote_convention_analysis.py diff --git a/machine/punctuation_analysis/__init__.py b/machine/punctuation_analysis/__init__.py index aa28e3b7..1773cbf2 100644 --- a/machine/punctuation_analysis/__init__.py +++ b/machine/punctuation_analysis/__init__.py @@ -32,9 +32,10 @@ from .quotation_mark_update_settings import QuotationMarkUpdateSettings from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy from .quote_convention import QuoteConvention, SingleLevelQuoteConvention +from .quote_convention_analysis import QuoteConventionAnalysis from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings -from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector +from .quote_convention_detector import QuoteConventionDetector from .quote_convention_set import QuoteConventionSet from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS from .text_segment import TextSegment diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py index 808a717a..c3b73b86 100644 --- a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections import defaultdict from typing import BinaryIO, Dict, List, Optional, Union from ..corpora.paratext_project_settings import ParatextProjectSettings @@ -6,7 +7,43 @@ from ..corpora.usfm_parser import parse_usfm from ..scripture.canon import book_id_to_number, get_scripture_books from ..utils.typeshed import StrPath -from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector +from .quotation_mark_tabulator import QuotationMarkTabulator +from .quote_convention import QuoteConvention +from .quote_convention_analysis import QuoteConventionAnalysis +from .quote_convention_detector import QuoteConventionDetector + + +class WeightedAverageQuoteConventionAnalysisBuilder: + def __init__(self) -> None: + self._total_weight: float = 0 + self._convention_votes: Dict[QuoteConvention, float] = defaultdict(float) + self._total_tabulated_quotation_marks = QuotationMarkTabulator() + + def record_book_results( + self, + quote_convention_analysis: Optional[QuoteConventionAnalysis], + tabulated_quotation_marks: QuotationMarkTabulator, + ) -> None: + if quote_convention_analysis is None: + return + + self._total_tabulated_quotation_marks.tabulate_from(tabulated_quotation_marks) + + self._total_weight += quote_convention_analysis.weight + for convention, score in quote_convention_analysis.get_all_convention_scores(): + self._convention_votes[convention] += score * quote_convention_analysis.weight + + def to_quote_convention_analysis(self) -> Optional[QuoteConventionAnalysis]: + if self._total_weight == 0: + return None + + quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(self._total_tabulated_quotation_marks) + + for convention, total_score in self._convention_votes.items(): + if total_score > 0: + quote_convention_analysis_builder.record_convention_score(convention, total_score / self._total_weight) + + return quote_convention_analysis_builder.build() class ParatextProjectQuoteConventionDetector(ABC): @@ -17,15 +54,20 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti self._settings = settings def get_quote_convention_analysis( - self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None + self, include_chapters: Optional[Dict[int, List[int]]] = None ) -> Optional[QuoteConventionAnalysis]: - handler = QuoteConventionDetector() if handler is None else handler + + weighted_average_quote_convention_analysis_builder = WeightedAverageQuoteConventionAnalysisBuilder() + for book_id in get_scripture_books(): if include_chapters is not None and book_id_to_number(book_id) not in include_chapters: continue file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): continue + + handler = QuoteConventionDetector() + with self._open(file_name) as sfm_file: usfm: str = sfm_file.read().decode(self._settings.encoding) try: @@ -37,7 +79,15 @@ def get_quote_convention_analysis( f". Error: '{e}'" ) raise RuntimeError(error_message) from e - return handler.detect_quote_convention(include_chapters) + + quote_convention_analysis, tabulated_quotation_marks = ( + handler.detect_quote_convention_and_get_tabulated_quotation_marks(include_chapters) + ) + weighted_average_quote_convention_analysis_builder.record_book_results( + quote_convention_analysis, tabulated_quotation_marks + ) + + return weighted_average_quote_convention_analysis_builder.to_quote_convention_analysis() @abstractmethod def _exists(self, file_name: StrPath) -> bool: ... diff --git a/machine/punctuation_analysis/quotation_mark_tabulator.py b/machine/punctuation_analysis/quotation_mark_tabulator.py index c76ff540..308be297 100644 --- a/machine/punctuation_analysis/quotation_mark_tabulator.py +++ b/machine/punctuation_analysis/quotation_mark_tabulator.py @@ -15,6 +15,10 @@ def count_quotation_mark(self, quotation_mark: str) -> None: self._quotation_mark_counter.update([quotation_mark]) self._total_count += 1 + def count_from(self, quotation_mark_counts: "QuotationMarkCounts") -> None: + self._quotation_mark_counter.update(quotation_mark_counts._quotation_mark_counter) + self._total_count += quotation_mark_counts._total_count + def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]: return self._quotation_mark_counter.most_common(1)[0] + (self._total_count,) @@ -36,6 +40,13 @@ def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None: for quotation_mark in quotation_marks: self._count_quotation_mark(quotation_mark) + def tabulate_from(self, tabulated_quotation_marks: "QuotationMarkTabulator") -> None: + for ( + depth_and_direction, + quotation_mark_counts, + ) in tabulated_quotation_marks._quotation_counts_by_depth_and_direction.items(): + self._quotation_counts_by_depth_and_direction[depth_and_direction].count_from(quotation_mark_counts) + def _count_quotation_mark(self, quotation_mark: QuotationMarkMetadata) -> None: key = (quotation_mark.depth, quotation_mark.direction) self._quotation_counts_by_depth_and_direction[key].count_quotation_mark(quotation_mark.quotation_mark) @@ -48,6 +59,12 @@ def _find_most_common_quotation_mark_with_depth_and_direction( ) -> tuple[str, int, int]: return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion() + def get_total_quotation_mark_count(self) -> int: + total_count = 0 + for counts in self._quotation_counts_by_depth_and_direction.values(): + total_count += counts.get_observed_count() + return total_count + def calculate_similarity(self, quote_convention: QuoteConvention) -> float: weighted_difference = 0 total_weight = 0 diff --git a/machine/punctuation_analysis/quote_convention.py b/machine/punctuation_analysis/quote_convention.py index 386cd559..3160894c 100644 --- a/machine/punctuation_analysis/quote_convention.py +++ b/machine/punctuation_analysis/quote_convention.py @@ -37,6 +37,9 @@ def normalize(self) -> "SingleLevelQuoteConvention": ) return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark) + def __hash__(self) -> int: + return hash((self.opening_quotation_mark, self.closing_quotation_mark)) + class QuoteConvention: def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]): @@ -57,6 +60,9 @@ def __eq__(self, value): return False return True + def __hash__(self) -> int: + return hash((tuple(self.level_conventions))) + @property def name(self) -> str: return self._name diff --git a/machine/punctuation_analysis/quote_convention_analysis.py b/machine/punctuation_analysis/quote_convention_analysis.py new file mode 100644 index 00000000..c0a8cfa2 --- /dev/null +++ b/machine/punctuation_analysis/quote_convention_analysis.py @@ -0,0 +1,51 @@ +from .quotation_mark_tabulator import QuotationMarkTabulator +from .quote_convention import QuoteConvention + + +class QuoteConventionAnalysis: + + def __init__( + self, + convention_scores: dict[QuoteConvention, float], + tabulated_quotation_marks: QuotationMarkTabulator, + analysis_weight: float = 1.0, + ): + self._convention_scores = convention_scores + self._best_quote_convention = max(convention_scores.items(), key=lambda item: item[1])[0] + self._best_quote_convention_score = convention_scores[self._best_quote_convention] + self._tabulated_quotation_marks = tabulated_quotation_marks + self._analysis_weight = analysis_weight + + def get_all_convention_scores(self) -> list[tuple[QuoteConvention, float]]: + return list(self._convention_scores.items()) + + @property + def analysis_summary(self) -> str: + return self._tabulated_quotation_marks.get_summary_message() + + @property + def best_quote_convention(self) -> QuoteConvention: + return self._best_quote_convention + + @property + def best_quote_convention_score(self) -> float: + return self._best_quote_convention_score + + @property + def weight(self) -> float: + return self._analysis_weight + + class Builder: + def __init__(self, tabulated_quotation_marks: QuotationMarkTabulator): + self._convention_scores: dict[QuoteConvention, float] = {} + self._tabulated_quotation_marks = tabulated_quotation_marks + + def record_convention_score(self, quote_convention: QuoteConvention, score: float) -> None: + self._convention_scores[quote_convention] = score + + def build(self) -> "QuoteConventionAnalysis": + return QuoteConventionAnalysis( + self._convention_scores, + self._tabulated_quotation_marks, + self._tabulated_quotation_marks.get_total_quotation_mark_count(), + ) diff --git a/machine/punctuation_analysis/quote_convention_detector.py b/machine/punctuation_analysis/quote_convention_detector.py index c37e8135..17286cd1 100644 --- a/machine/punctuation_analysis/quote_convention_detector.py +++ b/machine/punctuation_analysis/quote_convention_detector.py @@ -1,5 +1,4 @@ -from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple from .chapter import Chapter from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver @@ -8,20 +7,13 @@ from .quotation_mark_metadata import QuotationMarkMetadata from .quotation_mark_string_match import QuotationMarkStringMatch from .quotation_mark_tabulator import QuotationMarkTabulator -from .quote_convention import QuoteConvention +from .quote_convention_analysis import QuoteConventionAnalysis from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings from .quote_convention_set import QuoteConventionSet from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS from .usfm_structure_extractor import UsfmStructureExtractor -@dataclass(frozen=True) -class QuoteConventionAnalysis: - best_quote_convention: QuoteConvention - best_quote_convention_score: float - analysis_summary: str - - class QuoteConventionDetector(UsfmStructureExtractor): def __init__(self): @@ -56,12 +48,14 @@ def detect_quote_convention( ) -> Optional[QuoteConventionAnalysis]: self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters)) - (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention( - self._quotation_mark_tabulator - ) + return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator) - if score > 0 and best_quote_convention is not None: - return QuoteConventionAnalysis( - best_quote_convention, score, self._quotation_mark_tabulator.get_summary_message() - ) - return None + def detect_quote_convention_and_get_tabulated_quotation_marks( + self, include_chapters: Optional[Dict[int, List[int]]] = None + ) -> Tuple[Optional[QuoteConventionAnalysis], QuotationMarkTabulator]: + self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters)) + + return ( + STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator), + self._quotation_mark_tabulator, + ) diff --git a/machine/punctuation_analysis/quote_convention_set.py b/machine/punctuation_analysis/quote_convention_set.py index bef15639..f4c99f22 100644 --- a/machine/punctuation_analysis/quote_convention_set.py +++ b/machine/punctuation_analysis/quote_convention_set.py @@ -7,6 +7,7 @@ from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_tabulator import QuotationMarkTabulator from .quote_convention import QuoteConvention +from .quote_convention_analysis import QuoteConventionAnalysis class QuoteConventionSet: @@ -149,3 +150,13 @@ def find_most_similar_convention( best_quote_convention = quote_convention return (best_quote_convention, best_similarity) + + def score_all_quote_conventions( + self, tabulated_quotation_marks: QuotationMarkTabulator + ) -> Optional[QuoteConventionAnalysis]: + quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(tabulated_quotation_marks) + for quote_convention in self._conventions: + score = tabulated_quotation_marks.calculate_similarity(quote_convention) + quote_convention_analysis_builder.record_convention_score(quote_convention, score) + + return quote_convention_analysis_builder.build() diff --git a/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py index c0ccc90d..e51b2754 100644 --- a/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py @@ -2,12 +2,14 @@ from typing import BinaryIO, Optional from zipfile import ZipFile +from ..corpora.zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector): def __init__(self, archive: ZipFile) -> None: self._archive = archive + super().__init__(ZipParatextProjectSettingsParser(archive)) def _exists(self, file_name: str) -> bool: return file_name in self._archive.namelist() diff --git a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py index 36e0f9fb..64c74d44 100644 --- a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py +++ b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py @@ -90,6 +90,27 @@ def test_get_quote_convention_invalid_book_code() -> None: assert analysis is None +def test_get_quote_convention_weighted_average_of_multiple_books() -> None: + env = _TestEnvironment( + files={ + "41MATTest.SFM": rf"""\id MAT +{get_test_chapter(1, standard_english_quote_convention)}""", + "42MRKTest.SFM": r"""\id MRK +\c 1 +\v 1 This "sentence uses a different" convention""", + } + ) + analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention() + assert analysis is not None + assert analysis.best_quote_convention.name == "standard_english" + assert analysis.best_quote_convention_score > 0.8 + assert analysis.best_quote_convention_score < 0.9 + assert ( + analysis.analysis_summary + == "The most common level 1 quotation marks are “ (5 of 6 opening marks) and ” (5 of 6 closing marks)" + ) + + class _TestEnvironment: def __init__( self, From d8829101181d9cd292572f1d21d135aa272a001c Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 30 Oct 2025 09:12:27 -0400 Subject: [PATCH 2/7] Always return a QuoteConventionAnalysis instead of None --- ...atext_project_quote_convention_detector.py | 22 +++--- .../quote_convention_analysis.py | 16 ++++- .../quote_convention_detector.py | 4 +- .../quote_convention_set.py | 4 +- ...atext_project_quote_convention_detector.py | 18 +++-- .../test_quote_convention_detector.py | 70 ++++++++++++------- 6 files changed, 84 insertions(+), 50 deletions(-) diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py index c3b73b86..89ec698e 100644 --- a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py @@ -16,7 +16,8 @@ class WeightedAverageQuoteConventionAnalysisBuilder: def __init__(self) -> None: self._total_weight: float = 0 - self._convention_votes: Dict[QuoteConvention, float] = defaultdict(float) + self._convention_votes: Dict[str, float] = defaultdict(float) + self._quote_conventions_by_name: Dict[str, QuoteConvention] = {} self._total_tabulated_quotation_marks = QuotationMarkTabulator() def record_book_results( @@ -24,24 +25,25 @@ def record_book_results( quote_convention_analysis: Optional[QuoteConventionAnalysis], tabulated_quotation_marks: QuotationMarkTabulator, ) -> None: - if quote_convention_analysis is None: + if quote_convention_analysis is None or quote_convention_analysis.weight == 0: return self._total_tabulated_quotation_marks.tabulate_from(tabulated_quotation_marks) self._total_weight += quote_convention_analysis.weight for convention, score in quote_convention_analysis.get_all_convention_scores(): - self._convention_votes[convention] += score * quote_convention_analysis.weight - - def to_quote_convention_analysis(self) -> Optional[QuoteConventionAnalysis]: - if self._total_weight == 0: - return None + if convention.name not in self._quote_conventions_by_name: + self._quote_conventions_by_name[convention.name] = convention + self._convention_votes[convention.name] += score * quote_convention_analysis.weight + def to_quote_convention_analysis(self) -> QuoteConventionAnalysis: quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(self._total_tabulated_quotation_marks) - for convention, total_score in self._convention_votes.items(): + for convention_name, total_score in self._convention_votes.items(): if total_score > 0: - quote_convention_analysis_builder.record_convention_score(convention, total_score / self._total_weight) + quote_convention_analysis_builder.record_convention_score( + self._quote_conventions_by_name[convention_name], total_score / self._total_weight + ) return quote_convention_analysis_builder.build() @@ -55,7 +57,7 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti def get_quote_convention_analysis( self, include_chapters: Optional[Dict[int, List[int]]] = None - ) -> Optional[QuoteConventionAnalysis]: + ) -> QuoteConventionAnalysis: weighted_average_quote_convention_analysis_builder = WeightedAverageQuoteConventionAnalysisBuilder() diff --git a/machine/punctuation_analysis/quote_convention_analysis.py b/machine/punctuation_analysis/quote_convention_analysis.py index c0a8cfa2..ae637b84 100644 --- a/machine/punctuation_analysis/quote_convention_analysis.py +++ b/machine/punctuation_analysis/quote_convention_analysis.py @@ -1,3 +1,5 @@ +from typing import Optional + from .quotation_mark_tabulator import QuotationMarkTabulator from .quote_convention import QuoteConvention @@ -11,8 +13,16 @@ def __init__( analysis_weight: float = 1.0, ): self._convention_scores = convention_scores - self._best_quote_convention = max(convention_scores.items(), key=lambda item: item[1])[0] - self._best_quote_convention_score = convention_scores[self._best_quote_convention] + if len(convention_scores) > 0: + self._best_quote_convention_score = max(convention_scores.items(), key=lambda item: item[1])[1] + else: + self._best_quote_convention_score = 0 + + if self._best_quote_convention_score > 0: + self._best_quote_convention = max(convention_scores.items(), key=lambda item: item[1])[0] + else: + self._best_quote_convention = None + self._tabulated_quotation_marks = tabulated_quotation_marks self._analysis_weight = analysis_weight @@ -24,7 +34,7 @@ def analysis_summary(self) -> str: return self._tabulated_quotation_marks.get_summary_message() @property - def best_quote_convention(self) -> QuoteConvention: + def best_quote_convention(self) -> Optional[QuoteConvention]: return self._best_quote_convention @property diff --git a/machine/punctuation_analysis/quote_convention_detector.py b/machine/punctuation_analysis/quote_convention_detector.py index 17286cd1..21f2b827 100644 --- a/machine/punctuation_analysis/quote_convention_detector.py +++ b/machine/punctuation_analysis/quote_convention_detector.py @@ -45,14 +45,14 @@ def _count_quotation_marks_in_chapter( def detect_quote_convention( self, include_chapters: Optional[Dict[int, List[int]]] = None - ) -> Optional[QuoteConventionAnalysis]: + ) -> QuoteConventionAnalysis: self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters)) return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator) def detect_quote_convention_and_get_tabulated_quotation_marks( self, include_chapters: Optional[Dict[int, List[int]]] = None - ) -> Tuple[Optional[QuoteConventionAnalysis], QuotationMarkTabulator]: + ) -> Tuple[QuoteConventionAnalysis, QuotationMarkTabulator]: self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters)) return ( diff --git a/machine/punctuation_analysis/quote_convention_set.py b/machine/punctuation_analysis/quote_convention_set.py index f4c99f22..cddb6f2e 100644 --- a/machine/punctuation_analysis/quote_convention_set.py +++ b/machine/punctuation_analysis/quote_convention_set.py @@ -151,9 +151,7 @@ def find_most_similar_convention( return (best_quote_convention, best_similarity) - def score_all_quote_conventions( - self, tabulated_quotation_marks: QuotationMarkTabulator - ) -> Optional[QuoteConventionAnalysis]: + def score_all_quote_conventions(self, tabulated_quotation_marks: QuotationMarkTabulator) -> QuoteConventionAnalysis: quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(tabulated_quotation_marks) for quote_convention in self._conventions: score = tabulated_quotation_marks.calculate_similarity(quote_convention) diff --git a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py index 64c74d44..7977b6b7 100644 --- a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py +++ b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py @@ -27,7 +27,8 @@ def test_get_quote_convention() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention() - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention_score > 0.8 assert analysis.best_quote_convention.name == "standard_english" @@ -42,7 +43,8 @@ def test_get_quote_convention_by_book() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK") - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention_score > 0.8 assert analysis.best_quote_convention.name == "standard_french" @@ -61,7 +63,8 @@ def test_get_quote_convention_by_chapter() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK2,4-5") - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention_score > 0.66 assert analysis.best_quote_convention.name == "standard_french" @@ -76,7 +79,7 @@ def test_get_quote_convention_by_chapter_indeterminate() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT1,3") - assert analysis is None + assert analysis.best_quote_convention is None def test_get_quote_convention_invalid_book_code() -> None: @@ -87,7 +90,7 @@ def test_get_quote_convention_invalid_book_code() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT") - assert analysis is None + assert analysis.best_quote_convention is None def test_get_quote_convention_weighted_average_of_multiple_books() -> None: @@ -101,7 +104,8 @@ def test_get_quote_convention_weighted_average_of_multiple_books() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention() - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_english" assert analysis.best_quote_convention_score > 0.8 assert analysis.best_quote_convention_score < 0.9 @@ -125,7 +129,7 @@ def __init__( def detector(self) -> ParatextProjectQuoteConventionDetector: return self._detector - def get_quote_convention(self, scripture_range: Optional[str] = None) -> Optional[QuoteConventionAnalysis]: + def get_quote_convention(self, scripture_range: Optional[str] = None) -> QuoteConventionAnalysis: chapters: Optional[Dict[int, List[int]]] = None if scripture_range is not None: chapters = get_chapters(scripture_range, ORIGINAL_VERSIFICATION) diff --git a/tests/punctuation_analysis/test_quote_convention_detector.py b/tests/punctuation_analysis/test_quote_convention_detector.py index 44ff3b09..6f34abb3 100644 --- a/tests/punctuation_analysis/test_quote_convention_detector.py +++ b/tests/punctuation_analysis/test_quote_convention_detector.py @@ -1,5 +1,3 @@ -from typing import Union - from machine.corpora import parse_usfm from machine.punctuation_analysis import QuoteConventionAnalysis, QuoteConventionDetector @@ -15,7 +13,8 @@ def test_standard_english() -> None: ‘You shall not eat of any tree of the garden’?” """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_english" @@ -28,7 +27,8 @@ def test_typewriter_english() -> None: 'You shall not eat of any tree of the garden'?\" """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "typewriter_english" @@ -41,7 +41,8 @@ def test_british_english() -> None: “You shall not eat of any tree of the garden”?’ """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "british_english" @@ -54,7 +55,8 @@ def test_british_typewriter_english() -> None: \"You shall not eat of any tree of the garden\"?' """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "british_typewriter_english" @@ -67,7 +69,8 @@ def test_hybrid_typewriter_english() -> None: 'You shall not eat of any tree of the garden'?” """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "hybrid_typewriter_english" @@ -80,7 +83,8 @@ def test_standard_french() -> None: ‹You shall not eat of any tree of the garden›?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_french" @@ -93,7 +97,8 @@ def test_typewriter_french() -> None: ?>> """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "typewriter_french" @@ -109,7 +114,8 @@ def test_western_european() -> None: “You shall not eat of any tree of the garden”?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "western_european" @@ -122,7 +128,8 @@ def test_british_inspired_western_european() -> None: ‘You shall not eat of any tree of the garden’?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "british_inspired_western_european" @@ -135,7 +142,8 @@ def test_typewriter_western_european() -> None: "You shall not eat of any tree of the garden"?>> """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "typewriter_western_european" @@ -148,7 +156,8 @@ def test_typewriter_western_european_variant() -> None: ?" """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "typewriter_western_european_variant" @@ -161,7 +170,8 @@ def test_hybrid_typewriter_western_european() -> None: "You shall not eat of any tree of the garden"?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "hybrid_typewriter_western_european" @@ -174,7 +184,8 @@ def test_hybrid_british_typewriter_western_european() -> None: 'You shall not eat of any tree of the garden'?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "hybrid_british_typewriter_western_european" @@ -187,7 +198,8 @@ def test_central_european() -> None: ‚You shall not eat of any tree of the garden‘?“ """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "central_european" @@ -200,7 +212,8 @@ def test_central_european_guillemets() -> None: ›You shall not eat of any tree of the garden‹?« """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "central_european_guillemets" @@ -213,7 +226,8 @@ def test_standard_swedish() -> None: ’You shall not eat of any tree of the garden’?” """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_swedish" @@ -226,7 +240,8 @@ def test_standard_finnish() -> None: ’You shall not eat of any tree of the garden’?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_finnish" @@ -239,7 +254,8 @@ def test_eastern_european() -> None: ‚You shall not eat of any tree of the garden’?” """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "eastern_european" @@ -252,7 +268,8 @@ def test_standard_russian() -> None: „You shall not eat of any tree of the garden“?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_russian" @@ -265,7 +282,8 @@ def test_standard_arabic() -> None: ’You shall not eat of any tree of the garden‘?“ """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_arabic" @@ -278,7 +296,8 @@ def test_non_standard_arabic() -> None: ’You shall not eat of any tree of the garden‘?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "non-standard_arabic" @@ -295,11 +314,12 @@ def test_mismatched_quotation_marks() -> None: God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’ """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_english" -def detect_quote_convention(usfm: str) -> Union[QuoteConventionAnalysis, None]: +def detect_quote_convention(usfm: str) -> QuoteConventionAnalysis: quote_convention_detector = QuoteConventionDetector() parse_usfm(usfm, quote_convention_detector) return quote_convention_detector.detect_quote_convention() From e4ea5f7c5cd676b9739c24afc0495a9ebbde8ae9 Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 30 Oct 2025 10:47:25 -0400 Subject: [PATCH 3/7] Modify quote convention similarity calculation --- .../quotation_mark_tabulator.py | 34 ++++++++++++------- .../test_quotation_mark_tabulator.py | 15 ++++---- .../test_quote_convention_set.py | 8 ++--- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_tabulator.py b/machine/punctuation_analysis/quotation_mark_tabulator.py index 308be297..080a95b7 100644 --- a/machine/punctuation_analysis/quotation_mark_tabulator.py +++ b/machine/punctuation_analysis/quotation_mark_tabulator.py @@ -1,5 +1,5 @@ from collections import Counter, defaultdict -from typing import List +from typing import Dict, List from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_metadata import QuotationMarkMetadata @@ -66,22 +66,32 @@ def get_total_quotation_mark_count(self) -> int: return total_count def calculate_similarity(self, quote_convention: QuoteConvention) -> float: - weighted_difference = 0 - total_weight = 0 - for depth, direction in self._quotation_counts_by_depth_and_direction: + num_marks_by_depth: Dict[int, int] = defaultdict(int) + num_matching_marks_by_depth: Dict[int, int] = defaultdict(int) + + for depth, direction in sorted(self._quotation_counts_by_depth_and_direction, key=lambda item: item[0]): expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction) - # Give higher weight to shallower depths, since deeper marks are more likely to be mistakes - weighted_difference += self._quotation_counts_by_depth_and_direction[ - (depth, direction) - ].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth) - total_weight += self._quotation_counts_by_depth_and_direction[ + num_matching_marks = self._quotation_counts_by_depth_and_direction[(depth, direction)].get_observed_count() + num_marks_by_depth[depth] += num_matching_marks + num_matching_marks_by_depth[depth] += num_matching_marks - self._quotation_counts_by_depth_and_direction[ (depth, direction) - ].get_observed_count() * 2 ** (-depth) + ].calculate_num_differences(expected_quotation_mark) + + # The scores of greater depths depend on the scores of shallower depths + scores_by_depth: Dict[int, float] = defaultdict(float) + for depth in sorted(num_marks_by_depth.keys()): + previous_depth_score = ( + scores_by_depth[depth - 1] / num_marks_by_depth[depth - 1] if depth - 1 in scores_by_depth else 1 + ) + scores_by_depth[depth] = previous_depth_score * num_matching_marks_by_depth[depth] + + total_marks = sum(num_marks_by_depth.values()) + total_score = sum(scores_by_depth.values()) - if total_weight == 0: + if total_marks == 0: return 0 - return 1 - (weighted_difference / total_weight) + return total_score / total_marks def get_summary_message(self) -> str: message_lines: List[str] = [] diff --git a/tests/punctuation_analysis/test_quotation_mark_tabulator.py b/tests/punctuation_analysis/test_quotation_mark_tabulator.py index 80c17ea2..e7e8cfbf 100644 --- a/tests/punctuation_analysis/test_quotation_mark_tabulator.py +++ b/tests/punctuation_analysis/test_quotation_mark_tabulator.py @@ -118,7 +118,7 @@ def test_calculate_similarity() -> None: ) assert two_level_quotation_mark_tabulator.calculate_similarity( QuoteConvention("", [SingleLevelQuoteConvention("\u201c", "\u201d")]) - ) == approx(0.66666666666667, rel=1e-9) + ) == approx(0.5, rel=1e-9) assert ( two_level_quotation_mark_tabulator.calculate_similarity( QuoteConvention( @@ -131,9 +131,12 @@ def test_calculate_similarity() -> None: QuoteConvention( "", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u00ab", "\u00bb")] ) - ) == approx(0.66666666666667, rel=1e-9) - assert two_level_quotation_mark_tabulator.calculate_similarity( - QuoteConvention( - "", [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u2018", "\u2019")] + ) == approx(0.5, rel=1e-9) + assert ( + two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u2018", "\u2019")] + ) ) - ) == approx(0.33333333333333, rel=1e-9) + == 0.0 + ) diff --git a/tests/punctuation_analysis/test_quote_convention_set.py b/tests/punctuation_analysis/test_quote_convention_set.py index a2a87c4d..2ddf165d 100644 --- a/tests/punctuation_analysis/test_quote_convention_set.py +++ b/tests/punctuation_analysis/test_quote_convention_set.py @@ -1251,11 +1251,11 @@ def test_find_most_similar_convention() -> None: ) assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( standard_english_quote_convention, - approx(0.9, rel=1e-9), + approx(0.8333333333333, rel=1e-9), ) assert two_french_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( western_european_quote_convention, - approx(0.1, rel=1e-9), + 0, ) noisy_multiple_french_quotes_tabulator = QuotationMarkTabulator() @@ -1273,7 +1273,7 @@ def test_find_most_similar_convention() -> None: ) assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_french_quotes_tabulator) == ( standard_french_quote_convention, - approx(0.916666666666, rel=1e-9), + approx(0.875, rel=1e-9), ) too_deep_english_quotes_tabulator = QuotationMarkTabulator() @@ -1288,7 +1288,7 @@ def test_find_most_similar_convention() -> None: ) assert all_three_quote_convention_set.find_most_similar_convention(too_deep_english_quotes_tabulator) == ( standard_english_quote_convention, - approx(0.967741935483871, rel=1e-9), + approx(0.8, rel=1e-9), ) # in case of ties, the earlier convention in the list should be returned From 3bab6bd2e0f316586d12297646be4290661b67c0 Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 30 Oct 2025 12:01:27 -0400 Subject: [PATCH 4/7] Add new quote convention --- .../punctuation_analysis/standard_quote_conventions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/machine/punctuation_analysis/standard_quote_conventions.py b/machine/punctuation_analysis/standard_quote_conventions.py index b1292e15..5e69eb75 100644 --- a/machine/punctuation_analysis/standard_quote_conventions.py +++ b/machine/punctuation_analysis/standard_quote_conventions.py @@ -189,5 +189,13 @@ SingleLevelQuoteConvention("\u2019", "\u2018"), ], ), + QuoteConvention( + "arabic_inspired_western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201d", "\u201c"), + SingleLevelQuoteConvention("\u2019", "\u2018"), + ], + ), ] ) From 14274dd66b9a759008835ff61b9a110c4f078e96 Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 30 Oct 2025 14:15:26 -0400 Subject: [PATCH 5/7] Minor code clarity changes --- .../paratext_project_quote_convention_detector.py | 4 ++-- machine/punctuation_analysis/quote_convention_analysis.py | 2 +- machine/punctuation_analysis/standard_quote_conventions.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py index 89ec698e..2c79fbcb 100644 --- a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py @@ -22,10 +22,10 @@ def __init__(self) -> None: def record_book_results( self, - quote_convention_analysis: Optional[QuoteConventionAnalysis], + quote_convention_analysis: QuoteConventionAnalysis, tabulated_quotation_marks: QuotationMarkTabulator, ) -> None: - if quote_convention_analysis is None or quote_convention_analysis.weight == 0: + if quote_convention_analysis.best_quote_convention is None or quote_convention_analysis.weight == 0: return self._total_tabulated_quotation_marks.tabulate_from(tabulated_quotation_marks) diff --git a/machine/punctuation_analysis/quote_convention_analysis.py b/machine/punctuation_analysis/quote_convention_analysis.py index ae637b84..d4765c14 100644 --- a/machine/punctuation_analysis/quote_convention_analysis.py +++ b/machine/punctuation_analysis/quote_convention_analysis.py @@ -10,7 +10,7 @@ def __init__( self, convention_scores: dict[QuoteConvention, float], tabulated_quotation_marks: QuotationMarkTabulator, - analysis_weight: float = 1.0, + analysis_weight: float = 1.0, # weight is used for combining scores for multiple books ): self._convention_scores = convention_scores if len(convention_scores) > 0: diff --git a/machine/punctuation_analysis/standard_quote_conventions.py b/machine/punctuation_analysis/standard_quote_conventions.py index 5e69eb75..fddda562 100644 --- a/machine/punctuation_analysis/standard_quote_conventions.py +++ b/machine/punctuation_analysis/standard_quote_conventions.py @@ -187,6 +187,7 @@ [ SingleLevelQuoteConvention("\u00ab", "\u00bb"), SingleLevelQuoteConvention("\u2019", "\u2018"), + SingleLevelQuoteConvention("\u201d", "\u201c"), ], ), QuoteConvention( From afa93ba970a7a7fe7b533dca0caffd156016a3c3 Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 30 Oct 2025 14:45:25 -0400 Subject: [PATCH 6/7] Fix linting issue --- tests/corpora/test_usfm_manual.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py index 6247634f..241ba195 100644 --- a/tests/corpora/test_usfm_manual.py +++ b/tests/corpora/test_usfm_manual.py @@ -24,7 +24,7 @@ ZipParatextProjectSettingsParser, ZipParatextProjectTextUpdater, ) -from machine.punctuation_analysis import QuoteConventionDetector, ZipParatextProjectQuoteConventionDetector +from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") @@ -135,18 +135,13 @@ def get_usfm(project_path: Path): @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") def test_analyze_corpora_quote_conventions(): - source_handler = QuoteConventionDetector() source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive) - source_quote_convention_detector.get_quote_convention_analysis(source_handler) + source_analysis = source_quote_convention_detector.get_quote_convention_analysis() - target_handler = QuoteConventionDetector() target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r") target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive) - target_quote_convention_detector.get_quote_convention_analysis(target_handler) + target_analysis = target_quote_convention_detector.get_quote_convention_analysis() - source_analysis = source_handler.detect_quote_convention() - target_analysis = target_handler.detect_quote_convention() - - assert source_analysis is not None - assert target_analysis is not None + assert source_analysis.best_quote_convention is not None + assert target_analysis.best_quote_convention is not None From 2e699e25bc487135f3c2bdf92946c547fa2ec676 Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 6 Nov 2025 14:47:28 -0500 Subject: [PATCH 7/7] Address reviewer comments + refactor weighted average --- ...atext_project_quote_convention_detector.py | 50 ++----------------- .../punctuation_analysis/quote_convention.py | 2 +- .../quote_convention_analysis.py | 44 +++++++++++----- .../quote_convention_detector.py | 12 +---- 4 files changed, 37 insertions(+), 71 deletions(-) diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py index 2c79fbcb..4b46d136 100644 --- a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from collections import defaultdict from typing import BinaryIO, Dict, List, Optional, Union from ..corpora.paratext_project_settings import ParatextProjectSettings @@ -7,47 +6,10 @@ from ..corpora.usfm_parser import parse_usfm from ..scripture.canon import book_id_to_number, get_scripture_books from ..utils.typeshed import StrPath -from .quotation_mark_tabulator import QuotationMarkTabulator -from .quote_convention import QuoteConvention from .quote_convention_analysis import QuoteConventionAnalysis from .quote_convention_detector import QuoteConventionDetector -class WeightedAverageQuoteConventionAnalysisBuilder: - def __init__(self) -> None: - self._total_weight: float = 0 - self._convention_votes: Dict[str, float] = defaultdict(float) - self._quote_conventions_by_name: Dict[str, QuoteConvention] = {} - self._total_tabulated_quotation_marks = QuotationMarkTabulator() - - def record_book_results( - self, - quote_convention_analysis: QuoteConventionAnalysis, - tabulated_quotation_marks: QuotationMarkTabulator, - ) -> None: - if quote_convention_analysis.best_quote_convention is None or quote_convention_analysis.weight == 0: - return - - self._total_tabulated_quotation_marks.tabulate_from(tabulated_quotation_marks) - - self._total_weight += quote_convention_analysis.weight - for convention, score in quote_convention_analysis.get_all_convention_scores(): - if convention.name not in self._quote_conventions_by_name: - self._quote_conventions_by_name[convention.name] = convention - self._convention_votes[convention.name] += score * quote_convention_analysis.weight - - def to_quote_convention_analysis(self) -> QuoteConventionAnalysis: - quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(self._total_tabulated_quotation_marks) - - for convention_name, total_score in self._convention_votes.items(): - if total_score > 0: - quote_convention_analysis_builder.record_convention_score( - self._quote_conventions_by_name[convention_name], total_score / self._total_weight - ) - - return quote_convention_analysis_builder.build() - - class ParatextProjectQuoteConventionDetector(ABC): def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None: if isinstance(settings, ParatextProjectSettingsParserBase): @@ -59,7 +21,7 @@ def get_quote_convention_analysis( self, include_chapters: Optional[Dict[int, List[int]]] = None ) -> QuoteConventionAnalysis: - weighted_average_quote_convention_analysis_builder = WeightedAverageQuoteConventionAnalysisBuilder() + book_quote_convention_analyses: List[QuoteConventionAnalysis] = [] for book_id in get_scripture_books(): if include_chapters is not None and book_id_to_number(book_id) not in include_chapters: @@ -82,14 +44,10 @@ def get_quote_convention_analysis( ) raise RuntimeError(error_message) from e - quote_convention_analysis, tabulated_quotation_marks = ( - handler.detect_quote_convention_and_get_tabulated_quotation_marks(include_chapters) - ) - weighted_average_quote_convention_analysis_builder.record_book_results( - quote_convention_analysis, tabulated_quotation_marks - ) + quote_convention_analysis = handler.detect_quote_convention(include_chapters) + book_quote_convention_analyses.append(quote_convention_analysis) - return weighted_average_quote_convention_analysis_builder.to_quote_convention_analysis() + return QuoteConventionAnalysis.combine_with_weighted_average(book_quote_convention_analyses) @abstractmethod def _exists(self, file_name: StrPath) -> bool: ... diff --git a/machine/punctuation_analysis/quote_convention.py b/machine/punctuation_analysis/quote_convention.py index 3160894c..9dda42e3 100644 --- a/machine/punctuation_analysis/quote_convention.py +++ b/machine/punctuation_analysis/quote_convention.py @@ -61,7 +61,7 @@ def __eq__(self, value): return True def __hash__(self) -> int: - return hash((tuple(self.level_conventions))) + return hash(tuple(self.level_conventions)) @property def name(self) -> str: diff --git a/machine/punctuation_analysis/quote_convention_analysis.py b/machine/punctuation_analysis/quote_convention_analysis.py index d4765c14..1de547fc 100644 --- a/machine/punctuation_analysis/quote_convention_analysis.py +++ b/machine/punctuation_analysis/quote_convention_analysis.py @@ -1,4 +1,5 @@ -from typing import Optional +from collections import defaultdict +from typing import Dict, List, Optional from .quotation_mark_tabulator import QuotationMarkTabulator from .quote_convention import QuoteConvention @@ -14,21 +15,16 @@ def __init__( ): self._convention_scores = convention_scores if len(convention_scores) > 0: - self._best_quote_convention_score = max(convention_scores.items(), key=lambda item: item[1])[1] + (self._best_quote_convention, self._best_quote_convention_score) = max( + convention_scores.items(), key=lambda item: item[1] + ) else: self._best_quote_convention_score = 0 - - if self._best_quote_convention_score > 0: - self._best_quote_convention = max(convention_scores.items(), key=lambda item: item[1])[0] - else: self._best_quote_convention = None self._tabulated_quotation_marks = tabulated_quotation_marks self._analysis_weight = analysis_weight - def get_all_convention_scores(self) -> list[tuple[QuoteConvention, float]]: - return list(self._convention_scores.items()) - @property def analysis_summary(self) -> str: return self._tabulated_quotation_marks.get_summary_message() @@ -41,10 +37,6 @@ def best_quote_convention(self) -> Optional[QuoteConvention]: def best_quote_convention_score(self) -> float: return self._best_quote_convention_score - @property - def weight(self) -> float: - return self._analysis_weight - class Builder: def __init__(self, tabulated_quotation_marks: QuotationMarkTabulator): self._convention_scores: dict[QuoteConvention, float] = {} @@ -59,3 +51,29 @@ def build(self) -> "QuoteConventionAnalysis": self._tabulated_quotation_marks, self._tabulated_quotation_marks.get_total_quotation_mark_count(), ) + + @staticmethod + def combine_with_weighted_average( + quote_convention_analyses: List["QuoteConventionAnalysis"], + ) -> "QuoteConventionAnalysis": + total_weight: float = 0 + convention_votes: Dict[str, float] = defaultdict(float) + quote_conventions_by_name: Dict[str, QuoteConvention] = {} + total_tabulated_quotation_marks = QuotationMarkTabulator() + for quote_convention_analysis in quote_convention_analyses: + total_tabulated_quotation_marks.tabulate_from(quote_convention_analysis._tabulated_quotation_marks) + total_weight += quote_convention_analysis._analysis_weight + for convention, score in quote_convention_analysis._convention_scores.items(): + if convention.name not in quote_conventions_by_name: + quote_conventions_by_name[convention.name] = convention + convention_votes[convention.name] += score * quote_convention_analysis._analysis_weight + + quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(total_tabulated_quotation_marks) + + for convention_name, total_score in convention_votes.items(): + if total_score > 0: + quote_convention_analysis_builder.record_convention_score( + quote_conventions_by_name[convention_name], total_score / total_weight + ) + + return quote_convention_analysis_builder.build() diff --git a/machine/punctuation_analysis/quote_convention_detector.py b/machine/punctuation_analysis/quote_convention_detector.py index 21f2b827..ca1662d9 100644 --- a/machine/punctuation_analysis/quote_convention_detector.py +++ b/machine/punctuation_analysis/quote_convention_detector.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional from .chapter import Chapter from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver @@ -49,13 +49,3 @@ def detect_quote_convention( self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters)) return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator) - - def detect_quote_convention_and_get_tabulated_quotation_marks( - self, include_chapters: Optional[Dict[int, List[int]]] = None - ) -> Tuple[QuoteConventionAnalysis, QuotationMarkTabulator]: - self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters)) - - return ( - STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator), - self._quotation_mark_tabulator, - )