diff --git a/machine/punctuation_analysis/__init__.py b/machine/punctuation_analysis/__init__.py index aa28e3b..1773cbf 100644 --- a/machine/punctuation_analysis/__init__.py +++ b/machine/punctuation_analysis/__init__.py @@ -32,9 +32,10 @@ from .quotation_mark_update_settings import QuotationMarkUpdateSettings from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy from .quote_convention import QuoteConvention, SingleLevelQuoteConvention +from .quote_convention_analysis import QuoteConventionAnalysis from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings -from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector +from .quote_convention_detector import QuoteConventionDetector from .quote_convention_set import QuoteConventionSet from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS from .text_segment import TextSegment diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py index 808a717..4b46d13 100644 --- a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py @@ -6,7 +6,8 @@ from ..corpora.usfm_parser import parse_usfm from ..scripture.canon import book_id_to_number, get_scripture_books from ..utils.typeshed import StrPath -from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector +from .quote_convention_analysis import QuoteConventionAnalysis +from .quote_convention_detector import QuoteConventionDetector class ParatextProjectQuoteConventionDetector(ABC): @@ -17,15 +18,20 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti self._settings = settings def get_quote_convention_analysis( - self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None - ) -> Optional[QuoteConventionAnalysis]: - handler = QuoteConventionDetector() if handler is None else handler + self, include_chapters: Optional[Dict[int, List[int]]] = None + ) -> QuoteConventionAnalysis: + + book_quote_convention_analyses: List[QuoteConventionAnalysis] = [] + for book_id in get_scripture_books(): if include_chapters is not None and book_id_to_number(book_id) not in include_chapters: continue file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): continue + + handler = QuoteConventionDetector() + with self._open(file_name) as sfm_file: usfm: str = sfm_file.read().decode(self._settings.encoding) try: @@ -37,7 +43,11 @@ def get_quote_convention_analysis( f". Error: '{e}'" ) raise RuntimeError(error_message) from e - return handler.detect_quote_convention(include_chapters) + + quote_convention_analysis = handler.detect_quote_convention(include_chapters) + book_quote_convention_analyses.append(quote_convention_analysis) + + return QuoteConventionAnalysis.combine_with_weighted_average(book_quote_convention_analyses) @abstractmethod def _exists(self, file_name: StrPath) -> bool: ... diff --git a/machine/punctuation_analysis/quotation_mark_tabulator.py b/machine/punctuation_analysis/quotation_mark_tabulator.py index c76ff54..080a95b 100644 --- a/machine/punctuation_analysis/quotation_mark_tabulator.py +++ b/machine/punctuation_analysis/quotation_mark_tabulator.py @@ -1,5 +1,5 @@ from collections import Counter, defaultdict -from typing import List +from typing import Dict, List from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_metadata import QuotationMarkMetadata @@ -15,6 +15,10 @@ def count_quotation_mark(self, quotation_mark: str) -> None: self._quotation_mark_counter.update([quotation_mark]) self._total_count += 1 + def count_from(self, quotation_mark_counts: "QuotationMarkCounts") -> None: + self._quotation_mark_counter.update(quotation_mark_counts._quotation_mark_counter) + self._total_count += quotation_mark_counts._total_count + def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]: return self._quotation_mark_counter.most_common(1)[0] + (self._total_count,) @@ -36,6 +40,13 @@ def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None: for quotation_mark in quotation_marks: self._count_quotation_mark(quotation_mark) + def tabulate_from(self, tabulated_quotation_marks: "QuotationMarkTabulator") -> None: + for ( + depth_and_direction, + quotation_mark_counts, + ) in tabulated_quotation_marks._quotation_counts_by_depth_and_direction.items(): + self._quotation_counts_by_depth_and_direction[depth_and_direction].count_from(quotation_mark_counts) + def _count_quotation_mark(self, quotation_mark: QuotationMarkMetadata) -> None: key = (quotation_mark.depth, quotation_mark.direction) self._quotation_counts_by_depth_and_direction[key].count_quotation_mark(quotation_mark.quotation_mark) @@ -48,23 +59,39 @@ def _find_most_common_quotation_mark_with_depth_and_direction( ) -> tuple[str, int, int]: return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion() + def get_total_quotation_mark_count(self) -> int: + total_count = 0 + for counts in self._quotation_counts_by_depth_and_direction.values(): + total_count += counts.get_observed_count() + return total_count + def calculate_similarity(self, quote_convention: QuoteConvention) -> float: - weighted_difference = 0 - total_weight = 0 - for depth, direction in self._quotation_counts_by_depth_and_direction: + num_marks_by_depth: Dict[int, int] = defaultdict(int) + num_matching_marks_by_depth: Dict[int, int] = defaultdict(int) + + for depth, direction in sorted(self._quotation_counts_by_depth_and_direction, key=lambda item: item[0]): expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction) - # Give higher weight to shallower depths, since deeper marks are more likely to be mistakes - weighted_difference += self._quotation_counts_by_depth_and_direction[ + num_matching_marks = self._quotation_counts_by_depth_and_direction[(depth, direction)].get_observed_count() + num_marks_by_depth[depth] += num_matching_marks + num_matching_marks_by_depth[depth] += num_matching_marks - self._quotation_counts_by_depth_and_direction[ (depth, direction) - ].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth) - total_weight += self._quotation_counts_by_depth_and_direction[ - (depth, direction) - ].get_observed_count() * 2 ** (-depth) + ].calculate_num_differences(expected_quotation_mark) + + # The scores of greater depths depend on the scores of shallower depths + scores_by_depth: Dict[int, float] = defaultdict(float) + for depth in sorted(num_marks_by_depth.keys()): + previous_depth_score = ( + scores_by_depth[depth - 1] / num_marks_by_depth[depth - 1] if depth - 1 in scores_by_depth else 1 + ) + scores_by_depth[depth] = previous_depth_score * num_matching_marks_by_depth[depth] + + total_marks = sum(num_marks_by_depth.values()) + total_score = sum(scores_by_depth.values()) - if total_weight == 0: + if total_marks == 0: return 0 - return 1 - (weighted_difference / total_weight) + return total_score / total_marks def get_summary_message(self) -> str: message_lines: List[str] = [] diff --git a/machine/punctuation_analysis/quote_convention.py b/machine/punctuation_analysis/quote_convention.py index 386cd55..9dda42e 100644 --- a/machine/punctuation_analysis/quote_convention.py +++ b/machine/punctuation_analysis/quote_convention.py @@ -37,6 +37,9 @@ def normalize(self) -> "SingleLevelQuoteConvention": ) return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark) + def __hash__(self) -> int: + return hash((self.opening_quotation_mark, self.closing_quotation_mark)) + class QuoteConvention: def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]): @@ -57,6 +60,9 @@ def __eq__(self, value): return False return True + def __hash__(self) -> int: + return hash(tuple(self.level_conventions)) + @property def name(self) -> str: return self._name diff --git a/machine/punctuation_analysis/quote_convention_analysis.py b/machine/punctuation_analysis/quote_convention_analysis.py new file mode 100644 index 0000000..1de547f --- /dev/null +++ b/machine/punctuation_analysis/quote_convention_analysis.py @@ -0,0 +1,79 @@ +from collections import defaultdict +from typing import Dict, List, Optional + +from .quotation_mark_tabulator import QuotationMarkTabulator +from .quote_convention import QuoteConvention + + +class QuoteConventionAnalysis: + + def __init__( + self, + convention_scores: dict[QuoteConvention, float], + tabulated_quotation_marks: QuotationMarkTabulator, + analysis_weight: float = 1.0, # weight is used for combining scores for multiple books + ): + self._convention_scores = convention_scores + if len(convention_scores) > 0: + (self._best_quote_convention, self._best_quote_convention_score) = max( + convention_scores.items(), key=lambda item: item[1] + ) + else: + self._best_quote_convention_score = 0 + self._best_quote_convention = None + + self._tabulated_quotation_marks = tabulated_quotation_marks + self._analysis_weight = analysis_weight + + @property + def analysis_summary(self) -> str: + return self._tabulated_quotation_marks.get_summary_message() + + @property + def best_quote_convention(self) -> Optional[QuoteConvention]: + return self._best_quote_convention + + @property + def best_quote_convention_score(self) -> float: + return self._best_quote_convention_score + + class Builder: + def __init__(self, tabulated_quotation_marks: QuotationMarkTabulator): + self._convention_scores: dict[QuoteConvention, float] = {} + self._tabulated_quotation_marks = tabulated_quotation_marks + + def record_convention_score(self, quote_convention: QuoteConvention, score: float) -> None: + self._convention_scores[quote_convention] = score + + def build(self) -> "QuoteConventionAnalysis": + return QuoteConventionAnalysis( + self._convention_scores, + self._tabulated_quotation_marks, + self._tabulated_quotation_marks.get_total_quotation_mark_count(), + ) + + @staticmethod + def combine_with_weighted_average( + quote_convention_analyses: List["QuoteConventionAnalysis"], + ) -> "QuoteConventionAnalysis": + total_weight: float = 0 + convention_votes: Dict[str, float] = defaultdict(float) + quote_conventions_by_name: Dict[str, QuoteConvention] = {} + total_tabulated_quotation_marks = QuotationMarkTabulator() + for quote_convention_analysis in quote_convention_analyses: + total_tabulated_quotation_marks.tabulate_from(quote_convention_analysis._tabulated_quotation_marks) + total_weight += quote_convention_analysis._analysis_weight + for convention, score in quote_convention_analysis._convention_scores.items(): + if convention.name not in quote_conventions_by_name: + quote_conventions_by_name[convention.name] = convention + convention_votes[convention.name] += score * quote_convention_analysis._analysis_weight + + quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(total_tabulated_quotation_marks) + + for convention_name, total_score in convention_votes.items(): + if total_score > 0: + quote_convention_analysis_builder.record_convention_score( + quote_conventions_by_name[convention_name], total_score / total_weight + ) + + return quote_convention_analysis_builder.build() diff --git a/machine/punctuation_analysis/quote_convention_detector.py b/machine/punctuation_analysis/quote_convention_detector.py index c37e813..ca1662d 100644 --- a/machine/punctuation_analysis/quote_convention_detector.py +++ b/machine/punctuation_analysis/quote_convention_detector.py @@ -1,4 +1,3 @@ -from dataclasses import dataclass from typing import Dict, List, Optional from .chapter import Chapter @@ -8,20 +7,13 @@ from .quotation_mark_metadata import QuotationMarkMetadata from .quotation_mark_string_match import QuotationMarkStringMatch from .quotation_mark_tabulator import QuotationMarkTabulator -from .quote_convention import QuoteConvention +from .quote_convention_analysis import QuoteConventionAnalysis from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings from .quote_convention_set import QuoteConventionSet from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS from .usfm_structure_extractor import UsfmStructureExtractor -@dataclass(frozen=True) -class QuoteConventionAnalysis: - best_quote_convention: QuoteConvention - best_quote_convention_score: float - analysis_summary: str - - class QuoteConventionDetector(UsfmStructureExtractor): def __init__(self): @@ -53,15 +45,7 @@ def _count_quotation_marks_in_chapter( def detect_quote_convention( self, include_chapters: Optional[Dict[int, List[int]]] = None - ) -> Optional[QuoteConventionAnalysis]: + ) -> QuoteConventionAnalysis: self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters)) - (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention( - self._quotation_mark_tabulator - ) - - if score > 0 and best_quote_convention is not None: - return QuoteConventionAnalysis( - best_quote_convention, score, self._quotation_mark_tabulator.get_summary_message() - ) - return None + return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator) diff --git a/machine/punctuation_analysis/quote_convention_set.py b/machine/punctuation_analysis/quote_convention_set.py index bef1563..cddb6f2 100644 --- a/machine/punctuation_analysis/quote_convention_set.py +++ b/machine/punctuation_analysis/quote_convention_set.py @@ -7,6 +7,7 @@ from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_tabulator import QuotationMarkTabulator from .quote_convention import QuoteConvention +from .quote_convention_analysis import QuoteConventionAnalysis class QuoteConventionSet: @@ -149,3 +150,11 @@ def find_most_similar_convention( best_quote_convention = quote_convention return (best_quote_convention, best_similarity) + + def score_all_quote_conventions(self, tabulated_quotation_marks: QuotationMarkTabulator) -> QuoteConventionAnalysis: + quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(tabulated_quotation_marks) + for quote_convention in self._conventions: + score = tabulated_quotation_marks.calculate_similarity(quote_convention) + quote_convention_analysis_builder.record_convention_score(quote_convention, score) + + return quote_convention_analysis_builder.build() diff --git a/machine/punctuation_analysis/standard_quote_conventions.py b/machine/punctuation_analysis/standard_quote_conventions.py index b1292e1..fddda56 100644 --- a/machine/punctuation_analysis/standard_quote_conventions.py +++ b/machine/punctuation_analysis/standard_quote_conventions.py @@ -187,6 +187,15 @@ [ SingleLevelQuoteConvention("\u00ab", "\u00bb"), SingleLevelQuoteConvention("\u2019", "\u2018"), + SingleLevelQuoteConvention("\u201d", "\u201c"), + ], + ), + QuoteConvention( + "arabic_inspired_western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201d", "\u201c"), + SingleLevelQuoteConvention("\u2019", "\u2018"), ], ), ] diff --git a/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py index c0ccc90..e51b275 100644 --- a/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py @@ -2,12 +2,14 @@ from typing import BinaryIO, Optional from zipfile import ZipFile +from ..corpora.zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector): def __init__(self, archive: ZipFile) -> None: self._archive = archive + super().__init__(ZipParatextProjectSettingsParser(archive)) def _exists(self, file_name: str) -> bool: return file_name in self._archive.namelist() diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py index 6247634..241ba19 100644 --- a/tests/corpora/test_usfm_manual.py +++ b/tests/corpora/test_usfm_manual.py @@ -24,7 +24,7 @@ ZipParatextProjectSettingsParser, ZipParatextProjectTextUpdater, ) -from machine.punctuation_analysis import QuoteConventionDetector, ZipParatextProjectQuoteConventionDetector +from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") @@ -135,18 +135,13 @@ def get_usfm(project_path: Path): @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") def test_analyze_corpora_quote_conventions(): - source_handler = QuoteConventionDetector() source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive) - source_quote_convention_detector.get_quote_convention_analysis(source_handler) + source_analysis = source_quote_convention_detector.get_quote_convention_analysis() - target_handler = QuoteConventionDetector() target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r") target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive) - target_quote_convention_detector.get_quote_convention_analysis(target_handler) + target_analysis = target_quote_convention_detector.get_quote_convention_analysis() - source_analysis = source_handler.detect_quote_convention() - target_analysis = target_handler.detect_quote_convention() - - assert source_analysis is not None - assert target_analysis is not None + assert source_analysis.best_quote_convention is not None + assert target_analysis.best_quote_convention is not None diff --git a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py index 36e0f9f..7977b6b 100644 --- a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py +++ b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py @@ -27,7 +27,8 @@ def test_get_quote_convention() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention() - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention_score > 0.8 assert analysis.best_quote_convention.name == "standard_english" @@ -42,7 +43,8 @@ def test_get_quote_convention_by_book() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK") - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention_score > 0.8 assert analysis.best_quote_convention.name == "standard_french" @@ -61,7 +63,8 @@ def test_get_quote_convention_by_chapter() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK2,4-5") - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention_score > 0.66 assert analysis.best_quote_convention.name == "standard_french" @@ -76,7 +79,7 @@ def test_get_quote_convention_by_chapter_indeterminate() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT1,3") - assert analysis is None + assert analysis.best_quote_convention is None def test_get_quote_convention_invalid_book_code() -> None: @@ -87,7 +90,29 @@ def test_get_quote_convention_invalid_book_code() -> None: } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT") - assert analysis is None + assert analysis.best_quote_convention is None + + +def test_get_quote_convention_weighted_average_of_multiple_books() -> None: + env = _TestEnvironment( + files={ + "41MATTest.SFM": rf"""\id MAT +{get_test_chapter(1, standard_english_quote_convention)}""", + "42MRKTest.SFM": r"""\id MRK +\c 1 +\v 1 This "sentence uses a different" convention""", + } + ) + analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention() + + assert analysis.best_quote_convention is not None + assert analysis.best_quote_convention.name == "standard_english" + assert analysis.best_quote_convention_score > 0.8 + assert analysis.best_quote_convention_score < 0.9 + assert ( + analysis.analysis_summary + == "The most common level 1 quotation marks are “ (5 of 6 opening marks) and ” (5 of 6 closing marks)" + ) class _TestEnvironment: @@ -104,7 +129,7 @@ def __init__( def detector(self) -> ParatextProjectQuoteConventionDetector: return self._detector - def get_quote_convention(self, scripture_range: Optional[str] = None) -> Optional[QuoteConventionAnalysis]: + def get_quote_convention(self, scripture_range: Optional[str] = None) -> QuoteConventionAnalysis: chapters: Optional[Dict[int, List[int]]] = None if scripture_range is not None: chapters = get_chapters(scripture_range, ORIGINAL_VERSIFICATION) diff --git a/tests/punctuation_analysis/test_quotation_mark_tabulator.py b/tests/punctuation_analysis/test_quotation_mark_tabulator.py index 80c17ea..e7e8cfb 100644 --- a/tests/punctuation_analysis/test_quotation_mark_tabulator.py +++ b/tests/punctuation_analysis/test_quotation_mark_tabulator.py @@ -118,7 +118,7 @@ def test_calculate_similarity() -> None: ) assert two_level_quotation_mark_tabulator.calculate_similarity( QuoteConvention("", [SingleLevelQuoteConvention("\u201c", "\u201d")]) - ) == approx(0.66666666666667, rel=1e-9) + ) == approx(0.5, rel=1e-9) assert ( two_level_quotation_mark_tabulator.calculate_similarity( QuoteConvention( @@ -131,9 +131,12 @@ def test_calculate_similarity() -> None: QuoteConvention( "", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u00ab", "\u00bb")] ) - ) == approx(0.66666666666667, rel=1e-9) - assert two_level_quotation_mark_tabulator.calculate_similarity( - QuoteConvention( - "", [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u2018", "\u2019")] + ) == approx(0.5, rel=1e-9) + assert ( + two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u2018", "\u2019")] + ) ) - ) == approx(0.33333333333333, rel=1e-9) + == 0.0 + ) diff --git a/tests/punctuation_analysis/test_quote_convention_detector.py b/tests/punctuation_analysis/test_quote_convention_detector.py index 44ff3b0..6f34abb 100644 --- a/tests/punctuation_analysis/test_quote_convention_detector.py +++ b/tests/punctuation_analysis/test_quote_convention_detector.py @@ -1,5 +1,3 @@ -from typing import Union - from machine.corpora import parse_usfm from machine.punctuation_analysis import QuoteConventionAnalysis, QuoteConventionDetector @@ -15,7 +13,8 @@ def test_standard_english() -> None: ‘You shall not eat of any tree of the garden’?” """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_english" @@ -28,7 +27,8 @@ def test_typewriter_english() -> None: 'You shall not eat of any tree of the garden'?\" """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "typewriter_english" @@ -41,7 +41,8 @@ def test_british_english() -> None: “You shall not eat of any tree of the garden”?’ """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "british_english" @@ -54,7 +55,8 @@ def test_british_typewriter_english() -> None: \"You shall not eat of any tree of the garden\"?' """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "british_typewriter_english" @@ -67,7 +69,8 @@ def test_hybrid_typewriter_english() -> None: 'You shall not eat of any tree of the garden'?” """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "hybrid_typewriter_english" @@ -80,7 +83,8 @@ def test_standard_french() -> None: ‹You shall not eat of any tree of the garden›?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_french" @@ -93,7 +97,8 @@ def test_typewriter_french() -> None: ?>> """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "typewriter_french" @@ -109,7 +114,8 @@ def test_western_european() -> None: “You shall not eat of any tree of the garden”?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "western_european" @@ -122,7 +128,8 @@ def test_british_inspired_western_european() -> None: ‘You shall not eat of any tree of the garden’?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "british_inspired_western_european" @@ -135,7 +142,8 @@ def test_typewriter_western_european() -> None: "You shall not eat of any tree of the garden"?>> """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "typewriter_western_european" @@ -148,7 +156,8 @@ def test_typewriter_western_european_variant() -> None: ?" """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "typewriter_western_european_variant" @@ -161,7 +170,8 @@ def test_hybrid_typewriter_western_european() -> None: "You shall not eat of any tree of the garden"?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "hybrid_typewriter_western_european" @@ -174,7 +184,8 @@ def test_hybrid_british_typewriter_western_european() -> None: 'You shall not eat of any tree of the garden'?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "hybrid_british_typewriter_western_european" @@ -187,7 +198,8 @@ def test_central_european() -> None: ‚You shall not eat of any tree of the garden‘?“ """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "central_european" @@ -200,7 +212,8 @@ def test_central_european_guillemets() -> None: ›You shall not eat of any tree of the garden‹?« """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "central_european_guillemets" @@ -213,7 +226,8 @@ def test_standard_swedish() -> None: ’You shall not eat of any tree of the garden’?” """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_swedish" @@ -226,7 +240,8 @@ def test_standard_finnish() -> None: ’You shall not eat of any tree of the garden’?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_finnish" @@ -239,7 +254,8 @@ def test_eastern_european() -> None: ‚You shall not eat of any tree of the garden’?” """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "eastern_european" @@ -252,7 +268,8 @@ def test_standard_russian() -> None: „You shall not eat of any tree of the garden“?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_russian" @@ -265,7 +282,8 @@ def test_standard_arabic() -> None: ’You shall not eat of any tree of the garden‘?“ """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_arabic" @@ -278,7 +296,8 @@ def test_non_standard_arabic() -> None: ’You shall not eat of any tree of the garden‘?» """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "non-standard_arabic" @@ -295,11 +314,12 @@ def test_mismatched_quotation_marks() -> None: God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’ """ analysis = detect_quote_convention(usfm) - assert analysis is not None + + assert analysis.best_quote_convention is not None assert analysis.best_quote_convention.name == "standard_english" -def detect_quote_convention(usfm: str) -> Union[QuoteConventionAnalysis, None]: +def detect_quote_convention(usfm: str) -> QuoteConventionAnalysis: quote_convention_detector = QuoteConventionDetector() parse_usfm(usfm, quote_convention_detector) return quote_convention_detector.detect_quote_convention() diff --git a/tests/punctuation_analysis/test_quote_convention_set.py b/tests/punctuation_analysis/test_quote_convention_set.py index a2a87c4..2ddf165 100644 --- a/tests/punctuation_analysis/test_quote_convention_set.py +++ b/tests/punctuation_analysis/test_quote_convention_set.py @@ -1251,11 +1251,11 @@ def test_find_most_similar_convention() -> None: ) assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( standard_english_quote_convention, - approx(0.9, rel=1e-9), + approx(0.8333333333333, rel=1e-9), ) assert two_french_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( western_european_quote_convention, - approx(0.1, rel=1e-9), + 0, ) noisy_multiple_french_quotes_tabulator = QuotationMarkTabulator() @@ -1273,7 +1273,7 @@ def test_find_most_similar_convention() -> None: ) assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_french_quotes_tabulator) == ( standard_french_quote_convention, - approx(0.916666666666, rel=1e-9), + approx(0.875, rel=1e-9), ) too_deep_english_quotes_tabulator = QuotationMarkTabulator() @@ -1288,7 +1288,7 @@ def test_find_most_similar_convention() -> None: ) assert all_three_quote_convention_set.find_most_similar_convention(too_deep_english_quotes_tabulator) == ( standard_english_quote_convention, - approx(0.967741935483871, rel=1e-9), + approx(0.8, rel=1e-9), ) # in case of ties, the earlier convention in the list should be returned