Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion machine/punctuation_analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .quote_convention_detector import QuoteConventionDetector
from .quote_convention_set import QuoteConventionSet
from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
from .text_segment import TextSegment
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,51 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import BinaryIO, Dict, List, Optional, Union

from ..corpora.paratext_project_settings import ParatextProjectSettings
from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from ..corpora.usfm_parser import parse_usfm
from ..scripture.canon import book_id_to_number, get_scripture_books
from ..utils.typeshed import StrPath
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_detector import QuoteConventionDetector


class WeightedAverageQuoteConventionAnalysisBuilder:
    """Combine per-book quote convention analyses into one weighted analysis.

    Each recorded book contributes ``score * weight`` votes to every convention
    it scored.  The final score of a convention is its vote total divided by
    the summed weight of all recorded books.
    """

    def __init__(self) -> None:
        self._total_weight: float = 0
        self._convention_votes: Dict[str, float] = defaultdict(float)
        self._quote_conventions_by_name: Dict[str, QuoteConvention] = {}
        self._total_tabulated_quotation_marks = QuotationMarkTabulator()

    def record_book_results(
        self,
        quote_convention_analysis: QuoteConventionAnalysis,
        tabulated_quotation_marks: QuotationMarkTabulator,
    ) -> None:
        """Fold one book's analysis and tabulated marks into the running totals.

        Books with no best convention or with zero weight are ignored entirely.
        """
        if quote_convention_analysis.best_quote_convention is None:
            return
        book_weight = quote_convention_analysis.weight
        if book_weight == 0:
            return

        self._total_tabulated_quotation_marks.tabulate_from(tabulated_quotation_marks)
        self._total_weight += book_weight

        for convention, score in quote_convention_analysis.get_all_convention_scores():
            # Remember the first convention object seen under each name.
            self._quote_conventions_by_name.setdefault(convention.name, convention)
            self._convention_votes[convention.name] += score * book_weight

    def to_quote_convention_analysis(self) -> QuoteConventionAnalysis:
        """Build the combined analysis from everything recorded so far.

        Only conventions with a positive vote total are included.
        """
        analysis_builder = QuoteConventionAnalysis.Builder(self._total_tabulated_quotation_marks)

        positive_votes = (
            (name, vote_total) for name, vote_total in self._convention_votes.items() if vote_total > 0
        )
        for name, vote_total in positive_votes:
            analysis_builder.record_convention_score(
                self._quote_conventions_by_name[name], vote_total / self._total_weight
            )

        return analysis_builder.build()


class ParatextProjectQuoteConventionDetector(ABC):
Expand All @@ -17,15 +56,20 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
self._settings = settings

def get_quote_convention_analysis(
self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Optional[QuoteConventionAnalysis]:
handler = QuoteConventionDetector() if handler is None else handler
self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> QuoteConventionAnalysis:

weighted_average_quote_convention_analysis_builder = WeightedAverageQuoteConventionAnalysisBuilder()

for book_id in get_scripture_books():
if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
continue
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
continue

handler = QuoteConventionDetector()

with self._open(file_name) as sfm_file:
usfm: str = sfm_file.read().decode(self._settings.encoding)
try:
Expand All @@ -37,7 +81,15 @@ def get_quote_convention_analysis(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e
return handler.detect_quote_convention(include_chapters)

quote_convention_analysis, tabulated_quotation_marks = (
handler.detect_quote_convention_and_get_tabulated_quotation_marks(include_chapters)
)
weighted_average_quote_convention_analysis_builder.record_book_results(
quote_convention_analysis, tabulated_quotation_marks
)

return weighted_average_quote_convention_analysis_builder.to_quote_convention_analysis()

@abstractmethod
def _exists(self, file_name: StrPath) -> bool: ...
Expand Down
51 changes: 39 additions & 12 deletions machine/punctuation_analysis/quotation_mark_tabulator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import Counter, defaultdict
from typing import List
from typing import Dict, List

from .quotation_mark_direction import QuotationMarkDirection
from .quotation_mark_metadata import QuotationMarkMetadata
Expand All @@ -15,6 +15,10 @@ def count_quotation_mark(self, quotation_mark: str) -> None:
self._quotation_mark_counter.update([quotation_mark])
self._total_count += 1

def count_from(self, quotation_mark_counts: "QuotationMarkCounts") -> None:
    """Add every count held by ``quotation_mark_counts`` to this object.

    Both the per-mark counter and the running total are increased; the
    argument itself is not modified.
    """
    self._total_count += quotation_mark_counts._total_count
    self._quotation_mark_counter.update(quotation_mark_counts._quotation_mark_counter)

def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]:
    """Return ``(mark, count_of_that_mark, total_count)`` for the most common mark.

    Raises:
        IndexError: if no quotation mark has been counted yet.
    """
    best_mark, best_count = self._quotation_mark_counter.most_common(1)[0]
    return (best_mark, best_count, self._total_count)

Expand All @@ -36,6 +40,13 @@ def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None:
for quotation_mark in quotation_marks:
self._count_quotation_mark(quotation_mark)

def tabulate_from(self, tabulated_quotation_marks: "QuotationMarkTabulator") -> None:
    """Merge the counts of another tabulator into this one.

    Every (depth, direction) entry of ``tabulated_quotation_marks`` is added
    to the entry stored under the same key in this tabulator.
    """
    incoming_counts = tabulated_quotation_marks._quotation_counts_by_depth_and_direction
    for depth_and_direction in incoming_counts:
        own_counts = self._quotation_counts_by_depth_and_direction[depth_and_direction]
        own_counts.count_from(incoming_counts[depth_and_direction])

def _count_quotation_mark(self, quotation_mark: QuotationMarkMetadata) -> None:
    """Record one quotation mark under its (depth, direction) key."""
    depth_and_direction = (quotation_mark.depth, quotation_mark.direction)
    counts_for_key = self._quotation_counts_by_depth_and_direction[depth_and_direction]
    counts_for_key.count_quotation_mark(quotation_mark.quotation_mark)
Expand All @@ -48,23 +59,39 @@ def _find_most_common_quotation_mark_with_depth_and_direction(
) -> tuple[str, int, int]:
return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion()

def get_total_quotation_mark_count(self) -> int:
    """Return the number of observed quotation marks summed over all (depth, direction) entries."""
    return sum(
        counts.get_observed_count() for counts in self._quotation_counts_by_depth_and_direction.values()
    )

def calculate_similarity(self, quote_convention: QuoteConvention) -> float:
    """Score how closely the tabulated quotation marks follow ``quote_convention``.

    For every observed (depth, direction) entry, the marks equal to the
    convention's expected mark are counted as matches and accumulated per
    depth.  A depth's score is its match count scaled by the match ratio of
    the depth directly above it, so mismatches at a shallow depth also
    discount every deeper depth.

    Args:
        quote_convention: the convention supplying the expected mark for each
            (depth, direction) pair.

    Returns:
        The summed depth scores divided by the total number of observed marks,
        or 0 when no marks have been tabulated.
    """
    observed_by_depth: Dict[int, int] = defaultdict(int)
    matching_by_depth: Dict[int, int] = defaultdict(int)

    for (depth, direction), counts in self._quotation_counts_by_depth_and_direction.items():
        expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction)
        observed = counts.get_observed_count()
        observed_by_depth[depth] += observed
        matching_by_depth[depth] += observed - counts.calculate_num_differences(expected_quotation_mark)

    # The scores of greater depths depend on the scores of shallower depths,
    # so depths must be visited in increasing order.
    scores_by_depth: Dict[int, float] = {}
    for depth in sorted(observed_by_depth):
        parent_depth = depth - 1
        if parent_depth in scores_by_depth and observed_by_depth[parent_depth] > 0:
            parent_ratio = scores_by_depth[parent_depth] / observed_by_depth[parent_depth]
        else:
            # A parent depth that is absent, or present with no observed marks,
            # gives no evidence either way; do not discount (and do not divide by zero).
            parent_ratio = 1
        scores_by_depth[depth] = parent_ratio * matching_by_depth[depth]

    total_marks = sum(observed_by_depth.values())
    total_score = sum(scores_by_depth.values())

    if total_marks == 0:
        return 0
    return total_score / total_marks

def get_summary_message(self) -> str:
message_lines: List[str] = []
Expand Down
6 changes: 6 additions & 0 deletions machine/punctuation_analysis/quote_convention.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ def normalize(self) -> "SingleLevelQuoteConvention":
)
return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark)

def __hash__(self) -> int:
    """Hash the (opening mark, closing mark) pair."""
    quotation_mark_pair = (self.opening_quotation_mark, self.closing_quotation_mark)
    return hash(quotation_mark_pair)


class QuoteConvention:
def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]):
Expand All @@ -57,6 +60,9 @@ def __eq__(self, value):
return False
return True

def __hash__(self) -> int:
    """Hash the ordered level conventions; the convention's name is not part of the hash."""
    levels = tuple(self.level_conventions)
    return hash(levels)

@property
def name(self) -> str:
return self._name
Expand Down
61 changes: 61 additions & 0 deletions machine/punctuation_analysis/quote_convention_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Optional

from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention


class QuoteConventionAnalysis:
    """Scores of candidate quote conventions for one set of tabulated quotation marks.

    The best convention is the highest-scoring entry, reported only when its
    score is positive; otherwise ``best_quote_convention`` is ``None``.
    """

    def __init__(
        self,
        convention_scores: dict[QuoteConvention, float],
        tabulated_quotation_marks: QuotationMarkTabulator,
        analysis_weight: float = 1.0,  # weight is used for combining scores for multiple books
    ):
        self._convention_scores = convention_scores
        self._tabulated_quotation_marks = tabulated_quotation_marks
        self._analysis_weight = analysis_weight

        # A single scan yields both the winner and its score; on ties max()
        # keeps the first entry in dict order.
        top_convention: Optional[QuoteConvention] = None
        top_score: float = 0
        if len(convention_scores) > 0:
            top_convention, top_score = max(convention_scores.items(), key=lambda entry: entry[1])

        self._best_quote_convention_score = top_score
        self._best_quote_convention = top_convention if top_score > 0 else None

    def get_all_convention_scores(self) -> list[tuple[QuoteConvention, float]]:
        """Return every ``(convention, score)`` pair as a new list."""
        return [*self._convention_scores.items()]

    @property
    def analysis_summary(self) -> str:
        """Summary message produced by the tabulated quotation marks."""
        return self._tabulated_quotation_marks.get_summary_message()

    @property
    def best_quote_convention(self) -> Optional[QuoteConvention]:
        """Highest-scoring convention, or ``None`` when no score is positive."""
        return self._best_quote_convention

    @property
    def best_quote_convention_score(self) -> float:
        """Highest recorded score, or 0 when no scores were supplied."""
        return self._best_quote_convention_score

    @property
    def weight(self) -> float:
        """Weight given to this analysis when it is combined with others."""
        return self._analysis_weight

    class Builder:
        """Collect convention scores one at a time, then build the analysis."""

        def __init__(self, tabulated_quotation_marks: QuotationMarkTabulator):
            self._convention_scores: dict[QuoteConvention, float] = {}
            self._tabulated_quotation_marks = tabulated_quotation_marks

        def record_convention_score(self, quote_convention: QuoteConvention, score: float) -> None:
            """Store ``score`` for ``quote_convention``, replacing any earlier value."""
            self._convention_scores[quote_convention] = score

        def build(self) -> "QuoteConventionAnalysis":
            """Create the analysis, weighted by the total tabulated quotation mark count."""
            tabulated = self._tabulated_quotation_marks
            return QuoteConventionAnalysis(
                self._convention_scores,
                tabulated,
                tabulated.get_total_quotation_mark_count(),
            )
32 changes: 13 additions & 19 deletions machine/punctuation_analysis/quote_convention_detector.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from dataclasses import dataclass
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple

from .chapter import Chapter
from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
Expand All @@ -8,20 +7,13 @@
from .quotation_mark_metadata import QuotationMarkMetadata
from .quotation_mark_string_match import QuotationMarkStringMatch
from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
from .quote_convention_set import QuoteConventionSet
from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
from .usfm_structure_extractor import UsfmStructureExtractor


@dataclass(frozen=True)
class QuoteConventionAnalysis:
    """Immutable record of a single detected quote convention."""

    # The convention selected by the detector.
    best_quote_convention: QuoteConvention
    # The score assigned to that convention.
    best_quote_convention_score: float
    # Summary text describing the analysis.
    analysis_summary: str

class QuoteConventionDetector(UsfmStructureExtractor):

def __init__(self):
Expand Down Expand Up @@ -53,15 +45,17 @@ def _count_quotation_marks_in_chapter(

def detect_quote_convention(
    self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> QuoteConventionAnalysis:
    """Tabulate the quotation marks of the selected chapters and score every standard convention.

    Args:
        include_chapters: passed unchanged to ``get_chapters`` to select which
            chapters are counted; ``None`` is forwarded as-is.

    Returns:
        The analysis produced by scoring all standard quote conventions
        against this detector's tabulated quotation marks.
    """
    self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))

    # The earlier single-best lookup (find_most_similar_convention) and its
    # Optional return path are dropped: the result was unused and the code
    # following the return could never run.
    return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator)
def detect_quote_convention_and_get_tabulated_quotation_marks(
    self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Tuple[QuoteConventionAnalysis, QuotationMarkTabulator]:
    """Score all standard conventions and also hand back the tabulator that was scored.

    Returns:
        A ``(analysis, tabulator)`` pair, where ``tabulator`` is this
        detector's own tabulator object rather than a copy.
    """
    selected_chapters = self.get_chapters(include_chapters)
    self._count_quotation_marks_in_chapters(selected_chapters)

    tabulator = self._quotation_mark_tabulator
    analysis = STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(tabulator)
    return (analysis, tabulator)
9 changes: 9 additions & 0 deletions machine/punctuation_analysis/quote_convention_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .quotation_mark_direction import QuotationMarkDirection
from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis


class QuoteConventionSet:
Expand Down Expand Up @@ -149,3 +150,11 @@ def find_most_similar_convention(
best_quote_convention = quote_convention

return (best_quote_convention, best_similarity)

def score_all_quote_conventions(self, tabulated_quotation_marks: QuotationMarkTabulator) -> QuoteConventionAnalysis:
    """Score every convention in this set against the tabulated quotation marks.

    Each convention's score is the similarity reported by
    ``tabulated_quotation_marks.calculate_similarity``.
    """
    analysis_builder = QuoteConventionAnalysis.Builder(tabulated_quotation_marks)
    for candidate in self._conventions:
        similarity = tabulated_quotation_marks.calculate_similarity(candidate)
        analysis_builder.record_convention_score(candidate, similarity)
    return analysis_builder.build()
9 changes: 9 additions & 0 deletions machine/punctuation_analysis/standard_quote_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,15 @@
[
SingleLevelQuoteConvention("\u00ab", "\u00bb"),
SingleLevelQuoteConvention("\u2019", "\u2018"),
SingleLevelQuoteConvention("\u201d", "\u201c"),
],
),
QuoteConvention(
"arabic_inspired_western_european",
[
SingleLevelQuoteConvention("\u00ab", "\u00bb"),
SingleLevelQuoteConvention("\u201d", "\u201c"),
SingleLevelQuoteConvention("\u2019", "\u2018"),
],
),
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
from typing import BinaryIO, Optional
from zipfile import ZipFile

from ..corpora.zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector


class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector):
def __init__(self, archive: ZipFile) -> None:
    """Create a detector whose project files are read from ``archive``."""
    # Keep the archive for this class's own file lookups, then give the base
    # class a settings parser built from the same archive.
    self._archive = archive
    settings_parser = ZipParatextProjectSettingsParser(archive)
    super().__init__(settings_parser)

def _exists(self, file_name: str) -> bool:
    """Return True when ``file_name`` exactly matches a member name of the archive."""
    member_names = self._archive.namelist()
    return file_name in member_names
Expand Down
Loading
Loading