From f71108f301a28377ab11830cce0c849707236f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mickae=CC=88l=20Menu?= Date: Fri, 10 Mar 2023 16:36:31 +0100 Subject: [PATCH 1/2] Fix parsing elements containing both a text node and child elements --- Makefile | 10 +++ .../iterators/HtmlResourceContentIterator.kt | 35 ++++++-- .../HtmlResourceContentIteratorTest.kt | 84 +++++++++++++++++++ 3 files changed, 120 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 91ab04c167..8e42a5b89f 100644 --- a/Makefile +++ b/Makefile @@ -2,9 +2,19 @@ SCRIPTS_PATH := readium/navigator/src/main/assets/_scripts help: @echo "Usage: make \n\n\ + lint\t\tLint the Kotlin sources with ktlint\n\ + format\tFormat the Kotlin sources with ktlint\n\ scripts\tBundle the Navigator EPUB scripts\n\ " +.PHONY: lint +lint: + ./gradlew ktlintCheck + +.PHONY: format +format: + ./gradlew ktlintFormat + .PHONY: scripts scripts: yarn --cwd "$(SCRIPTS_PATH)" install --frozen-lockfile diff --git a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt index 6bccdccf7e..facea2e3a7 100644 --- a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt +++ b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt @@ -158,12 +158,19 @@ class HtmlResourceContentIterator( private val elements = mutableListOf() private var startIndex = 0 + /** Segments accumulated for the current element. */ private val segmentsAcc = mutableListOf() + /** Text since the beginning of the current segment, after coalescing whitespaces. */ private var textAcc = StringBuilder() + /** Text content since the beginning of the resource, including whitespaces. */ private var wholeRawTextAcc: String? = null + /** Text content since the beginning of the current element, including whitespaces. */ private var elementRawTextAcc: String = "" + /** Text content since the beginning of the current segment, including whitespaces. */ private var rawTextAcc: String = "" + /** Language of the current segment. */ private var currentLanguage: String? = null + /** CSS selector of the current element. */ private var currentCssSelector: String? = null /** LIFO stack of the current element's block ancestors. */ @@ -240,9 +247,7 @@ class HtmlResourceContentIterator( } node.isBlock -> { - segmentsAcc.clear() - textAcc.clear() - rawTextAcc = "" + flushText() currentCssSelector = node.cssSelector() } } @@ -250,7 +255,7 @@ class HtmlResourceContentIterator( } override fun tail(node: Node, depth: Int) { - if (node is TextNode) { + if (node is TextNode && node.wholeText.isNotBlank()) { val language = node.language if (currentLanguage != language) { flushSegment() @@ -283,6 +288,11 @@ class HtmlResourceContentIterator( if (startElement != null && breadcrumbs.lastOrNull() == startElement) { startIndex = elements.size } + + // Trim the end of the last segment's text to get a cleaner output for the TextElement. + // Only whitespaces between the segments are meaningful. + segmentsAcc[segmentsAcc.size - 1] = segmentsAcc.last().run { copy(text = text.trimEnd()) } + elements.add( Content.TextElement( locator = baseLocator.copy( @@ -293,9 +303,9 @@ class HtmlResourceContentIterator( } } ), - text = Locator.Text( - before = segmentsAcc.firstOrNull()?.locator?.text?.before, - highlight = elementRawTextAcc, + text = Locator.Text.trimmingText( + elementRawTextAcc, + before = segmentsAcc.firstOrNull()?.locator?.text?.before ) ), role = TextElement.Role.Body, @@ -331,8 +341,8 @@ class HtmlResourceContentIterator( } } ), - text = Locator.Text( - highlight = rawTextAcc, + text = Locator.Text.trimmingText( + rawTextAcc, before = wholeRawTextAcc?.takeLast(beforeMaxLength) ) ), @@ -356,6 +366,13 @@ class HtmlResourceContentIterator( } } +private fun Locator.Text.Companion.trimmingText(text: String, before: String?): Locator.Text = + Locator.Text( + before = ((before ?: "") + text.takeWhile { it.isWhitespace() }).takeUnless { it.isBlank() }, + highlight = text.trim(), + after = text.takeLastWhile { it.isWhitespace() }.takeUnless { it.isBlank() } + ) + private val Node.language: String? get() = attr("xml:lang").takeUnless { it.isBlank() } ?: attr("lang").takeUnless { it.isBlank() } diff --git a/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt b/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt index da0aa11365..0912130f79 100644 --- a/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt +++ b/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt @@ -455,4 +455,88 @@ class HtmlResourceContentIteratorTest { iterator(html).elements() ) } + + @Test + fun `iterating over an element containing both a text node and child elements`() = runTest { + val html = """ + + + +
    +
  1. Let's start at the top—the source of ideas. +
  2. +
+ + + """ + + assertEquals( + listOf( + TextElement( + locator = locator( + selector = "#c06-li-0001", + highlight = "Let's start at the top—the source of ideas." + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "#c06-li-0001", + highlight = "Let's start at the top—the source of ideas." + ), + text = "Let's start at the top—the source of ideas.", + attributes = emptyList() + ), + ), + attributes = emptyList() + ), + TextElement( + locator = locator( + selector = "#c06-para-0019", + before = " top—the source of ideas.\n ", + highlight = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process." + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "#c06-para-0019", + before = " top—the source of ideas.\n ", + highlight = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process." + ), + text = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process.", + attributes = emptyList() + ) + ), + attributes = emptyList() + ), + TextElement( + locator = locator( + selector = "#c06-para-0019", + before = "e just described is very much a waterfall process.\n \n ", + highlight = "Trailing text" + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "#c06-para-0019", + before = "e just described is very much a waterfall process.\n ", + highlight = "Trailing text" + ), + text = "Trailing text", + attributes = emptyList() + ) + ), + attributes = emptyList() + ) + ), + iterator(html).elements() + ) + } } From 4469ae46edcac2eacd49e2d94cb1ed40b5a3b92e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mickae=CC=88l=20Menu?= Date: Fri, 10 Mar 2023 17:11:44 +0100 Subject: [PATCH 2/2] Fix locating the start element when the node is empty --- .../content/iterators/HtmlResourceContentIterator.kt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt index facea2e3a7..af8c534f30 100644 --- a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt +++ b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt @@ -283,12 +283,13 @@ class HtmlResourceContentIterator( private fun flushText() { flushSegment() - if (segmentsAcc.isEmpty()) return - if (startElement != null && breadcrumbs.lastOrNull() == startElement) { + if (startIndex == 0 && startElement != null && breadcrumbs.lastOrNull() == startElement) { startIndex = elements.size } + if (segmentsAcc.isEmpty()) return + // Trim the end of the last segment's text to get a cleaner output for the TextElement. // Only whitespaces between the segments are meaningful. segmentsAcc[segmentsAcc.size - 1] = segmentsAcc.last().run { copy(text = text.trimEnd()) }