From 807c201de018aba48a625618c59048905e776684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mickae=CC=88l=20Menu?= Date: Thu, 16 Feb 2023 14:21:39 +0100 Subject: [PATCH 1/7] Add tests for `HtmlResourceContentIterator` and change the indexing logic --- .../iterators/HtmlResourceContentIterator.kt | 51 ++-- .../HtmlResourceContentIteratorTest.kt | 241 ++++++++++++++++++ 2 files changed, 268 insertions(+), 24 deletions(-) create mode 100644 readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt diff --git a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt index e581b772f1..97c65d5660 100644 --- a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt +++ b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt @@ -27,7 +27,7 @@ import org.readium.r2.shared.util.Language import org.readium.r2.shared.util.mediatype.MediaType import org.readium.r2.shared.util.use -// FIXME: Support custom skipped elements +// FIXME: Support custom skipped elements? /** * Iterates an HTML [resource], starting from the given [locator]. @@ -57,47 +57,50 @@ class HtmlResourceContentIterator( /** * [Content.Element] loaded with [hasPrevious] or [hasNext], associated with the move delta. */ - private data class ContentWithDelta( + private data class ElementWithDelta( val element: Content.Element, val delta: Int ) - private var currentElement: ContentWithDelta? = null + private var requestedElement: ElementWithDelta? = null override suspend fun hasPrevious(): Boolean { - currentElement = nextBy(-1) - return currentElement != null + if (requestedElement?.delta == -1) return true + + val index = currentIndex() - 1 + val element = elements().elements.getOrNull(index) ?: return false + currentIndex = index + requestedElement = ElementWithDelta(element, -1) + return true } override fun previous(): Content.Element = - currentElement + requestedElement ?.takeIf { it.delta == -1 }?.element + ?.also { requestedElement = null } ?: throw IllegalStateException("Called previous() without a successful call to hasPrevious() first") override suspend fun hasNext(): Boolean { - currentElement = nextBy(+1) - return currentElement != null + if (requestedElement?.delta == 1) return true + + val index = currentIndex() + val element = elements().elements.getOrNull(index) ?: return false + currentIndex = index + 1 + requestedElement = ElementWithDelta(element, +1) + return true } override fun next(): Content.Element = - currentElement - ?.takeIf { it.delta == +1 }?.element + requestedElement + ?.takeIf { it.delta == 1 }?.element + ?.also { requestedElement = null } ?: throw IllegalStateException("Called next() without a successful call to hasNext() first") - private suspend fun nextBy(delta: Int): ContentWithDelta? { - val elements = elements() - val index = currentIndex?.let { it + delta } - ?: elements.startIndex - - val content = elements.elements.getOrNull(index) - ?: return null - - currentIndex = index - return ContentWithDelta(content, delta) - } - private var currentIndex: Int? = null + private suspend fun currentIndex(): Int = + currentIndex ?: elements().startIndex + private suspend fun elements(): ParsedElements = parsedElements ?: parseElements().also { parsedElements = it } @@ -123,7 +126,7 @@ class HtmlResourceContentIterator( } /** - * Holds the result of parsing the HTML resource into a list of [ContentElement]. + * Holds the result of parsing the HTML resource into a list of `ContentElement`. * * The [startIndex] will be calculated from the element matched by the base [locator], if * possible. Defaults to 0. @@ -183,7 +186,7 @@ class HtmlResourceContentIterator( if (href != null) { elements.add( - Content.ImageElement( + ImageElement( locator = baseLocator.copy( locations = Locator.Locations( otherLocations = buildMap { diff --git a/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt b/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt new file mode 100644 index 0000000000..7a7c4bd1e7 --- /dev/null +++ b/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt @@ -0,0 +1,241 @@ +@file:OptIn(ExperimentalReadiumApi::class, ExperimentalCoroutinesApi::class) + +package org.readium.r2.shared.publication.services.content.iterators + +import kotlin.test.* +import kotlinx.coroutines.ExperimentalCoroutinesApi +import kotlinx.coroutines.test.runTest +import org.junit.Test +import org.junit.runner.RunWith +import org.readium.r2.shared.ExperimentalReadiumApi +import org.readium.r2.shared.fetcher.StringResource +import org.readium.r2.shared.publication.Link +import org.readium.r2.shared.publication.Locator +import org.readium.r2.shared.publication.services.content.Content +import org.readium.r2.shared.publication.services.content.Content.Attribute +import org.readium.r2.shared.publication.services.content.Content.AttributeKey.Companion.LANGUAGE +import org.readium.r2.shared.publication.services.content.Content.TextElement +import org.readium.r2.shared.publication.services.content.Content.TextElement.Segment +import org.readium.r2.shared.util.Language +import org.robolectric.RobolectricTestRunner + +@RunWith(RobolectricTestRunner::class) +class HtmlResourceContentIteratorTest { + + private val link = Link(href = "res.xhtml", type = "application/xhtml+xml") + private val locator = Locator(href = "res.xhtml", type = "application/xhtml+xml") + + private val html = """ + + + + + Section IV: FAIRY STORIES—MODERN FANTASTIC TALES + + + +
+
171
+

INTRODUCTORY

+ +

The difficulties of classification are very apparent here, and once more it must be noted that illustrative and practical purposes rather than logical ones are served by the arrangement adopted. The modern fanciful story is here placed next to the real folk story instead of after all the groups of folk products. The Hebrew stories at the beginning belong quite as well, perhaps even better, in Section V, while the stories at the end of Section VI shade off into the more modern types of short tales.

+

The child's natural literature. The world has lost certain secrets as the price of an advancing civilization.

+

Without discussing the limits of the culture-epoch theory of human development as a complete guide in education, it is clear that the young child passes through a period when his mind looks out upon the world in a manner analogous to that of the folk as expressed in their literature.

+
+ + + """ + + private val elements: List = listOf( + TextElement( + locator = locator( + selector = "#pgepubid00498 > div.center", + highlight = "171" + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "#pgepubid00498 > div.center", + before = "", + highlight = "171" + ), + text = "171", + attributes = listOf(Attribute(LANGUAGE, Language("en"))) + ) + ) + ), + TextElement( + locator = locator( + selector = "#pgepubid00498 > h3", + highlight = "INTRODUCTORY" + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "#pgepubid00498 > h3", + before = "171", + highlight = "INTRODUCTORY" + ), + text = "INTRODUCTORY", + attributes = listOf(Attribute(LANGUAGE, Language("en"))) + ), + ) + ), + TextElement( + locator = locator( + selector = "#pgepubid00498 > p:nth-child(3)", + highlight = "The difficulties of classification are very apparent here, and once more it must be noted that illustrative and practical purposes rather than logical ones are served by the arrangement adopted. The modern fanciful story is here placed next to the real folk story instead of after all the groups of folk products. The Hebrew stories at the beginning belong quite as well, perhaps even better, in Section V, while the stories at the end of Section VI shade off into the more modern types of short tales." + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "#pgepubid00498 > p:nth-child(3)", + before = "171INTRODUCTORY", + highlight = "The difficulties of classification are very apparent here, and once more it must be noted that illustrative and practical purposes rather than logical ones are served by the arrangement adopted. The modern fanciful story is here placed next to the real folk story instead of after all the groups of folk products. The Hebrew stories at the beginning belong quite as well, perhaps even better, in Section V, while the stories at the end of Section VI shade off into the more modern types of short tales." + ), + text = "The difficulties of classification are very apparent here, and once more it must be noted that illustrative and practical purposes rather than logical ones are served by the arrangement adopted. The modern fanciful story is here placed next to the real folk story instead of after all the groups of folk products. The Hebrew stories at the beginning belong quite as well, perhaps even better, in Section V, while the stories at the end of Section VI shade off into the more modern types of short tales.", + attributes = listOf(Attribute(LANGUAGE, Language("en"))), + ), + ) + ), + TextElement( + locator = locator( + selector = "#pgepubid00498 > p:nth-child(4)", + highlight = "The child's natural literature. The world has lost certain secrets as the price of an advancing civilization." + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "#pgepubid00498 > p:nth-child(4)", + before = "ade off into the more modern types of short tales.", + highlight = "The child's natural literature. The world has lost certain secrets as the price of an advancing civilization." + ), + text = "The child's natural literature. The world has lost certain secrets as the price of an advancing civilization.", + attributes = listOf(Attribute(LANGUAGE, Language("en"))) + ), + ) + ), + TextElement( + locator = locator( + selector = "#pgepubid00498 > p:nth-child(5)", + highlight = "Without discussing the limits of the culture-epoch theory of human development as a complete guide in education, it is clear that the young child passes through a period when his mind looks out upon the world in a manner analogous to that of the folk as expressed in their literature." + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "#pgepubid00498 > p:nth-child(5)", + before = "secrets as the price of an advancing civilization.", + highlight = "Without discussing the limits of the culture-epoch theory of human development as a complete guide in education, it is clear that the young child passes through a period when his mind looks out upon the world in a manner analogous to that of the folk as expressed in their literature." + ), + text = "Without discussing the limits of the culture-epoch theory of human development as a complete guide in education, it is clear that the young child passes through a period when his mind looks out upon the world in a manner analogous to that of the folk as expressed in their literature.", + attributes = listOf(Attribute(LANGUAGE, Language("en"))) + ), + ) + ) + ) + + private fun locator( + selector: String, + before: String? = null, + highlight: String? = null, + after: String? = null + ): Locator = + locator.copy( + locations = Locator.Locations(otherLocations = mapOf("cssSelector" to selector)), + text = Locator.Text(before = before, highlight = highlight, after = after) + ) + + private fun iterator(startLocator: Locator = locator): HtmlResourceContentIterator = + HtmlResourceContentIterator(StringResource(link, html), startLocator) + + @Test + fun `cannot call previous() without first hasPrevious()`() = runTest { + val iter = iterator() + iter.hasNext(); iter.next() + + assertFailsWith(IllegalStateException::class) { iter.previous() } + iter.hasPrevious() + iter.previous() + } + + @Test + fun `cannot call next() without first hasNext()`() = runTest { + val iter = iterator() + assertFailsWith(IllegalStateException::class) { iter.next() } + iter.hasNext() + iter.next() + } + + @Test + fun `iterate from start to finish`() = runTest { + val iter = iterator() + val res = mutableListOf() + while (iter.hasNext()) { + res.add(iter.next()) + } + assertContentEquals(elements, res) + } + + @Test + fun `previous() is null from the beginning`() = runTest { + val iter = iterator() + assertFalse(iter.hasPrevious()) + } + + @Test + fun `next() returns the first element from the beginning`() = runTest { + val iter = iterator() + assertTrue(iter.hasNext()) + assertEquals(elements[0], iter.next()) + } + + @Test + fun `next() then previous() returns the first element`() = runTest { + val iter = iterator() + assertTrue(iter.hasNext()) + assertEquals(elements[0], iter.next()) + assertTrue(iter.hasPrevious()) + assertEquals(elements[0], iter.previous()) + } + + @Test + fun `calling hasPrevious() several times doesn't move the index`() = runTest { + val iter = iterator() + iter.hasNext(); iter.next() + assertTrue(iter.hasPrevious()) + assertTrue(iter.hasPrevious()) + assertTrue(iter.hasPrevious()) + assertEquals(elements[0], iter.previous()) + } + + @Test + fun `calling hasNext() several times doesn't move the index`() = runTest { + val iter = iterator() + assertTrue(iter.hasNext()) + assertTrue(iter.hasNext()) + assertTrue(iter.hasNext()) + assertEquals(elements[0], iter.next()) + } + + @Test + fun `starting from a CSS selector`() = runTest { + val iter = iterator(locator(selector = "#pgepubid00498 > p:nth-child(3)")) + val res = mutableListOf() + while (iter.hasNext()) { + res.add(iter.next()) + } + assertContentEquals(elements.subList(2, elements.size), res) + } + + @Test + fun `calling previous() when starting from a CSS selector`() = runTest { + val iter = iterator(locator(selector = "#pgepubid00498 > p:nth-child(3)")) + assertTrue(iter.hasPrevious()) + assertEquals(elements[1], iter.previous()) + } +} From 6c0bd3a31f3ffaf0dc0a272c320bdf84f0cf3a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mickae=CC=88l=20Menu?= Date: Thu, 16 Feb 2023 19:04:05 +0100 Subject: [PATCH 2/7] Fix issue when a block element contains inline elements --- readium/navigator/Makefile => Makefile | 15 ++--- .../iterators/HtmlResourceContentIterator.kt | 11 +++- .../HtmlResourceContentIteratorTest.kt | 65 +++++++++++++++---- 3 files changed, 65 insertions(+), 26 deletions(-) rename readium/navigator/Makefile => Makefile (52%) diff --git a/readium/navigator/Makefile b/Makefile similarity index 52% rename from readium/navigator/Makefile rename to Makefile index bf591dc1ae..91ab04c167 100644 --- a/readium/navigator/Makefile +++ b/Makefile @@ -1,18 +1,13 @@ -SCRIPTS_PATH := src/main/assets/_scripts +SCRIPTS_PATH := readium/navigator/src/main/assets/_scripts help: @echo "Usage: make \n\n\ - install\tDownload NPM dependencies\n\ - scripts\tBundle EPUB scripts with Webpack\n\ - lint-scripts\tCheck quality of EPUB scripts\n\ + scripts\tBundle the Navigator EPUB scripts\n\ " -install: - yarn --cwd "$(SCRIPTS_PATH)" install --frozen-lockfile - +.PHONY: scripts scripts: + yarn --cwd "$(SCRIPTS_PATH)" install --frozen-lockfile yarn --cwd "$(SCRIPTS_PATH)" run format - yarn --cwd "$(SCRIPTS_PATH)" run bundle - -lint-scripts: yarn --cwd "$(SCRIPTS_PATH)" run lint + yarn --cwd "$(SCRIPTS_PATH)" run bundle diff --git a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt index 97c65d5660..337cbef8f1 100644 --- a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt +++ b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt @@ -149,7 +149,6 @@ class HtmlResourceContentIterator( private val elements = mutableListOf() private var startIndex = 0 - private var currentElement: Element? = null private val segmentsAcc = mutableListOf() private var textAcc = StringBuilder() @@ -160,6 +159,9 @@ class HtmlResourceContentIterator( private var currentCssSelector: String? = null private var ignoredNode: Node? = null + /** LIFO stack of the current element's block ancestors. */ + private val breadcrumbs = mutableListOf() + override fun head(node: Node, depth: Int) { if (ignoredNode != null) return @@ -169,7 +171,6 @@ class HtmlResourceContentIterator( } if (node is Element) { - currentElement = node val tag = node.normalName() @@ -207,6 +208,8 @@ class HtmlResourceContentIterator( } } node.isBlock -> { + breadcrumbs.add(node) + segmentsAcc.clear() textAcc.clear() rawTextAcc = "" @@ -232,7 +235,9 @@ class HtmlResourceContentIterator( appendNormalisedText(node) } else if (node is Element) { if (node.isBlock) { + assert(breadcrumbs.last() == node) flushText() + breadcrumbs.removeLast() } } } @@ -249,7 +254,7 @@ class HtmlResourceContentIterator( flushSegment() if (segmentsAcc.isEmpty()) return - if (startElement != null && currentElement == startElement) { + if (startElement != null && breadcrumbs.lastOrNull() == startElement) { startIndex = elements.size } elements.add( diff --git a/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt b/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt index 7a7c4bd1e7..aff43ebe9d 100644 --- a/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt +++ b/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt @@ -140,22 +140,24 @@ class HtmlResourceContentIteratorTest { ) private fun locator( - selector: String, + selector: String? = null, before: String? = null, highlight: String? = null, after: String? = null ): Locator = locator.copy( - locations = Locator.Locations(otherLocations = mapOf("cssSelector" to selector)), + locations = Locator.Locations(otherLocations = buildMap { + selector?.let { put("cssSelector", it) } + }), text = Locator.Text(before = before, highlight = highlight, after = after) ) - private fun iterator(startLocator: Locator = locator): HtmlResourceContentIterator = + private fun iterator(html: String, startLocator: Locator = locator): HtmlResourceContentIterator = HtmlResourceContentIterator(StringResource(link, html), startLocator) @Test fun `cannot call previous() without first hasPrevious()`() = runTest { - val iter = iterator() + val iter = iterator(html) iter.hasNext(); iter.next() assertFailsWith(IllegalStateException::class) { iter.previous() } @@ -165,7 +167,7 @@ class HtmlResourceContentIteratorTest { @Test fun `cannot call next() without first hasNext()`() = runTest { - val iter = iterator() + val iter = iterator(html) assertFailsWith(IllegalStateException::class) { iter.next() } iter.hasNext() iter.next() @@ -173,7 +175,7 @@ class HtmlResourceContentIteratorTest { @Test fun `iterate from start to finish`() = runTest { - val iter = iterator() + val iter = iterator(html) val res = mutableListOf() while (iter.hasNext()) { res.add(iter.next()) @@ -183,20 +185,20 @@ class HtmlResourceContentIteratorTest { @Test fun `previous() is null from the beginning`() = runTest { - val iter = iterator() + val iter = iterator(html) assertFalse(iter.hasPrevious()) } @Test fun `next() returns the first element from the beginning`() = runTest { - val iter = iterator() + val iter = iterator(html) assertTrue(iter.hasNext()) assertEquals(elements[0], iter.next()) } @Test fun `next() then previous() returns the first element`() = runTest { - val iter = iterator() + val iter = iterator(html) assertTrue(iter.hasNext()) assertEquals(elements[0], iter.next()) assertTrue(iter.hasPrevious()) @@ -205,7 +207,7 @@ class HtmlResourceContentIteratorTest { @Test fun `calling hasPrevious() several times doesn't move the index`() = runTest { - val iter = iterator() + val iter = iterator(html) iter.hasNext(); iter.next() assertTrue(iter.hasPrevious()) assertTrue(iter.hasPrevious()) @@ -215,7 +217,7 @@ class HtmlResourceContentIteratorTest { @Test fun `calling hasNext() several times doesn't move the index`() = runTest { - val iter = iterator() + val iter = iterator(html) assertTrue(iter.hasNext()) assertTrue(iter.hasNext()) assertTrue(iter.hasNext()) @@ -224,7 +226,7 @@ class HtmlResourceContentIteratorTest { @Test fun `starting from a CSS selector`() = runTest { - val iter = iterator(locator(selector = "#pgepubid00498 > p:nth-child(3)")) + val iter = iterator(html, locator(selector = "#pgepubid00498 > p:nth-child(3)")) val res = mutableListOf() while (iter.hasNext()) { res.add(iter.next()) @@ -234,8 +236,45 @@ class HtmlResourceContentIteratorTest { @Test fun `calling previous() when starting from a CSS selector`() = runTest { - val iter = iterator(locator(selector = "#pgepubid00498 > p:nth-child(3)")) + val iter = iterator(html, locator(selector = "#pgepubid00498 > p:nth-child(3)")) assertTrue(iter.hasPrevious()) assertEquals(elements[1], iter.previous()) } + + @Test + fun `starting from a CSS selector to a block element containing an inline element`() = runTest { + val nbspHtml = """ + + + +

Tout au loin sur la chaussée, aussi loin qu’on pouvait voir

+

Lui, notre colonel, savait peut-être pourquoi ces deux gens-là tiraient [...] On buvait de la bière sucrée.

+ + + """ + + val iter = iterator(nbspHtml, locator(selector = ":root > :nth-child(2) > :nth-child(2)")) + assertTrue(iter.hasNext()) + assertEquals( + TextElement( + locator = locator( + selector = "html > body > p:nth-child(2)", + highlight = "Lui, notre colonel, savait peut-être pourquoi ces deux gens-là tiraient [...] On buvait de la bière sucrée." + ), + role = TextElement.Role.Body, + segments = listOf( + Segment( + locator = locator( + selector = "html > body > p:nth-child(2)", + before = "oin sur la chaussée, aussi loin qu’on pouvait voir", + highlight = "Lui, notre colonel, savait peut-être pourquoi ces deux gens-là tiraient [...] On buvait de la bière sucrée.", + ), + text = "Lui, notre colonel, savait peut-être pourquoi ces deux gens-là tiraient [...] On buvait de la bière sucrée.", + attributes = listOf(Attribute(LANGUAGE, Language("fr"))) + ) + ) + ), + iter.next() + ) + } } From 77672409d987f3f808e79976ff75a8bccb6733e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mickae=CC=88l=20Menu?= Date: Thu, 23 Feb 2023 13:57:50 +0100 Subject: [PATCH 3/7] Parse audio and video elements --- .../publication/services/content/Content.kt | 1 - .../iterators/HtmlResourceContentIterator.kt | 63 ++++++-- .../HtmlResourceContentIteratorTest.kt | 151 ++++++++++++++++-- 3 files changed, 184 insertions(+), 31 deletions(-) diff --git a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/Content.kt b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/Content.kt index 7f2ac53895..baafea867d 100644 --- a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/Content.kt +++ b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/Content.kt @@ -242,7 +242,6 @@ interface Content { /** * Extracts the full raw text, or returns null if no text content can be found. - * * @param separator Separator to use between individual elements. Defaults to newline. */ suspend fun text(separator: String = "\n"): String? = diff --git a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt index 337cbef8f1..a84557d391 100644 --- a/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt +++ b/readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt @@ -171,30 +171,34 @@ class HtmlResourceContentIterator( } if (node is Element) { + if (node.isBlock) { + breadcrumbs.add(node) + } val tag = node.normalName() + val elementLocator: Locator by lazy { + baseLocator.copy( + locations = Locator.Locations( + otherLocations = buildMap { + put("cssSelector", node.cssSelector() as Any) + } + ) + ) + } + when { tag == "br" -> { flushText() } + tag == "img" -> { flushText() - val href = node.attr("src") - .takeIf { it.isNotBlank() } - ?.let { Href(it, baseLocator.href).string } - - if (href != null) { + node.srcRelativeToHref(baseLocator.href)?.let { href -> elements.add( ImageElement( - locator = baseLocator.copy( - locations = Locator.Locations( - otherLocations = buildMap { - put("cssSelector", node.cssSelector() as Any) - } - ) - ), + locator = elementLocator, embeddedLink = Link(href = href), caption = null, // FIXME: Get the caption from figcaption attributes = buildList { @@ -207,9 +211,35 @@ class HtmlResourceContentIterator( ) } } - node.isBlock -> { - breadcrumbs.add(node) + tag == "audio" || tag == "video" -> { + flushText() + + val href = node.srcRelativeToHref(baseLocator.href) + val link: Link? = + if (href != null) { + Link(href = href) + } else { + val sources = node.select("source") + .mapNotNull { source -> + source.srcRelativeToHref(baseLocator.href)?.let { href -> + Link(href = href, type = source.attr("type").takeUnless { it.isBlank() }) + } + } + + sources.firstOrNull()?.copy(alternates = sources.drop(1)) + } + + if (link != null) { + when (tag) { + "audio" -> elements.add(AudioElement(locator = elementLocator, embeddedLink = link, attributes = emptyList())) + "video" -> elements.add(VideoElement(locator = elementLocator, embeddedLink = link, attributes = emptyList())) + else -> {} + } + } + } + + node.isBlock -> { segmentsAcc.clear() textAcc.clear() rawTextAcc = "" @@ -332,3 +362,8 @@ private val Node.language: String? get() = attr("xml:lang").takeUnless { it.isBlank() } ?: attr("lang").takeUnless { it.isBlank() } ?: parent()?.language + +private fun Node.srcRelativeToHref(baseHref: String): String? = + attr("src") + .takeIf { it.isNotBlank() } + ?.let { Href(it, baseHref).string } \ No newline at end of file diff --git a/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt b/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt index aff43ebe9d..e1cac33835 100644 --- a/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt +++ b/readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt @@ -2,9 +2,9 @@ package org.readium.r2.shared.publication.services.content.iterators -import kotlin.test.* import kotlinx.coroutines.ExperimentalCoroutinesApi import kotlinx.coroutines.test.runTest +import org.junit.Assert.* import org.junit.Test import org.junit.runner.RunWith import org.readium.r2.shared.ExperimentalReadiumApi @@ -13,17 +13,19 @@ import org.readium.r2.shared.publication.Link import org.readium.r2.shared.publication.Locator import org.readium.r2.shared.publication.services.content.Content import org.readium.r2.shared.publication.services.content.Content.Attribute +import org.readium.r2.shared.publication.services.content.Content.AttributeKey.Companion.ACCESSIBILITY_LABEL import org.readium.r2.shared.publication.services.content.Content.AttributeKey.Companion.LANGUAGE import org.readium.r2.shared.publication.services.content.Content.TextElement import org.readium.r2.shared.publication.services.content.Content.TextElement.Segment import org.readium.r2.shared.util.Language import org.robolectric.RobolectricTestRunner +@OptIn(ExperimentalReadiumApi::class) @RunWith(RobolectricTestRunner::class) class HtmlResourceContentIteratorTest { - private val link = Link(href = "res.xhtml", type = "application/xhtml+xml") - private val locator = Locator(href = "res.xhtml", type = "application/xhtml+xml") + private val link = Link(href = "/dir/res.xhtml", type = "application/xhtml+xml") + private val locator = Locator(href = "/dir/res.xhtml", type = "application/xhtml+xml") private val html = """ @@ -155,12 +157,19 @@ class HtmlResourceContentIteratorTest { private fun iterator(html: String, startLocator: Locator = locator): HtmlResourceContentIterator = HtmlResourceContentIterator(StringResource(link, html), startLocator) + private suspend fun HtmlResourceContentIterator.elements(): List = + buildList { + while (hasNext()) { + add(next()) + } + } + @Test fun `cannot call previous() without first hasPrevious()`() = runTest { val iter = iterator(html) iter.hasNext(); iter.next() - assertFailsWith(IllegalStateException::class) { iter.previous() } + assertThrows(IllegalStateException::class.java) { iter.previous() } iter.hasPrevious() iter.previous() } @@ -168,19 +177,14 @@ class HtmlResourceContentIteratorTest { @Test fun `cannot call next() without first hasNext()`() = runTest { val iter = iterator(html) - assertFailsWith(IllegalStateException::class) { iter.next() } + assertThrows(IllegalStateException::class.java) { iter.next() } iter.hasNext() iter.next() } @Test fun `iterate from start to finish`() = runTest { - val iter = iterator(html) - val res = mutableListOf() - while (iter.hasNext()) { - res.add(iter.next()) - } - assertContentEquals(elements, res) + assertEquals(elements, iterator(html).elements()) } @Test @@ -227,11 +231,7 @@ class HtmlResourceContentIteratorTest { @Test fun `starting from a CSS selector`() = runTest { val iter = iterator(html, locator(selector = "#pgepubid00498 > p:nth-child(3)")) - val res = mutableListOf() - while (iter.hasNext()) { - res.add(iter.next()) - } - assertContentEquals(elements.subList(2, elements.size), res) + assertEquals(elements.subList(2, elements.size), iter.elements()) } @Test @@ -277,4 +277,123 @@ class HtmlResourceContentIteratorTest { iter.next() ) } + + @Test + fun `iterating over image elements`() = runTest { + val html = """ + + + + + Accessibility description + + + """ + + assertEquals( + listOf( + Content.ImageElement( + locator = locator( + selector = "html > body > img:nth-child(1)" + ), + embeddedLink = Link(href = "/dir/image.png"), + caption = null, + attributes = emptyList() + ), + Content.ImageElement( + locator = locator( + selector = "html > body > img:nth-child(2)" + ), + embeddedLink = Link(href = "/cover.jpg"), + caption = null, + attributes = listOf(Attribute(ACCESSIBILITY_LABEL, "Accessibility description")) + ) + ), + iterator(html).elements() + ) + } + + @Test + fun `iterating over audio elements`() = runTest { + val html = """ + + + +