Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,19 @@ SCRIPTS_PATH := readium/navigator/src/main/assets/_scripts

# Print the usage message listing the available targets (lint, format, scripts).
help:
@echo "Usage: make <target>\n\n\
lint\t\tLint the Kotlin sources with ktlint\n\
format\tFormat the Kotlin sources with ktlint\n\
scripts\tBundle the Navigator EPUB scripts\n\
"

# Lint the Kotlin sources by running the Gradle `ktlintCheck` task.
.PHONY: lint
lint:
./gradlew ktlintCheck

# Format the Kotlin sources by running the Gradle `ktlintFormat` task.
.PHONY: format
format:
./gradlew ktlintFormat

.PHONY: scripts
scripts:
yarn --cwd "$(SCRIPTS_PATH)" install --frozen-lockfile
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,19 @@ class HtmlResourceContentIterator(
/** Content elements collected so far; text elements are appended when the pending text is flushed. */
private val elements = mutableListOf<Content.Element>()
/**
 * Index into [elements], set to the current size of [elements] when text is flushed while the
 * last breadcrumb is the start element. Presumably the position iteration should start
 * from — TODO confirm against the code consuming it.
 */
private var startIndex = 0

/** Segments accumulated for the current element. */
private val segmentsAcc = mutableListOf<TextElement.Segment>()
/** Text since the beginning of the current segment, after coalescing whitespaces. */
private var textAcc = StringBuilder()
/** Text content since the beginning of the resource, including whitespaces. */
private var wholeRawTextAcc: String? = null
/** Text content since the beginning of the current element, including whitespaces. */
private var elementRawTextAcc: String = ""
/** Text content since the beginning of the current segment, including whitespaces. */
private var rawTextAcc: String = ""
/** Language of the current segment. */
private var currentLanguage: String? = null
/** CSS selector of the current element. */
private var currentCssSelector: String? = null

/** LIFO stack of the current element's block ancestors. */
Expand Down Expand Up @@ -240,17 +247,15 @@ class HtmlResourceContentIterator(
}

node.isBlock -> {
segmentsAcc.clear()
textAcc.clear()
rawTextAcc = ""
flushText()
currentCssSelector = node.cssSelector()
}
}
}
}

override fun tail(node: Node, depth: Int) {
if (node is TextNode) {
if (node is TextNode && node.wholeText.isNotBlank()) {
val language = node.language
if (currentLanguage != language) {
flushSegment()
Expand Down Expand Up @@ -278,11 +283,17 @@ class HtmlResourceContentIterator(

private fun flushText() {
flushSegment()
if (segmentsAcc.isEmpty()) return

if (startElement != null && breadcrumbs.lastOrNull() == startElement) {
if (startIndex == 0 && startElement != null && breadcrumbs.lastOrNull() == startElement) {
startIndex = elements.size
}

if (segmentsAcc.isEmpty()) return

// Trim the end of the last segment's text to get a cleaner output for the TextElement.
// Only whitespaces between the segments are meaningful.
segmentsAcc[segmentsAcc.size - 1] = segmentsAcc.last().run { copy(text = text.trimEnd()) }

elements.add(
Content.TextElement(
locator = baseLocator.copy(
Expand All @@ -293,9 +304,9 @@ class HtmlResourceContentIterator(
}
}
),
text = Locator.Text(
before = segmentsAcc.firstOrNull()?.locator?.text?.before,
highlight = elementRawTextAcc,
text = Locator.Text.trimmingText(
elementRawTextAcc,
before = segmentsAcc.firstOrNull()?.locator?.text?.before
)
),
role = TextElement.Role.Body,
Expand Down Expand Up @@ -331,8 +342,8 @@ class HtmlResourceContentIterator(
}
}
),
text = Locator.Text(
highlight = rawTextAcc,
text = Locator.Text.trimmingText(
rawTextAcc,
before = wholeRawTextAcc?.takeLast(beforeMaxLength)
)
),
Expand All @@ -356,6 +367,13 @@ class HtmlResourceContentIterator(
}
}

/**
 * Builds a [Locator.Text] whose `highlight` is [text] with its surrounding whitespace removed.
 *
 * The leading whitespace stripped from [text] is appended to [before] so that it is not lost
 * from the context preceding the highlight. A `before` that ends up empty or made only of
 * whitespace is reported as `null`.
 *
 * NOTE(review): `after` is computed from the trailing whitespace of [text], but a string made
 * only of whitespace is always blank, so as written `after` is always `null`. If the trailing
 * whitespace was meant to be kept, the check would need to be `isEmpty()` — confirm the intent
 * before changing it, since callers and tests may rely on the current result.
 *
 * @param text Raw text of the element or segment, including surrounding whitespace.
 * @param before Context preceding [text], or `null` when there is none.
 */
private fun Locator.Text.Companion.trimmingText(text: String, before: String?): Locator.Text {
    val leadingWhitespace = text.takeWhile { it.isWhitespace() }
    val trailingWhitespace = text.takeLastWhile { it.isWhitespace() }
    val prefix = before.orEmpty() + leadingWhitespace

    return Locator.Text(
        before = prefix.takeUnless { it.isBlank() },
        highlight = text.trim(),
        after = trailingWhitespace.takeUnless { it.isBlank() }
    )
}

private val Node.language: String? get() =
attr("xml:lang").takeUnless { it.isBlank() }
?: attr("lang").takeUnless { it.isBlank() }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -455,4 +455,88 @@ class HtmlResourceContentIteratorTest {
iterator(html).elements()
)
}

/**
 * Checks the elements produced for a list item that mixes its own text nodes with nested
 * block children.
 *
 * Three text elements are expected, in document order: the text leading the `<li>`, the
 * nested paragraph, and the text trailing the nested blocks. The `before` context of the
 * later elements carries the end of the preceding raw text, whitespace included.
 *
 * NOTE(review): the `<aside>` in the fixture is never closed; presumably intentional to
 * mimic real-world markup — confirm.
 */
@Test
fun `iterating over an element containing both a text node and child elements`() = runTest {
val html = """
<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<ol class="decimal" id="c06-list-0001">
<li id="c06-li-0001">Let&#39;s start at the top&#8212;the <i>source of ideas</i>.
<aside><div class="top hr"><hr/></div>
<section class="feature1">
<p id="c06-para-0019"><i>While almost everyone today claims to be Agile, what I&#39;ve just described is very much a <i>waterfall</i> process.</i></p>
</section>
Trailing text
</li>
</ol>
</body>
</html>
"""

assertEquals(
listOf(
// Text located directly in the <li>, before its nested block children.
TextElement(
locator = locator(
selector = "#c06-li-0001",
highlight = "Let's start at the top—the source of ideas."
),
role = TextElement.Role.Body,
segments = listOf(
Segment(
locator = locator(
selector = "#c06-li-0001",
highlight = "Let's start at the top—the source of ideas."
),
text = "Let's start at the top—the source of ideas.",
attributes = emptyList()
),
),
attributes = emptyList()
),
// The nested paragraph; its inline <i> children are merged into a single segment.
TextElement(
locator = locator(
selector = "#c06-para-0019",
before = " top—the source of ideas.\n ",
highlight = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process."
),
role = TextElement.Role.Body,
segments = listOf(
Segment(
locator = locator(
selector = "#c06-para-0019",
before = " top—the source of ideas.\n ",
highlight = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process."
),
text = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process.",
attributes = emptyList()
)
),
attributes = emptyList()
),
// Text trailing the nested blocks; it is expected with the paragraph's selector
// rather than the <li>'s.
TextElement(
locator = locator(
selector = "#c06-para-0019",
before = "e just described is very much a waterfall process.\n \n ",
highlight = "Trailing text"
),
role = TextElement.Role.Body,
segments = listOf(
Segment(
locator = locator(
selector = "#c06-para-0019",
before = "e just described is very much a waterfall process.\n ",
highlight = "Trailing text"
),
text = "Trailing text",
attributes = emptyList()
)
),
attributes = emptyList()
)
),
iterator(html).elements()
)
}
}