Skip to content

Commit

Permalink
Improve text extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
rushter committed Sep 7, 2022
1 parent cced011 commit abc9a3c
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
13 changes: 9 additions & 4 deletions selectolax/lexbor/node.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -772,20 +772,25 @@ cdef class LexborNode:
return container.text
@cython.final
cdef class TextContainer:
    """Accumulate node text pieces into a single string.

    Every appended piece is followed by ``separator``. The ``text`` property
    exposes the accumulated string without the final trailing separator.
    """
    # Raw accumulated text; always ends with ``separator`` after an append.
    cdef str _text
    cdef public str separator
    cdef public bool strip

    def __init__(self, str separator = '', bool strip = False):
        self._text = ""
        self.separator = separator
        self.strip = strip

    def append(self, node_text):
        """Append ``node_text`` (stripped when ``strip`` is set) plus the separator."""
        if self.strip:
            node_text = node_text.strip()
        self._text += node_text + self.separator

    @property
    def text(self):
        """Return the accumulated text without the last trailing separator.

        The internal buffer is left untouched, so reading this property is
        idempotent and pieces appended afterwards are still separated from
        the earlier ones.
        """
        if self.separator and self._text.endswith(self.separator):
            # Drop only the separator added by the most recent append.
            return self._text[:-len(self.separator)]
        return self._text


cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
Expand Down
4 changes: 3 additions & 1 deletion selectolax/modest/node.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,9 @@ cdef class Node:
text = append_text(text, node_text, separator, strip)
node = node.next
else:
return self._text_deep(self.node, separator=separator, strip=strip)
text = self._text_deep(self.node, separator=separator, strip=strip)
if separator and text and text.endswith(separator):
text = text[:-len(separator)]
return text

cdef inline _text_deep(self, myhtml_tree_node_t *node, separator='', strip=False):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,4 +542,4 @@ def test_merge_text_nodes(parser):
node.merge_text_nodes()
assert node.html == "<div><p>John</p><p>Doe</p></div>"
text = tree.text(deep=True, separator=" ", strip=True)
assert text == "John Doe "
assert text == "John Doe"

0 comments on commit abc9a3c

Please sign in to comment.