Skip to content

Commit

Permalink
Merge pull request #129 from victor-torres/remove-selector
Browse files Browse the repository at this point in the history
Remove Selectors or SelectorLists from their parent elements
  • Loading branch information
dangra authored Sep 11, 2019
2 parents c9901d2 + 121dd1f commit d9b3d0c
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Features
--------

* Extract text using CSS or XPath selectors
* Remove elements using CSS or XPath selectors
* Regular expression helper methods

Example::
Expand Down
35 changes: 35 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,41 @@ XPath specification.
.. _Location Paths: https://www.w3.org/TR/xpath#location-paths


Removing elements
-----------------

If for any reason you need to remove elements based on a Selector or
a SelectorList, you can do it with the ``remove()`` method, available for both
classes.

.. warning:: This is a destructive action and cannot be undone. The original
content of the selector is removed from the element tree. This could be useful
when trying to reduce the memory footprint of Responses.

Example removing an ad from a blog post:

>>> from parsel import Selector
>>> doc = u"""
... <article>
... <div class="row">Content paragraph...</div>
... <div class="row">
... <div class="ad">
... Ad content...
... <a href="http://...">Link</a>
... </div>
... </div>
... <div class="row">More content...</div>
... </article>
... """
>>> sel = Selector(text=doc)
>>> sel.xpath('//div//text()').getall()
['Content paragraph...', 'Ad content...', 'Link', 'More content...']
>>> sel.xpath('//div[@class="ad"]').remove()
>>> sel.xpath('//div//text()').getall()
['Content paragraph...', 'More content...']
>>>


Using EXSLT extensions
----------------------

Expand Down
39 changes: 39 additions & 0 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
from .csstranslator import HTMLTranslator, GenericTranslator


class CannotRemoveElementWithoutRoot(Exception):
    """Raised by ``remove()`` when the selected value is not a tree node.

    This is the case for plain text results, such as those selected by
    ``li::text`` or ``//li/text()``.
    """


class CannotRemoveElementWithoutParent(Exception):
    """Raised by ``remove()`` when the selected node has no parent.

    This is the case when trying to remove the root element of a document.
    """


class SafeXMLParser(etree.XMLParser):
def __init__(self, *args, **kwargs):
kwargs.setdefault('resolve_entities', False)
Expand Down Expand Up @@ -150,6 +158,13 @@ def attrib(self):
else:
return {}

def remove(self):
    """
    Call ``remove()`` on every selector in this list, in order, so that
    each matched node is detached from its parent.
    """
    for selector in self:
        selector.remove()


class Selector(object):
"""
Expand Down Expand Up @@ -342,6 +357,30 @@ def remove_namespaces(self):
# remove namespace declarations
etree.cleanup_namespaces(self.root)

def remove(self):
    """
    Remove the matched node from its parent element.

    Text that follows the node (its *tail*) is not part of the node
    itself, so it is kept in the document: it is appended to the tail of
    the previous sibling or, when the node is the first child, to the
    text of the parent.

    Raises ``CannotRemoveElementWithoutRoot`` when the selector wraps a
    value that has no ``getparent()`` (e.g. a text result), and
    ``CannotRemoveElementWithoutParent`` when the node has no parent
    (e.g. the root element).
    """
    node = self.root
    try:
        parent = node.getparent()
    except AttributeError:
        # 'str' object has no attribute 'getparent'
        raise CannotRemoveElementWithoutRoot(
            "The node you're trying to remove has no root, "
            "are you trying to remove a pseudo-element? "
            "Try to use 'li' as a selector instead of 'li::text' or "
            "'//li' instead of '//li/text()', for example."
        )

    # Check for a missing parent explicitly instead of catching the
    # AttributeError of ``None.remove``, so unrelated attribute errors
    # are not reported as a missing parent.
    if parent is None:
        raise CannotRemoveElementWithoutParent(
            "The node you're trying to remove has no parent, "
            "are you trying to remove a root element?"
        )

    # lxml keeps the text following an element on the element itself
    # (``tail``), and ``parent.remove()`` drops it together with the
    # element. Hand it over to the preceding node first so that only the
    # matched node disappears from the document.
    tail = node.tail
    if tail:
        previous = node.getprevious()
        if previous is None:
            parent.text = (parent.text or '') + tail
        else:
            previous.tail = (previous.tail or '') + tail

    parent.remove(node)

@property
def attrib(self):
"""Return the attributes dictionary for underlying element.
Expand Down
55 changes: 55 additions & 0 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import pickle

from parsel import Selector
from parsel.selector import (
CannotRemoveElementWithoutRoot,
CannotRemoveElementWithoutParent,
)


class SelectorTestCase(unittest.TestCase):
Expand Down Expand Up @@ -745,6 +749,57 @@ def test_replacement_null_char_from_body(self):
self.assertEqual(u'<html><body><p>Grainy</p></body></html>',
self.sscls(text).extract())

def test_remove_selector_list(self):
    """Removing a whole SelectorList leaves no matching elements behind."""
    markup = u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>'
    document = self.sscls(text=markup)
    document.css('li').remove()
    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    self.assertEqual(document.css('li'), [])

def test_remove_selector(self):
    """Removing a single Selector only drops that one element."""
    markup = u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>'
    document = self.sscls(text=markup)
    first_item = document.css('li')[0]
    first_item.remove()
    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    remaining_texts = document.css('li::text').getall()
    self.assertEqual(remaining_texts, ['2', '3'])

def test_remove_pseudo_element_selector_list(self):
    """A list of text results cannot be removed; the tree stays intact."""
    markup = u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>'
    document = self.sscls(text=markup)
    text_nodes = document.css('li::text')
    self.assertEqual(text_nodes.getall(), ['1', '2', '3'])

    # Text results have no root element to detach.
    self.assertRaises(CannotRemoveElementWithoutRoot, text_nodes.remove)

    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    self.assertEqual(document.css('li::text').getall(), ['1', '2', '3'])

def test_remove_pseudo_element_selector(self):
    """A single text result cannot be removed; the tree stays intact."""
    markup = u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>'
    document = self.sscls(text=markup)
    text_nodes = document.css('li::text')
    self.assertEqual(text_nodes.getall(), ['1', '2', '3'])

    # Text results have no root element to detach.
    first_text = text_nodes[0]
    self.assertRaises(CannotRemoveElementWithoutRoot, first_text.remove)

    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    self.assertEqual(document.css('li::text').getall(), ['1', '2', '3'])

def test_remove_root_element_selector(self):
    """The root element cannot be removed, but its children can."""
    markup = u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>'
    document = self.sscls(text=markup)
    text_nodes = document.css('li::text')
    self.assertEqual(text_nodes.getall(), ['1', '2', '3'])

    # Neither the document selector nor <html> has a parent to detach from.
    self.assertRaises(CannotRemoveElementWithoutParent, document.remove)
    self.assertRaises(
        CannotRemoveElementWithoutParent, document.css('html').remove
    )

    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    self.assertEqual(document.css('li::text').getall(), ['1', '2', '3'])

    document.css('body').remove()
    self.assertEqual(document.get(), '<html></html>')


class ExsltTestCase(unittest.TestCase):

sscls = Selector
Expand Down

0 comments on commit d9b3d0c

Please sign in to comment.