Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Using regular expressions for comment stripping, unless the feed has an error count, in which case switch to lxml.
  • Loading branch information...
commit 47c2257ef77dc1c186deb38a15deafa5b5f42b4f 1 parent 33e6058
@samuelclay authored
Showing with 24 additions and 6 deletions.
  1. +7 −3 apps/rss_feeds/models.py
  2. +17 −3 utils/story_functions.py
View
10 apps/rss_feeds/models.py
@@ -36,7 +36,7 @@
from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
-from utils.story_functions import strip_tags, htmldiff, strip_comments__lxml
+from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
from vendor.redis_completion.engine import RedisEngine
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@@ -840,7 +840,8 @@ def get_by_name(cls, query, limit=1):
def add_update_stories(self, stories, existing_stories, verbose=False):
ret_values = dict(new=0, updated=0, same=0, error=0)
-
+ error_count = self.error_count
+
if settings.DEBUG or verbose:
logging.debug(" ---> [%-30s] ~FBChecking ~SB%s~SN new/updated against ~SB%s~SN stories" % (
self.title[:30],
@@ -852,7 +853,10 @@ def add_update_stories(self, stories, existing_stories, verbose=False):
continue
story_content = story.get('story_content')
- story_content = strip_comments__lxml(story_content)
+ if error_count:
+ story_content = strip_comments__lxml(story_content)
+ else:
+ story_content = strip_comments(story_content)
story_tags = self.get_tags(story)
story_link = self.get_permalink(story)
View
20 utils/story_functions.py
@@ -15,7 +15,8 @@
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
from vendor import reseekfile
-COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
+# COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
+COMMENTS_RE = re.compile('\<!--.*?--\>')
def story_score(story, bottom_delta=None):
# A) Date - Assumes story is unread and within unread range
@@ -197,8 +198,21 @@ def strip_tags(html):
def strip_comments(html_string):
    """Remove HTML comments from ``html_string`` using ``COMMENTS_RE``.

    Falsy input (``None`` or an empty string) is returned unchanged instead
    of being handed to the regex, which would raise ``TypeError`` on ``None``.
    This mirrors the guard used by the lxml-based comment strippers.
    """
    if not html_string:
        return html_string
    return COMMENTS_RE.sub('', html_string)
+
def strip_comments__lxml2(html_string=""):
    """Strip HTML comments by parsing ``html_string`` with lxml.

    Falsy input is returned unchanged. Otherwise the markup is parsed with
    ``lxml.html.fromstring``, every comment node found by the
    ``//comment()`` XPath is detached, and the tree is serialized with
    ``lxml.etree.tostring``.
    """
    if not html_string:
        return html_string

    tree = lxml.html.fromstring(html_string)

    for comment in tree.xpath('//comment()'):
        parent = comment.getparent()
        if parent is None:
            # A comment with no parent cannot be detached via remove().
            continue

        # remove() discards the node's tail text along with the node, so
        # move the text that follows the comment onto the preceding sibling
        # (or the parent's text) before detaching it.
        tail = comment.tail
        if tail:
            previous = comment.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + tail
            else:
                parent.text = (parent.text or '') + tail

        parent.remove(comment)

    return lxml.etree.tostring(tree)
+
+def strip_comments__lxml(html_string=""):
+ if not html_string: return html_string
-def strip_comments__lxml(html_string):
params = {
'comments': True,
'scripts': False,
@@ -225,7 +239,7 @@ def strip_comments__lxml(html_string):
return lxml.etree.tostring(clean_html)
except XMLSyntaxError:
return html_string
-
+
def linkify(*args, **kwargs):
    """Call ``linkify_tornado`` with the given arguments and XHTML-unescape its result."""
    linked = linkify_tornado(*args, **kwargs)
    return xhtml_unescape_tornado(linked)
Please sign in to comment.
Something went wrong with that request. Please try again.