Skip to content

Commit

Permalink
Cleaning cache buster from feed and page urls.
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelclay committed Feb 3, 2016
1 parent 2d3098f commit ee2573e
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 7 deletions.
2 changes: 1 addition & 1 deletion apps/rss_feeds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2607,7 +2607,7 @@ def add(cls, feed_id, fetch_type, date=None, message=None, code=None, exception=
history = fetch_history.push_history or []

history = [[date, code, message]] + history
any_exceptions = any([c for d, c, m in history if c >= 400])
any_exceptions = any([c for d, c, m in history if c not in [200, 304]])
if any_exceptions:
history = history[:25]
else:
Expand Down
13 changes: 7 additions & 6 deletions utils/feed_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from utils import feedparser
from utils.story_functions import pre_process_story, strip_tags, linkify
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError, cache_bust_url
from utils.feed_functions import timelimit, TimeoutError, cache_bust_url, clean_cache_bust_url
from BeautifulSoup import BeautifulSoup
from django.utils import feedgenerator
from django.utils.html import linebreaks
Expand Down Expand Up @@ -311,14 +311,14 @@ def process(self):
return FEED_SAME, ret_values

# 302: Temporary redirect: ignore
# 301: Permanent redirect: save it (after 20 tries)
# 301: Permanent redirect: save it (after 10 tries)
if self.fpf.status == 301:
if self.fpf.href.endswith('feedburner.com/atom.xml'):
return FEED_ERRHTTP, ret_values
redirects, non_redirects = self.feed.count_redirects_in_history('feed')
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (10-len(redirects)))
if len(redirects) >= 10 or len(non_redirects) == 0:
self.feed.feed_address = self.fpf.href
if len(redirects) >= 10 or len(non_redirects) == 0:
self.feed.feed_address = clean_cache_bust_url(self.fpf.href)
if not self.feed.known_good:
self.feed.fetched_once = True
logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
Expand Down Expand Up @@ -401,11 +401,12 @@ def process(self):

if not self.feed.feed_link_locked:
new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
new_feed_link = clean_cache_bust_url(new_feed_link)
if new_feed_link != self.feed.feed_link:
logging.debug(" ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.title[:30], self.feed.feed_link, new_feed_link))
redirects, non_redirects = self.feed.count_redirects_in_history('page')
self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (20-len(redirects)))
if len(redirects) >= 20 or len(non_redirects) == 0:
self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (10-len(redirects)))
if len(redirects) >= 10 or len(non_redirects) == 0:
self.feed.feed_link = new_feed_link
self.feed.save(update_fields=['feed_link'])

Expand Down
18 changes: 18 additions & 0 deletions utils/feed_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,28 @@ def append_query_string_to_url(url, **kwargs):
url_parts[4] = urllib.urlencode(query)

return urlparse.urlunparse(url_parts)

def remove_query_string_from_url(url, **kwargs):
    """Return ``url`` with the query parameters named in ``kwargs`` removed.

    Parameters not named in ``kwargs`` are preserved in their original
    order, including duplicate keys (the previous dict-based approach
    dropped duplicates and scrambled ordering on rewrite).

    A bare, valueless query string (e.g. ``?atom``) produces no pairs
    from parse_qsl and is left untouched, since rewriting it would drop
    it entirely.
    """
    url_parts = list(urlparse.urlparse(url))
    # Keep the parsed pairs as a list (not a dict) so parameter order and
    # duplicate keys survive the round-trip through urlencode.
    query = urlparse.parse_qsl(url_parts[4])

    if not url_parts[4] or query:
        # Ensure query string is preserved.
        # ?atom should be preserved, so ignore
        # ?feed=atom is fine
        query = [(key, value) for key, value in query if key not in kwargs]
        url_parts[4] = urllib.urlencode(query)

    return urlparse.urlunparse(url_parts)

def cache_bust_url(url):
    """Append a throwaway ``_`` query parameter with a random value so
    intermediate caches treat the URL as a fresh resource."""
    buster = random.randint(0, 10000)
    return append_query_string_to_url(url, _=buster)

def clean_cache_bust_url(url):
    """Strip the ``_`` cache-busting query parameter (the one added by
    cache_bust_url) from ``url``, leaving all other parameters intact."""
    return remove_query_string_from_url(url, _=True)

# From: http://www.poromenos.org/node/87
def levenshtein_distance(first, second):
"""Find the Levenshtein distance between two strings."""
Expand Down

0 comments on commit ee2573e

Please sign in to comment.