Skip to content

Commit

Permalink
[tumblr] filter out some non-image urls
Browse files (browse the repository at this point in the history)
  • Loading branch information
soimort committed Oct 29, 2015
1 parent 42ed56d commit 214deb6
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/you_get/extractors/tumblr.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,9 +14,9 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
r1(r'<title>([^<\n]*)', html)
-urls = re.findall(r'(https?://[^;"]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
-re.findall(r'(https?://[^;"]+/tumblr_[^;"]+_\d+\.png)', html) +\
-re.findall(r'(https?://[^;"]+/tumblr_[^";]+_\d+\.gif)', html)
+urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
+re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\
+re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html)

tuggles = {}
for url in urls:
Expand Down

0 comments on commit 214deb6

Please sign in to comment.