
Add a cache by URL to the media scraper

Pull media embeds and thumbnails from a cache when cache use is requested
and cached media are available.
1 parent 34fd7f8 commit 11e739e1c675a77bc5ed4753aab4e4f82afa79c8 David Ehrmann committed Feb 21, 2014
Showing with 257 additions and 32 deletions.
  1. +82 −32 r2/r2/lib/media.py
  2. +175 −0 r2/r2/models/media_cache.py
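
A minimal sketch of how the new cache path is exercised, using the _scrape_media helper introduced below (the URL and max_cache_age value here are hypothetical; the function and its keyword arguments come from the diff):

    from datetime import timedelta

    # The first call for a URL scrapes and writes the result to Cassandra;
    # a repeat call within max_cache_age is served from the cache.
    media = _scrape_media("http://example.com/video", use_cache=True)
    media = _scrape_media("http://example.com/video", use_cache=True,
                          max_cache_age=timedelta(hours=1))

    # force=True skips the cache read but still refreshes the stored entry.
    media = _scrape_media("http://example.com/video", force=True, use_cache=True)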
r2/r2/lib/media.py
@@ -47,6 +47,15 @@
from r2.lib.nymph import optimize_png
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException, domain
from r2.models.link import Link
+from r2.models.media_cache import (
+ ERROR_MEDIA,
+ Media,
+ MediaByURL,
+)
+from urllib2 import (
+ HTTPError,
+ URLError,
+)
MEDIA_FILENAME_LENGTH = 12
@@ -238,42 +247,76 @@ def upload_stylesheet(content):
return g.media_provider.put(file_name, content)
-def _set_media(link, force=False):
- if link.is_self:
- return
- if not force and link.promoted:
- return
- elif not force and (link.has_thumbnail or link.media_object):
- return
+def _scrape_media(url, autoplay=False, force=False, use_cache=False,
+ max_cache_age=None):
+ media = None
- scraper = Scraper.for_url(link.url)
- thumbnail, media_object, secure_media_object = scraper.scrape()
+ # Use media from the cache (if available)
+ if not force and use_cache:
+ media_by_url = MediaByURL.get(url, autoplay=bool(autoplay),
+ max_cache_age=max_cache_age)
+ if media_by_url:
+ media = media_by_url.media
+
+ # Otherwise, scrape it
+ if not media:
+ media_object = secure_media_object = None
+ thumbnail_image = thumbnail_url = thumbnail_size = None
+
+ scraper = Scraper.for_url(url, autoplay=autoplay)
+ try:
+ thumbnail_image, media_object, secure_media_object = (
+ scraper.scrape())
+ except (HTTPError, URLError) as e:
+ if use_cache:
+ MediaByURL.add_error(url, str(e),
+ autoplay=bool(autoplay))
+ return None
- if media_object:
# the scraper should be able to make a media embed out of the
# media object it just gave us. if not, null out the media object
# to protect downstream code
- res = scraper.media_embed(media_object)
-
- if not res:
- print "%s made a bad media obj for link %s" % (scraper, link._id36)
+ if media_object and not scraper.media_embed(media_object):
+ print "%s made a bad media obj for url %s" % (scraper, url)
media_object = None
- if secure_media_object:
- res = scraper.media_embed(secure_media_object)
-
- if not res:
- print "%s made a bad secure media obj for link %s" % (scraper,
- link._id36)
+ if (secure_media_object and
+ not scraper.media_embed(secure_media_object)):
+ print "%s made a bad secure media obj for url %s" % (scraper, url)
secure_media_object = None
- if thumbnail:
- link.thumbnail_url = upload_media(thumbnail)
- link.thumbnail_size = thumbnail.size
+ if thumbnail_image:
+ thumbnail_size = thumbnail_image.size
+ thumbnail_url = upload_media(thumbnail_image)
- link.set_media_object(media_object)
- link.set_secure_media_object(secure_media_object)
- link._commit()
+ media = Media(media_object, secure_media_object,
+ thumbnail_url, thumbnail_size)
+
+ # Store the media in the cache (if requested), possibly extending the TTL
+ if use_cache and media is not ERROR_MEDIA:
+ MediaByURL.add(url, media, autoplay=bool(autoplay))
+
+ return media
+
+
+def _set_media(link, force=False, **kwargs):
+ if link.is_self:
+ return
+ if not force and link.promoted:
+ return
+ elif not force and (link.has_thumbnail or link.media_object):
+ return
+
+ media = _scrape_media(link.url, force=force, **kwargs)
+
+ if media and not link.promoted:
+ link.thumbnail_url = media.thumbnail_url
+ link.thumbnail_size = media.thumbnail_size
+
+ link.set_media_object(media.media_object)
+ link.set_secure_media_object(media.secure_media_object)
+
+ link._commit()
def force_thumbnail(link, image_data, file_type=".jpg"):
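
The rewritten flow above gives _scrape_media three distinct outcomes; a hedged sketch of how a caller tells them apart (the URL is hypothetical, the names come from the hunk):

    media = _scrape_media("http://example.com/video", use_cache=True)
    if media is None:
        # the scrape itself just failed (HTTPError/URLError); with
        # use_cache=True the failure was recorded via MediaByURL.add_error
        pass
    elif media is ERROR_MEDIA:
        # a previously cached failure for this URL; every field is None
        pass
    else:
        media.media_object         # oEmbed-style dict, or None
        media.secure_media_object  # https variant, or None
        media.thumbnail_size       # (width, height) tuple, or None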
@@ -344,15 +387,15 @@ def _make_thumbnail_from_url(thumbnail_url, referer):
class Scraper(object):
@classmethod
- def for_url(cls, url):
+ def for_url(cls, url, autoplay=False):
scraper = hooks.get_hook("scraper.factory").call_until_return(url=url)
if scraper:
return scraper
embedly_services = _fetch_embedly_services()
for service_re, service_secure in embedly_services:
if service_re.match(url):
- return _EmbedlyScraper(url, service_secure)
+ return _EmbedlyScraper(url, service_secure, autoplay=autoplay)
return _ThumbnailOnlyScraper(url)
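
A quick illustration of the dispatch above; whether a concrete URL matches Embedly's service list depends on the regexes fetched at runtime, so treat the example URL as an assumption:

    scraper = Scraper.for_url("http://www.youtube.com/watch?v=abc123",
                              autoplay=True)
    # If the URL matches an Embedly service regex, this returns an
    # _EmbedlyScraper carrying {"autoplay": "true"}; otherwise it falls
    # through to _ThumbnailOnlyScraper, which has no use for the hint.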
@@ -438,18 +481,25 @@ def _find_thumbnail_image(self):
class _EmbedlyScraper(Scraper):
EMBEDLY_API_URL = "https://api.embed.ly/1/oembed"
- def __init__(self, url, can_embed_securely):
+ def __init__(self, url, can_embed_securely, autoplay=False):
self.url = url
self.can_embed_securely = can_embed_securely
+ self.embedly_params = {}
+
+ if autoplay:
+ self.embedly_params["autoplay"] = "true"
def _fetch_from_embedly(self, secure):
- params = urllib.urlencode({
+ param_dict = {
"url": self.url,
"format": "json",
"maxwidth": 600,
"key": g.embedly_api_key,
"secure": "true" if secure else "false",
- })
+ }
+
+ param_dict.update(self.embedly_params)
+ params = urllib.urlencode(param_dict)
content = requests.get(self.EMBEDLY_API_URL + "?" + params).content
return json.loads(content)
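
With autoplay requested, the oEmbed call above carries one extra parameter. A sketch of the resulting request (API key elided; query-string ordering follows dict iteration and is not guaranteed):

    scraper = _EmbedlyScraper("http://vimeo.com/123456",
                              can_embed_securely=True, autoplay=True)
    # scraper._fetch_from_embedly(secure=False) GETs something like:
    # https://api.embed.ly/1/oembed?url=http%3A%2F%2Fvimeo.com%2F123456
    #     &format=json&maxwidth=600&key=...&secure=false&autoplay=true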
@@ -527,7 +577,7 @@ def process_link(msg):
link = Link._by_fullname(msg.body, data=True)
try:
- TimeoutFunction(_set_media, 30)(link)
+ TimeoutFunction(_set_media, 30)(link, use_cache=True)
except TimeoutFunctionException:
print "Timed out on %s" % fname
except KeyboardInterrupt:
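
One consequence of passing use_cache=True at this call site: repeated submissions of the same URL within the cache TTL reuse the stored embed instead of re-scraping. A hypothetical replay with two links sharing one URL:

    TimeoutFunction(_set_media, 30)(link_a, use_cache=True)  # scrapes and caches
    TimeoutFunction(_set_media, 30)(link_b, use_cache=True)  # cache hit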
r2/r2/models/media_cache.py
@@ -0,0 +1,175 @@
+# The contents of this file are subject to the Common Public Attribution
+# License Version 1.0. (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
+# License Version 1.1, but Sections 14 and 15 have been added to cover use of
+# software over a computer network and provide for limited attribution for the
+# Original Developer. In addition, Exhibit A has been modified to be consistent
+# with Exhibit B.
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
+# the specific language governing rights and limitations under the License.
+#
+# The Original Code is reddit.
+#
+# The Original Developer is the Initial Developer. The Initial Developer of
+# the Original Code is reddit Inc.
+#
+# All portions of the code written by reddit are Copyright (c) 2013-2014 reddit
+# Inc. All Rights Reserved.
+###############################################################################
+
+import collections
+import json
+
+from datetime import (
+ datetime,
+ timedelta,
+)
+from pycassa.system_manager import ASCII_TYPE, UTF8_TYPE
+from r2.lib.db import tdb_cassandra
+
+
+Media = collections.namedtuple('_Media', ("media_object",
+ "secure_media_object",
+ "thumbnail_url",
+ "thumbnail_size"))
+
+ERROR_MEDIA = Media(None, None, None, None)
+
+
+class MediaByURL(tdb_cassandra.View):
+ _use_db = True
+ _connection_pool = 'main'
+ _ttl = timedelta(minutes=720)
+
+ _read_consistency_level = tdb_cassandra.CL.QUORUM
+ _write_consistency_level = tdb_cassandra.CL.QUORUM
+ _int_props = {"thumbnail_width", "thumbnail_height"}
+ _date_props = {"last_modified"}
+ _extra_schema_creation_args = {
+ "key_validation_class": ASCII_TYPE,
+ "column_name_class": UTF8_TYPE,
+ }
+
+ _defaults = {
+ "state": "enqueued",
+ "error": "",
+ "thumbnail_url": "",
+ "thumbnail_width": 0,
+ "thumbnail_height": 0,
+ "media_object": "",
+ "secure_media_object": "",
+ "last_modified": datetime.utcfromtimestamp(0),
+ }
+
+ @classmethod
+ def _rowkey(cls, url, **kwargs):
+ return (
+ url +
+ # pipe is not allowed in URLs, so use it as a delimiter
+ "|" +
+
+ # append the extra cache keys in kwargs as a canonical JSON string
+ json.dumps(
+ kwargs,
+ ensure_ascii=True,
+ encoding="ascii",
+ indent=None,
+ separators=(",", ":"),
+ sort_keys=True,
+ )
+ )
+
+ @classmethod
+ def add_placeholder(cls, url, **kwargs):
+ rowkey = cls._rowkey(url, **kwargs)
+ cls._set_values(rowkey, {
+ "state": "enqueued",
+ "error": "",
+ "last_modified": datetime.utcnow(),
+ })
+
+ @classmethod
+ def add(cls, url, media, **kwargs):
+ rowkey = cls._rowkey(url, **kwargs)
+ columns = cls._defaults.copy()
+
+ columns.update({
+ "state": "processed",
+ "error": "",
+ "last_modified": datetime.utcnow(),
+ })
+
+ if media.thumbnail_url and media.thumbnail_size:
+ columns.update({
+ "thumbnail_url": media.thumbnail_url,
+ "thumbnail_width": media.thumbnail_size[0],
+ "thumbnail_height": media.thumbnail_size[1],
+ })
+
+ if media.media_object:
+ columns.update({
+ "media_object": json.dumps(media.media_object),
+ })
+
+ if media.secure_media_object:
+ columns.update({
+ "secure_media_object": (json.
+ dumps(media.secure_media_object)),
+ })
+
+ cls._set_values(rowkey, columns)
+
+ @classmethod
+ def add_error(cls, url, error, **kwargs):
+ rowkey = cls._rowkey(url, **kwargs)
+ columns = {
+ "error": error,
+ "state": "processed",
+ "last_modified": datetime.utcnow(),
+ }
+ cls._set_values(rowkey, columns)
+
+ @classmethod
+ def get(cls, url, max_cache_age=None, **kwargs):
+ rowkey = cls._rowkey(url, **kwargs)
+ try:
+ temp = cls._byID(rowkey)
+
+ # Return None if this cache entry is too old
+ if (max_cache_age is not None and
+ datetime.utcnow() - temp.last_modified > max_cache_age):
+ return None
+ else:
+ return temp
+ except tdb_cassandra.NotFound:
+ return None
+
+ @property
+ def media(self):
+ if self.state == "processed":
+ if not self.error:
+ media_object = secure_media_object = None
+ thumbnail_url = thumbnail_size = None
+
+ if (self.thumbnail_width and self.thumbnail_height and
+ self.thumbnail_url):
+ thumbnail_url = self.thumbnail_url
+ thumbnail_size = (self.thumbnail_width,
+ self.thumbnail_height)
+
+ if self.media_object:
+ media_object = json.loads(self.media_object)
+
+ if self.secure_media_object:
+ secure_media_object = json.loads(self.secure_media_object)
+
+ return Media(media_object, secure_media_object,
+ thumbnail_url, thumbnail_size)
+ else:
+ return ERROR_MEDIA
+ else:
+ return None
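
A worked example of the _rowkey scheme and a full cache round trip (URL and media values are hypothetical; assumes a configured Cassandra connection pool):

    from datetime import timedelta

    MediaByURL._rowkey("http://example.com/video", autoplay=True)
    # -> 'http://example.com/video|{"autoplay":true}'

    media = Media(media_object={"type": "video", "html": "<iframe></iframe>"},
                  secure_media_object=None,
                  thumbnail_url="http://thumbs.example.com/abc.jpg",
                  thumbnail_size=(70, 70))
    MediaByURL.add("http://example.com/video", media, autoplay=True)

    cached = MediaByURL.get("http://example.com/video",
                            max_cache_age=timedelta(hours=1), autoplay=True)
    assert cached.media.thumbnail_size == (70, 70)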
