Permalink
Browse files

Merge pull request #1 from coleifer/master

latest from upstream
  • Loading branch information...
2 parents 36d06b7 + 25b737e commit 3fe7efc77f8345fa48941605900b027e18266ec0 @pombredanne committed Mar 27, 2013
View
@@ -105,6 +105,23 @@ Providers
pr = bootstrap_embedly(key='my-embedly-key')
pr.request('http://www.youtube.com/watch?v=54XHDUOHuzU')
+.. py:function:: bootstrap_noembed([cache=None, [**kwargs]])
+
+ Create a :py:class:`ProviderRegistry` and register as many providers as
+ are supported by `noembed.com <http://noembed.com>`_. Valid services are
+ fetched from http://noembed.com/providers, parsed, and then registered.
+
+ :param cache: an object that implements simple ``get`` and ``set`` methods
+ :param kwargs: any default keyword arguments to use with providers, useful for
+ passing the ``nowrap`` option to noembed.
+ :rtype: a ProviderRegistry with support for noembed
+
+ .. code-block:: python
+
+ # default provider arguments, such as nowrap, can be specified here
+ pr = bootstrap_noembed(nowrap=1)
+ pr.request('http://www.youtube.com/watch?v=54XHDUOHuzU')
+
Parsers
-------
View
@@ -48,9 +48,9 @@
# built documents.
#
# The short X.Y version.
-version = '0.2.4'
+version = '0.2.6'
# The full version, including alpha/beta/rc tags.
-release = '0.2.4'
+release = '0.2.6'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -35,7 +35,7 @@ def fix_width_height(width_height, params):
if 'x' in width_height:
params['maxwidth'], params['maxheight'] = map(int, width_height.split('x'))
else:
- params['maxwidth'] = int(width_height[0])
+ params['maxwidth'] = int(width_height)
params.pop('maxheight', None)
return params
@@ -9,10 +9,15 @@ def render(self, s, **params):
s = '{%% load micawber_tags %%}%s' % s
return Template(s).render(Context(params)).strip()
+ def test_fix_wh(self):
+ from micawber.contrib.mcdjango import fix_width_height
+ self.assertEqual(fix_width_height('300x400', {}), {'maxwidth': 300, 'maxheight': 400})
+ self.assertEqual(fix_width_height('300', {}), {'maxwidth': 300})
+
def test_provider_loading(self):
from micawber.contrib.mcdjango import providers
self.assertEqual(providers, test_pr)
-
+
def test_oembed_filter_multiline_plain(self):
for url, expected in self.full_pairs.items():
expected_inline = self.inline_pairs[url]
@@ -22,7 +27,7 @@ def test_oembed_filter_multiline_plain(self):
parsed = self.render('{{ test_str|oembed }}', test_str=test_str)
self.assertEqual(parsed, frame % (expected_inline, expected, expected_inline))
-
+
def test_oembed_filter_multiline_html(self):
for url, expected in self.full_pairs.items():
expected_inline = self.inline_pairs[url]
@@ -41,7 +46,7 @@ def test_oembed_filter_multiline_html(self):
parsed = self.render('{{ test_str|oembed_html }}', test_str=test_str)
self.assertEqual(parsed, frame % (url, expected_inline, expected_inline))
-
+
def test_urlize(self):
u1 = 'http://fappio.com/'
u2 = 'http://google.com/fap/'
@@ -55,7 +60,7 @@ def test_urlize(self):
parsed = self.render('{{ test_str|oembed }}', test_str=test_str)
self.assertEqual(parsed, frame % (u1h, u2h, expected, expected_inline))
-
+
def test_oembed_filter_extension(self):
for url, expected in self.full_pairs.items():
expected_inline = self.inline_pairs[url]
@@ -65,7 +70,7 @@ def test_oembed_filter_extension(self):
parsed = self.render('{{ test_str|oembed_no_urlize }}', test_str=test_str)
self.assertEqual(parsed, frame % (expected, expected_inline))
-
+
def test_extract_filter(self):
blank = 'http://fapp.io/foo/'
frame = 'test %s\n%s\n%s\n%s at last'
@@ -78,7 +83,7 @@ def test_extract_filter(self):
test_str = frame % (url, blank, url, blank)
rendered = self.render(t, test_str=test_str)
self.assertEqual(rendered, url)
-
+
test_str = frame_html % (url, blank, url, blank)
rendered = self.render(t, test_str=test_str)
self.assertEqual(rendered, url)
@@ -31,7 +31,7 @@ class GoogleMapsProvider(Provider):
pr = micawber.bootstrap_basic()
pr.register(GoogleMapsProvider.regex, GoogleMapsProvider(''))
"""
- regex = r'^http://maps.google.com/maps\?([^\s]+)'
+ regex = r'^https?://maps.google.com/maps\?([^\s]+)'
valid_params = ['q', 'z']
View
@@ -1,2 +1,5 @@
class ProviderException(Exception):
pass
+
+class ProviderNotFoundException(ProviderException):
+ pass
View
@@ -118,16 +118,17 @@ def parse_html(html, providers, urlize_all=True, handler=full_handler, block_han
if not BeautifulSoup:
raise Exception('Unable to parse HTML, please install BeautifulSoup or use the text parser')
- soup = BeautifulSoup(html)
+ soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
for url in soup.findAll(text=re.compile(url_re)):
if not _inside_skip(url):
if _is_standalone(url):
url_handler = handler
else:
- url_handler = inline_handler
+ url_handler = block_handler
- replacement = parse_text_full(str(url), providers, urlize_all, url_handler, **params)
+ url_unescaped = url.string
+ replacement = parse_text_full(url_unescaped, providers, urlize_all, url_handler, **params)
url.replaceWith(BeautifulSoup(replacement))
return unicode(soup)
@@ -136,7 +137,7 @@ def extract_html(html, providers, **params):
if not BeautifulSoup:
raise Exception('Unable to parse HTML, please install BeautifulSoup or use the text parser')
- soup = BeautifulSoup(html)
+ soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
all_urls = set()
urls = []
extracted_urls = {}
View
@@ -3,14 +3,13 @@
import re
import socket
import urllib2
-import sys
from urllib import urlencode
try:
import simplejson as json
except ImportError:
import json
-from micawber.exceptions import ProviderException
+from micawber.exceptions import ProviderException, ProviderNotFoundException
class Provider(object):
@@ -29,6 +28,8 @@ def fetch(self, url):
resp = urllib2.urlopen(req)
except urllib2.URLError:
return False
+ except urllib2.HTTPError:
+ return False
except socket.timeout:
return False
@@ -70,6 +71,7 @@ def handle_response(self, response, url):
def make_key(*args, **kwargs):
return hashlib.md5(pickle.dumps((args, kwargs))).hexdigest()
+
def url_cache(fn):
def inner(self, url, **params):
if self.cache:
@@ -82,6 +84,7 @@ def inner(self, url, **params):
return fn(self, url, **params)
return inner
+
class ProviderRegistry(object):
def __init__(self, cache=None):
self._registry = {}
@@ -106,26 +109,86 @@ def request(self, url, **params):
provider = self.provider_for_url(url)
if provider:
return provider.request(url, **params)
- raise ProviderException('Provider not found for "%s"' % url)
+ raise ProviderNotFoundException('Provider not found for "%s"' % url)
def bootstrap_basic(cache=None):
# compliments of oembed.com#section7
pr = ProviderRegistry(cache)
- pr.register('http://\S*?flickr.com/\S*', Provider('http://www.flickr.com/services/oembed/'))
- pr.register('https?://\S*.youtu(\.be|be\.com)/watch\S*', Provider('http://www.youtube.com/oembed'))
- pr.register('http://\S*.viddler.com/\S*', Provider('http://lab.viddler.com/services/oembed/'))
- pr.register('http://qik.com/video/\S*', Provider('http://qik.com/api/oembed.json'))
- pr.register('http://\S*.revision3.com/\S*', Provider('http://revision3.com/api/oembed/'))
- pr.register('http://www.hulu.com/watch/\S*', Provider('http://www.hulu.com/api/oembed.json'))
- pr.register('http://vimeo.com/\S*', Provider('http://vimeo.com/api/oembed.json'))
- pr.register('http://www.polleverywhere.com/(polls|multiple_choice_polls|free_text_polls)/\S*', Provider('http://www.polleverywhere.com/services/oembed/'))
- pr.register('http://www.ifixit.com/Guide/View/\S*', Provider('http://www.ifixit.com/Embed'))
+
+ # b
+ pr.register('http://blip.tv/\S+', Provider('http://blip.tv/oembed'))
+
+ # c
+ pr.register('http://chirb.it/\S+', Provider('http://chirb.it/oembed.json'))
+ pr.register('https://www.circuitlab.com/circuit/\S+', Provider('https://www.circuitlab.com/circuit/oembed'))
+ pr.register('http://www.collegehumor.com/video/\S+', Provider('http://www.collegehumor.com/oembed.json'))
+
+ # d
+ pr.register('https?://(www\.)?dailymotion\.com/\S+', Provider('http://www.dailymotion.com/services/oembed'))
+
+ # f
+ pr.register('http://\S*?flickr.com/\S+', Provider('http://www.flickr.com/services/oembed/'))
+ pr.register('http://flic\.kr/\S*', Provider('http://www.flickr.com/services/oembed/'))
+ pr.register('https?://(www\.)?funnyordie\.com/videos/\S+', Provider('http://www.funnyordie.com/oembed'))
+
+ # g
+ pr.register(r'https?://gist.github.com/\S*', Provider('https://github.com/api/oembed'))
+
+ # h
+ pr.register('http://www.hulu.com/watch/\S+', Provider('http://www.hulu.com/api/oembed.json'))
+
+ # i
+ pr.register('http://www.ifixit.com/Guide/View/\S+', Provider('http://www.ifixit.com/Embed'))
+ pr.register('http://\S*imgur\.com/\S+', Provider('http://api.imgur.com/oembed')),
+ pr.register('http://instagr(\.am|am\.com)/p/\S+', Provider('http://api.instagram.com/oembed'))
+
+ # j
+ pr.register('http://www.jest.com/(video|embed)/\S+', Provider('http://www.jest.com/oembed.json'))
+
+ # m
+ pr.register('http://www.mobypicture.com/user/\S*?/view/\S*', Provider('http://api.mobypicture.com/oEmbed'))
+ pr.register('http://moby.to/\S*', Provider('http://api.mobypicture.com/oEmbed'))
+
+ # p
+ pr.register('http://i\S*.photobucket.com/albums/\S+', Provider('http://photobucket.com/oembed'))
+ pr.register('http://gi\S*.photobucket.com/groups/\S+', Provider('http://photobucket.com/oembed'))
+ pr.register('http://www.polleverywhere.com/(polls|multiple_choice_polls|free_text_polls)/\S+', Provider('http://www.polleverywhere.com/services/oembed/'))
+ pr.register('https?://(.+\.)?polldaddy\.com/\S*', Provider('http://polldaddy.com/oembed/'))
+
+ # q
+ pr.register('http://qik.com/video/\S+', Provider('http://qik.com/api/oembed.json'))
+
+ # r
+ pr.register('http://\S*.revision3.com/\S+', Provider('http://revision3.com/api/oembed/'))
+
+ # s
+ pr.register('http://www.slideshare.net/[^\/]+/\S+', Provider('http://www.slideshare.net/api/oembed/2'))
+ pr.register('http://slidesha\.re/\S*', Provider('http://www.slideshare.net/api/oembed/2'))
pr.register('http://\S*.smugmug.com/\S*', Provider('http://api.smugmug.com/services/oembed/'))
+ pr.register('https://\S*?soundcloud.com/\S+', Provider('http://soundcloud.com/oembed'))
+ pr.register('https?://speakerdeck\.com/\S*', Provider('https://speakerdeck.com/oembed.json')),
+ pr.register('https?://(www\.)?scribd\.com/\S*', Provider('http://www.scribd.com/services/oembed'))
+
+ # t
+ pr.register('https?://(www\.)?twitter.com/\S+/status(es)?/\S+', Provider('http://api.twitter.com/1/statuses/oembed.json'))
+
+ # v
+ pr.register('http://\S*.viddler.com/\S*', Provider('http://lab.viddler.com/services/oembed/'))
+ pr.register('http://vimeo.com/\S+', Provider('http://vimeo.com/api/oembed.json'))
+ pr.register('https://vimeo.com/\S+', Provider('https://vimeo.com/api/oembed.json'))
+
+ # y
+ pr.register('https?://(\S*.)?youtu(\.be/|be\.com/watch)\S+', Provider('http://www.youtube.com/oembed'))
+ pr.register('http://(\S*\.)?yfrog\.com/\S*', Provider('http://www.yfrog.com/api/oembed'))
+
+ # w
pr.register('http://\S+.wordpress.com/\S+', Provider('http://public-api.wordpress.com/oembed/'))
- pr.register('http://www.slideshare.net/[^\/]+/\S*', Provider('http://www.slideshare.net/api/oembed/2'))
+ pr.register('https?://wordpress.tv/\S+', Provider('http://wordpress.tv/oembed/'))
+
return pr
+
def bootstrap_embedly(cache=None, **params):
endpoint = 'http://api.embed.ly/1/oembed'
schema_url = 'http://api.embed.ly/1/services/python'
@@ -143,3 +206,22 @@ def bootstrap_embedly(cache=None, **params):
for regex in provider_meta['regex']:
pr.register(regex, Provider(endpoint, **params))
return pr
+
+
+def bootstrap_noembed(cache=None, **params):
+ endpoint = 'http://noembed.com/embed'
+ schema_url = 'http://noembed.com/providers'
+
+ pr = ProviderRegistry(cache)
+
+ # fetch the schema
+ resp = urllib2.urlopen(schema_url)
+ contents = resp.read()
+ resp.close()
+
+ json_data = json.loads(contents)
+
+ for provider_meta in json_data:
+ for regex in provider_meta['patterns']:
+ pr.register(regex, Provider(endpoint, **params))
+ return pr
View
@@ -217,3 +217,18 @@ def test_outside_of_markup(self):
for url, expected in self.full_pairs.items():
parsed = parse_html(frame % (url), test_pr)
self.assertEqual(parsed, frame % (expected))
+
+ def test_html_entities(self):
+ frame_html = '<p>test %s</p><p><a href="foo">%s</a></p>'
+
+ for url, expected in self.data_pairs.items():
+ esc_url = url.replace('&', '&amp;')
+ all_urls, extracted = extract_html(frame_html % (esc_url, esc_url), test_pr)
+ self.assertEqual(all_urls, [url])
+
+ if 'url' not in expected:
+ expected['url'] = url
+ self.assertEqual(extracted, {url: expected})
+
+ rendered = parse_html('<p>%s</p>' % esc_url, test_pr)
+ self.assertEqual(rendered, '<p>%s</p>' % self.full_pairs[url])
View
@@ -11,7 +11,7 @@ def run_django_tests():
import django
except ImportError:
print 'Skipping django tests'
- return failures, errors
+ return
else:
print 'Running django integration tests'
View
@@ -7,7 +7,7 @@
setup(
name='micawber',
- version="0.2.4",
+ version="0.2.6",
description='a small library for extracting rich content from urls',
long_description=readme,
author='Charles Leifer',

0 comments on commit 3fe7efc

Please sign in to comment.