From 1b4e4f9e9847e5883154c46175bdb8e2a4f2e67e Mon Sep 17 00:00:00 2001 From: Fletcher Tomalty Date: Fri, 4 Nov 2011 12:07:17 -0400 Subject: [PATCH] Added support for the follow API and fixed issue #2 on bitbucket. Significant change. All API functions return dictionaries. Google Appengine support is not currently working. Caching has bugs, and the wrapper defaults to no caching. CLI tool does not currently support the follow API. Unit tests coming soon. --- README | 16 +- diffbot/__init__.py | 10 +- diffbot/cache.py | 208 ++++++++++--------- diffbot/diffbot.py | 471 ++++++++++++++++++++++++++------------------ diffbot/handlers.py | 187 +++++++++--------- setup.py | 2 +- 6 files changed, 489 insertions(+), 405 deletions(-) diff --git a/README b/README index 435fcec..737d68f 100644 --- a/README +++ b/README @@ -29,13 +29,13 @@ $ ./diffbot.py Usage: diffbot.py: [options] [url] Options: - -h, --help show this help message and exit - -d, --debug - -v, --verbose - -q, --quiet - -o OFORMAT, --output=OFORMAT - Ouput format (html, raw, json, pretty) - -k KEY Diffbot developer API key + -h, --help show this help message and exit + -d, --debug + -v, --verbose + -q, --quiet + -o OFORMAT, --output=OFORMAT + Ouput format (html, raw, json, pretty) + -k KEY Diffbot developer API key }}} @@ -60,7 +60,7 @@ import diffbot def main(url): db = diffbot.DiffBot(dev_token = "mydevtoken") - article = db.get_article(url) + article = db.article(url) }}} === Links and Resources === diff --git a/diffbot/__init__.py b/diffbot/__init__.py index 8e88114..30a63f3 100644 --- a/diffbot/__init__.py +++ b/diffbot/__init__.py @@ -8,16 +8,16 @@ # URL: """ - py-diffbot + py-diffbot - Python client and library for the Diffbot article API and others. + Python client and library for the Diffbot article API and others. - :copyright: Copyright (C) 2011 Nik Cubrilovic and others, see AUTHORS - :license: new BSD, see LICENSE for more details. + :copyright: Copyright (C) 2011 Nik Cubrilovic and others, see AUTHORS + :license: new BSD, see LICENSE for more details. """ __author__ = 'Nik Cubrilovic ' -__version__ = '0.0.1' +__version__ = '0.0.2' __status__ = 'beta' __date__ = '28th March 2011' diff --git a/diffbot/cache.py b/diffbot/cache.py index 0521641..adeee38 100644 --- a/diffbot/cache.py +++ b/diffbot/cache.py @@ -2,153 +2,143 @@ # -*- coding: utf-8 -*- # vim:ts=4:sw=4:expandtab """ - py-diffbot - cache.py + py-diffbot - cache.py - Caching handlers with support for file, GAE memcache and python memcache + Caching handlers with support for file, GAE memcache and python memcache - This source file is subject to the new BSD license that is bundled with this - package in the file LICENSE.txt. The license is also available online at the - URL: + This source file is subject to the new BSD license that is bundled with this + package in the file LICENSE.txt. The license is also available online at the + URL: - :copyright: Copyright (C) 2011 Nik Cubrilovic and others, see AUTHORS - :license: new BSD, see LICENSE for more details. + :copyright: Copyright (C) 2011 Nik Cubrilovic and others, see AUTHORS + :license: new BSD, see LICENSE for more details. """ -__version__ = '0.0.1' +__version__ = '0.0.2' __author__ = 'Nik Cubrilovic ' -import os, sys, logging, hashlib +import os, logging, hashlib, urllib try: - from google.appengine.api import memcache - GAE = True + from google.appengine.api import memcache + GAE = True except ImportError: - GAE = False - try: - import memcache - LOCAL_MEMCACHE = True - except ImportError: - LOCAL_MEMCACHE = False + GAE = False + try: + import memcache + LOCAL_MEMCACHE = True + except ImportError: + LOCAL_MEMCACHE = False #--------------------------------------------------------------------------- -# Handler Classes +# Handler Classes #--------------------------------------------------------------------------- class CacheHandler(object): - options = None - - def __init__(self, options): - """docstring for __init__""" - self.options = options - - def wrap(self, func): - """docstring for fname""" - def cache(*args, **kwargs): - logging.info("Called fetch function with") - key = self.hash(args[0]) - cache_store = self.get(key) - if cache_store: - return cache_store - val = func(*args, **kwargs) - if val: - self.set(key, val) - return val - return cache - - def hash(self, key_name): - return hashlib.sha1(key_name).hexdigest() + options = None + + def __init__(self, options): + self.options = options + + def wrap(self, func): + def cache(url, data): + logging.info("Called fetch function with") + key = self.hash(url + '?' + urllib.urlencode(data)) + cache_store = self.get(key) + if cache_store: + return cache_store + val = func(url, data) + if val: + self.set(key, val) + return val + return cache + + def hash(self, key): + return hashlib.sha1(key).hexdigest() class NullHandler(CacheHandler): - """docstring for NullHandler""" - def __init__(self, options): - return None + def __init__(self, options): + pass - def wrap(self, func): - return func + def wrap(self, func): + return func class MemcacheHandler(CacheHandler): - def fname(self): - """docstring for fname""" - pass + def fname(self): + pass - def fname(self): - """docstring for fname""" - pass + def fname(self): + pass class GAEMemcacheHandler(CacheHandler): - """docstring for GAEMemcacheHandler""" + ttl = 60 * 60 * 24 * 4 - ttl = 60 * 60 * 24 * 4 + def get(self, key): + return memcache.get(key) - def get(self, key): - """docstring for get""" - return memcache.get(key) - - def set(self, key, value): - """docstring for set""" - return memcache.set(key, value, self.ttl) + def set(self, key, value): + return memcache.set(key, value, self.ttl) class FileCacheHandler(CacheHandler): - """docstring for FileCacheHandler""" - - cache_folder = None - - def __init__(self, options): - if options.has_key('cache_folder'): - cf = options['cache_folder'] - if not cf.startswith('/'): - cf = os.path.join(os.path.dirname(__file__), cf) - if os.path.isdir(cf): - self.cache_folder = options['cache_folder'] - else: - raise Exception("Not a valid cache folder: %s (got: %s)" % (cf, os.path.isdir(cf))) - else: - import tempfile - self.cache_folder = tempfile.gettempdir() - - def get_filepath(self, key): - return os.path.join(self.cache_folder, "%s.txt" % key) - - def get(self, key): - file_path = self.get_filepath(key) - if os.path.isfile(file_path): - logging.info("CACHE HIT") - return open(file_path).read() - return False - - def set(self, key, value): - file_path = self.get_filepath(key) - try: - f = open(file_path, 'w') - f.write(value) - except Exception, e: - logging.error("Exception: could not write file %s" % (file_path)) - logging.exception(e) - return False - return True + + cache_folder = None + + def __init__(self, options): + if options is not None and options.has_key('cache_folder'): + cf = options['cache_folder'] + if not cf.startswith('/'): + cf = os.path.join(os.path.dirname(__file__), cf) + if os.path.isdir(cf): + self.cache_folder = options['cache_folder'] + else: + raise Exception("Not a valid cache folder: %s (got: %s)" % (cf, os.path.isdir(cf))) + else: + import tempfile + self.cache_folder = tempfile.gettempdir() + + def get_filepath(self, key): + return os.path.join(self.cache_folder, "%s.txt" % key) + + def get(self, key): + file_path = self.get_filepath(key) + if os.path.isfile(file_path): + logging.info("CACHE HIT") + return open(file_path).read() + return False + + def set(self, key, value): + file_path = self.get_filepath(key) + try: + f = open(file_path, 'w') + f.write(value) + except Exception, e: + logging.error("Exception: could not write file %s" % (file_path)) + logging.exception(e) + return False + return True #--------------------------------------------------------------------------- -# Handler Class +# Handler Class #--------------------------------------------------------------------------- def handler(cache_options = None): - if cache_options: - if cache_options.has_key('handler'): - if cache_options['handler'] == 'memcache' and GAE: + if cache_options: + if cache_options.has_key('handler'): + if cache_options['handler'] == 'memcache' and GAE: + return GAEMemcacheHandler(cache_options) + elif cache_options['handler'] == 'memcache' and LOCAL_MEMCACHE: + return MemcacheHandler(cache_options) + elif cache_options['handler'] == 'file': + return FileCacheHandler(cache_options) + if GAE: return GAEMemcacheHandler(cache_options) - elif cache_options['handler'] == 'memcache' and LOCAL_MEMCACHE: + if LOCAL_MEMCACHE and cache_options.has_key('memcache_server'): return MemcacheHandler(cache_options) - elif cache_options['handler'] == 'file': - return FileCacheHandler(cache_options) - if GAE: - return GAEMemcacheHandler(cache_options) - if LOCAL_MEMCACHE and cache_options.has_key('memcache_server'): - return MemcacheHandler(cache_options) - return FileCacheHandler(cache_options) + return NullHandler(cache_options) diff --git a/diffbot/diffbot.py b/diffbot/diffbot.py index 23b0cef..ed210c6 100755 --- a/diffbot/diffbot.py +++ b/diffbot/diffbot.py @@ -2,239 +2,324 @@ # -*- coding: utf-8 -*- # vim:ts=4:sw=4:expandtab """ - py-diffbot - diffbot.py + py-diffbot - diffbot.py - Python client and library for the Diffbot article API and others. + Python client and library for the Diffbot article API and others. - This source file is subject to the new BSD license that is bundled with this - package in the file LICENSE.txt. The license is also available online at the - URL: + This source file is subject to the new BSD license that is bundled with this + package in the file LICENSE.txt. The license is also available online at the + URL: - :copyright: Copyright (C) 2011 Nik Cubrilovic and others, see AUTHORS - :license: new BSD, see LICENSE for more details. + :copyright: Copyright (C) 2011 Nik Cubrilovic and others, see AUTHORS + :license: new BSD, see LICENSE for more details. """ -__version__ = '0.0.1' +__version__ = '0.0.2' __author__ = 'Nik Cubrilovic ' import os, sys, logging, urlparse, urllib try: - import json + import json except ImportError: - try: - import simplejson as json - except ImportError: - _JSON = False + try: + import simplejson as json + except ImportError: + _JSON = False -class DiffBot(): - """DiffBot API Client - - Make requests to the DiffBot API. Client library has built-in support for - multiple http client libraries, caching with a local file cache and memcache - and Google App Engine (defaults to urlfetch and memcache). - - Initialization options are caching options and developer token, which is - required for all requests. +try: + from lxml import etree +except ImportError: + try: + # Python 2.5 + import xml.etree.cElementTree as etree + except ImportError: + try: + # Python 2.5 + import xml.etree.ElementTree as etree + except ImportError: + try: + # normal cElementTree install + import cElementTree as etree + except ImportError: + # normal ElementTree install + try: + import elementtree.ElementTree as etree + except ImportError: + _ETREE = False - Usage: - >>> import diffbot - >>> db = diffbot.DiffBot(dev_token="mydevtoken") - >>> db.get_article("http://www.newssite.com/newsarticle.html") - [parsed article here] - :since: v0.1 - """ +class DiffBot(): + """DiffBot API Client - api_endpoint = "http://www.diffbot.com/api/article" - request_attempts = 3 + Make requests to the DiffBot API. Client library has built-in support for + multiple http client libraries, caching with a local file cache and memcache + and Google App Engine (defaults to urlfetch and memcache). - def __init__(self, cache_options = None, dev_token = None, attempts = 3): - """Initialize the DiffBot API client. Parameters are cache options and the - required developer token. + Initialization options are caching options and developer token, which is + required for all requests. - Cache options as a dict with key: - handler: memcache or file - cache_dir: if file cache, use cache folder (default tmp) - memcache_server: memcache server IP address - memcache_user: memcache username + Usage: - dev_token is a required developer token + >>> import diffbot + >>> db = diffbot.DiffBot(dev_token="mydevtoken") + >>> db.article("http://www.newssite.com/newsarticle.html") + [parsed article here] - attempts is the number of http request attempts to make on failure + :since: v0.1 """ - if not dev_token: - dev_token = os.environ.get('DIFFBOT_TOKEN', False) - - if not dev_token: - raise Exception("Please provide a dev token") - - self.dev_token = dev_token - - from handlers import handler - self._http_handler_class = handler() - self._http_handle = self._http_handler_class(cache_options) - - def http_handler(self): - """Returns the http handler object, which implements handlers.HttpHandler. - Implements a single function, fetch, which has a single argument, the url. - Handler classes wrap Google App Engine urlfetch library and the various - options and exceptions, as well as urllib and urllib2, which will be selected - automatically depending on Python version and environment. - """ - return self._http_handle - - def get_req_args(self, url, format = 'json', comments = False, stats = False): - """Build request arguments for query string in API request. Defaults are - to request JSON output format.""" - # TODO some of these are not implemented. can we order the dict? - api_arguments = { - "token": self.dev_token, - "url": url, - # "tags": '1' - } - if format != 'json': - api_arguments['format'] = format - if comments: - api_arguments['comments'] = True - if stats: - api_arguments['stats'] = True - - query_string = urllib.urlencode(api_arguments) - - return query_string - - def get_article(self, article_url, format = 'json', comments = False, - stats = False, dirty_hack = False): - """Make an API request to the DiffBot server to retrieve an article. - - Requires article_url - """ - api_args = self.get_req_args(article_url) - url = self.api_endpoint + '?' + api_args + api_endpoint_base = "http://www.diffbot.com/api/" + request_attempts = 3 + + def __init__(self, cache_options = None, dev_token = None, attempts = 3): + """Initialize the DiffBot API client. Parameters are cache options and the + required developer token. + + Cache options as a dict with key: + handler: memcache or file + cache_dir: if file cache, use cache folder (default tmp) + memcache_server: memcache server IP address + memcache_user: memcache username + + dev_token is a required developer token + + attempts is the number of http request attempts to make on failure + """ + if not dev_token: + dev_token = os.environ.get('DIFFBOT_TOKEN', False) + + if not dev_token: + raise Exception("Please provide a dev_token") + + self.dev_token = dev_token + + from handlers import handler + + self._http_handle = handler()(cache_options) + + def http_handler(self): + """Returns the http handler object, which implements handlers.HttpHandler. + Implements a single function, fetch, which has a single argument, the url. + Handler classes wrap Google App Engine urlfetch library and the various + options and exceptions, as well as urllib and urllib2, which will be selected + automatically depending on Python version and environment. + """ + return self._http_handle + + def article(self, url, format = 'json', comments = False, stats = False, dirty_hack = False): + """Make an API request to the DiffBot server to retrieve an article. + + Requires article_url + """ + + api_arguments = { + "token": self.dev_token, + "url": url, + # "tags": '1' + } + if format != 'json': + api_arguments['format'] = format + if comments: + api_arguments['comments'] = True + if stats: + api_arguments['stats'] = True + + api_endpoint = self.api_endpoint_base + 'article' + + response = self.http_handler().get(api_endpoint, api_arguments) + + if response: + try: + article_info = json.loads(response) + except Exception, e: + logging.exception(e) + return False + if not article_info.has_key('tags'): + article_info['tags'] = [] + if dirty_hack: + article_info['raw_response'] = response + else: + article_info['raw_response'] = '' + return article_info + + # logging.info(response) + logging.info('DONE!') + return False - response = self.http_handler().fetch(url) + def follow_add(self, url): + """Make an API request to the DiffBot server to follow a page.""" + api_arguments = { + "token": self.dev_token, + "url": url, + } + + api_endpoint = self.api_endpoint_base + 'add' + + + response = self.http_handler().post(api_endpoint, api_arguments) + if response: + try: + tree = etree.fromstring(response) + add_info = { + 'id': tree.get('id'), + 'new': tree[0].get('new') or False + } + for element in tree[0]: + add_info[element.tag] = element.text + return add_info + except Exception, e: + logging.exception(e) + return False + + # logging.info(response) + logging.info('DONE!') + return False - if response: - try: - de = json.loads(response) - except Exception, e: - logging.exception(e) + def follow_read(self, follow_id): + """ + Make an API request to the DiffBot server to read changes from a page. + Returns the following dictionary: + { + 'info': { + #information about the request + }, + 'items': [] # The page's items returned by the API + """ + api_arguments = {"id": str(follow_id)} + + api_endpoint = self.api_endpoint_base + 'dfs/dml/archive' + + + response = self.http_handler().get(api_endpoint, api_arguments) + if response: + try: + tree = etree.fromstring(response) + read_info = { + 'info': { + 'id': tree[0].get('id'), + 'new': tree[0][0].get('new') or False, + }, + 'items': [] + } + for element in tree[0][0]: + read_info['info'][element.tag] = element.text + for item_element in tree[0].findall('item'): + item = item_element.attrib + for element in item_element: + item[element.tag] = element.text + read_info['items'].append(item) + return read_info + except Exception, e: + logging.exception(e) + return False + + # logging.info(response) + logging.info('DONE!') return False - if not de.has_key('tags'): - de['tags'] = [] - if dirty_hack: - de['raw_response'] = response - else: - de['raw_response'] = '' - return de - - # logging.info(response) - logging.info('DONE!') - return False #--------------------------------------------------------------------------- -# Helper Functions +# Helper Functions #--------------------------------------------------------------------------- def init_logger(level, debug = False): - """Sets the logging level for both the command line client and the - client library - """ - if debug: - log_level = logging.DEBUG - elif level: - log_level = level - else: - log_level = logging.WARNING - - try: - return logging.basicConfig(level=log_level) - except Exception: - return False + """Sets the logging level for both the command line client and the + client library + """ + if debug: + log_level = logging.DEBUG + elif level: + log_level = level + else: + log_level = logging.WARNING + + try: + return logging.basicConfig(level=log_level) + except Exception: + return False def unset_gae(): - # sys.path = [path for path in sys.path if 'site-packages' not in path] - pass + # sys.path = [path for path in sys.path if 'site-packages' not in path] + pass def set_gae(): - a = "/Applications/GoogleAppEngineLauncher.app/Contents/Resources/GoogleAppEn" \ - + "gine-default.bundle/Contents/Resources/google_appengine" - sys.path = sys.path + [os.path.abspath(os.path.realpath(a))] + a = "/Applications/GoogleAppEngineLauncher.app/Contents/Resources/GoogleAppEn" \ + + "gine-default.bundle/Contents/Resources/google_appengine" + sys.path = sys.path + [os.path.abspath(os.path.realpath(a))] #--------------------------------------------------------------------------- # Main command line application function +# +# (Only for article API for now.) #--------------------------------------------------------------------------- - def main(debug = False): - import sys - from optparse import OptionParser, SUPPRESS_HELP - - parser = OptionParser(usage="%prog: [options] [url]") - parser.add_option('-d', '--debug', action='store_const', - const=logging.DEBUG, dest='log_level') - parser.add_option('-v', '--verbose', action='store_const', - const=logging.INFO, dest='log_level') - parser.add_option('-q', '--quiet', action='store_const', - const=logging.CRITICAL, dest='log_level') - parser.add_option('-c', '--cache', choices=['memcache', 'file', 'm', 'f'], - dest='cache', help="Cache (memcache or file)") - parser.add_option('-o', '--output', choices=['html', 'raw', 'json', 'pretty'], - dest='oformat', help="Ouput format (html, raw, json, pretty)") - parser.add_option('-k', dest='key', help="Diffbot developer API key") - - parser.add_option('-t', '--test', - choices=["gae", "nogae", "http", "memcache", "filecache", "h", "m", "f"], - help=SUPPRESS_HELP) - - (options, args) = parser.parse_args() - init_logger(options.log_level, debug) - - if len(args) != 1: - parser.print_help() - sys.exit(-1) - - _url_parsed = urlparse.urlparse(args[0]) - _url = urlparse.urlunparse(_url_parsed) - - if not _url_parsed.netloc or not _url_parsed.scheme: - print "Error: Please enter a valid url (%s)" % _url - sys.exit(-1) - - cache_options = {} - - if options.test == 'gae': - set_gae() - elif options.test == 'nogae': - unset_gae() - elif options.test == 'memcache' or options.test == 'm': - logging.info("Testing memcache") - - if options.cache == 'm' or options.cache == 'memcache': - cache_options['handler'] = 'memcache' - elif options.cache == 'f' or options.cache == 'file': - cache_options['handler'] = 'file' - # cache_options = {'handler': 'memcache'} - - try: - db = DiffBot(cache_options) - article = db.get_article(_url) - except Exception, e: - print "Error: ", e - exit(-1) - - # Output document based on options - if options.output == 'raw': - print article - elif options.output == 'json': - from pprint import pprint - pprint(article) - else: - print article + import sys + from optparse import OptionParser, SUPPRESS_HELP + + parser = OptionParser(usage="%prog: [options] [url]") + parser.add_option('-d', '--debug', action='store_const', + const=logging.DEBUG, dest='log_level') + parser.add_option('-v', '--verbose', action='store_const', + const=logging.INFO, dest='log_level') + parser.add_option('-q', '--quiet', action='store_const', + const=logging.CRITICAL, dest='log_level') + parser.add_option('-c', '--cache', choices=['memcache', 'file', 'm', 'f'], + dest='cache', help="Cache (memcache or file)") + parser.add_option('-o', '--output', choices=['html', 'raw', 'json', 'pretty'], + dest='oformat', help="Ouput format (html, raw, json, pretty)") + parser.add_option('-k', dest='key', help="Diffbot developer API key") + + parser.add_option('-t', '--test', + choices=["gae", "nogae", "http", "memcache", "filecache", "h", "m", "f"], + help=SUPPRESS_HELP) + + (options, args) = parser.parse_args() + init_logger(options.log_level, debug) + + if len(args) != 1: + parser.print_help() + sys.exit(-1) + + _url_parsed = urlparse.urlparse(args[0]) + _url = urlparse.urlunparse(_url_parsed) + + if not _url_parsed.netloc or not _url_parsed.scheme: + print "Error: Please enter a valid url (%s)" % _url + sys.exit(-1) + + cache_options = {} + + if options.test == 'gae': + set_gae() + elif options.test == 'nogae': + unset_gae() + elif options.test == 'memcache' or options.test == 'm': + logging.info("Testing memcache") + + if options.cache == 'm' or options.cache == 'memcache': + cache_options['handler'] = 'memcache' + elif options.cache == 'f' or options.cache == 'file': + cache_options['handler'] = 'file' + # cache_options = {'handler': 'memcache'} + + try: + db = DiffBot(cache_options) + article = db.article(_url) + except Exception, e: + print "Error: ", e + exit(-1) + + # Output document based on options + if options.output == 'raw': + print article + elif options.output == 'json': + from pprint import pprint + pprint(article) + else: + print article if __name__ == "__main__": - main(os.environ.get('DIFFBOT_DEBUG', False)) + main(os.environ.get('DIFFBOT_DEBUG', False)) diff --git a/diffbot/handlers.py b/diffbot/handlers.py index 7f032e5..501911f 100644 --- a/diffbot/handlers.py +++ b/diffbot/handlers.py @@ -2,20 +2,20 @@ # -*- coding: utf-8 -*- # vim:ts=4:sw=4:expandtab """ - py-diffbot - http_client.py + py-diffbot - http_client.py - Python http client library to support Google App Engine urlfetch, urllib - and urllib2 + Python http client library to support Google App Engine urlfetch, urllib + and urllib2 - This source file is subject to the new BSD license that is bundled with this - package in the file LICENSE.txt. The license is also available online at the - URL: + This source file is subject to the new BSD license that is bundled with this + package in the file LICENSE.txt. The license is also available online at the + URL: - :copyright: Copyright (C) 2011 Nik Cubrilovic and others, see AUTHORS - :license: new BSD, see LICENSE for more details. + :copyright: Copyright (C) 2011 Nik Cubrilovic and others, see AUTHORS + :license: new BSD, see LICENSE for more details. """ -__version__ = '0.0.1' +__version__ = '0.0.2' __author__ = 'Nik Cubrilovic ' GAE = True @@ -23,110 +23,119 @@ import logging try: - from google.appengine.api import urlfetch + from google.appengine.api import urlfetch except ImportError: - GAE = False + GAE = False import urllib from cache import handler as cache_handler class HttpHandler(object): - """docstring for HttpClient""" - _req_headers = { - "User-Agent": "py-diffbot v0.0.1 <+http://bitbucket.org/nik/py-diffbot>" - } - _req_attempts = 3 + _req_headers = { + "User-Agent": "py-diffbot v0.0.2 <+http://bitbucket.org/nik/py-diffbot>" + } + _req_attempts = 3 - def __init__(self, cache_options = None): - """docstring for __init__""" - self._cache_handle = cache_handler(cache_options) - if self._cache_handle: - self.fetch = self._cache_handle.wrap(self.fetch) + def __init__(self, cache_options = None): + """docstring for __init__""" + self._cache_handle = cache_handler(cache_options) + if self._cache_handle: + self.get = self._cache_handle.wrap(self.get) + self.post = self._cache_handle.wrap(self.post) - def cache_handler(self): - return self._cache_handle + def cache_handler(self): + return self._cache_handle - def __get__(self, **kwargs): - logging.debug("Called __call__ with:") - logging.debug(**kwargs) + def __get__(self, **kwargs): + logging.debug("Called __call__ with:") + logging.debug(**kwargs) + def get(self, url, data): + return self.fetch(url, data, 'GET') + + def post(self, url, data): + return self.fetch(url, data, 'POST') + +# TODO Somebody who knows the Appengine API should fix this to function the same way as the urllib version class UrlfetchHandler(HttpHandler): - """docstring for UrlFetchHttpClient""" - - def fetch(self, url): - attempt = 1 - result = None - self._req_headers['Connection'] = 'Close' - - while attempt <= self._req_attempts: - try: - result = urlfetch.fetch( - url, - method = urlfetch.GET, - headers = self._req_headers, - deadline = 20 - ) - except urlfetch.DownloadError, e: - logging.info("DiffBot: (Download Attempt [%d/%d]) DownloadError: Download timed out" - % (attempt, self._req_attempts)) - attempt += 1 - except Exception, e: - logging.exception("Diffbot: Exception: %s" % e.message) - logging.exception("Diffbot: Exceeded number of attempts allowed") - return False - if result: - if result.status_code == 200: - return result.content.decode('UTF-8') + def fetch(self, url, data, method): + attempt = 1 + result = None + self._req_headers['Connection'] = 'Close' + + while attempt <= self._req_attempts: + try: + result = urlfetch.fetch( + url, + method = urlfetch.GET, + headers = self._req_headers, + deadline = 20 + ) + except urlfetch.DownloadError, e: + logging.info("DiffBot: (Download Attempt [%d/%d]) DownloadError: Download timed out" + % (attempt, self._req_attempts)) + attempt += 1 + except Exception, e: + logging.exception("Diffbot: Exception: %s" % e.message) + logging.exception("Diffbot: Exceeded number of attempts allowed") + return False + + if result: + if result.status_code == 200: + return result.content.decode('UTF-8') + + return False - return False class UrllibHandler(HttpHandler): - """ docstring """ - def fetch(self, url): - result = None + def fetch(self, url, data, method): + assert method in ['GET', 'POST'] - try: - fh = urllib.urlopen(url) - if fh.getcode() != 200: - logging.error("urllib http request returned status code: %s" % fh.getcode()) - return False - result = fh.read().decode('UTF-8') - except Exception, e: - logging.exception("urllib error: %s", str(e)) - return False + result = None + + try: + if method == 'GET': + fh = urllib.urlopen(url + '?' + urllib.urlencode(data)) + elif method == 'POST': + fh = urllib.urlopen(url, urllib.urlencode(data)) + if fh.getcode() != 200: + logging.error("urllib http request returned status code: %s" % fh.getcode()) + return False + result = fh.read().decode('UTF-8') + except Exception, e: + logging.exception("urllib error: %s", str(e)) + return False - return result + return result # TODO implement this? class Urllib2Handler(HttpHandler): - """docstring for UrllibHttpClient""" - - def fetch(self, url): - import urllib2 - - try: - request = urllib2.Request( - url, - # api_args, - # headers - ) - handle = urllib2.urlopen(request) - response = handle.read() - return response - - except urllib2.URLError, e: - logging.exception(e) - return False + def fetch(self, url, data, method): + import urllib2 + + try: + request = urllib2.Request( + url, + # api_args, + # headers + ) + handle = urllib2.urlopen(request) + response = handle.read() + return response + + except urllib2.URLError, e: + logging.exception(e) + return False # TODO implement options # TODO return an instance rather than the class (ie. pass and wrap options) and -# select the class to use based on the options (same as cache.py) +# TODO select the class to use based on the options (same as cache.py) def handler(options = None): - """return a valid HTTP handler for the request""" - if GAE: - return UrlfetchHandler - return UrllibHandler + """return a valid HTTP handler for the request""" + if GAE: + return UrlfetchHandler + return UrllibHandler diff --git a/setup.py b/setup.py index 5e1ded9..44552ed 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ from distutils.core import setup setup(name='py-diffbot', - version='0.0.1', + version='0.0.2', py_modules=['urllib'], ) \ No newline at end of file