From 8764f745ad93bce67315c405bcab98eef191333a Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 15:16:55 +0300 Subject: [PATCH 01/15] Move proxy logic into a separate module --- scrapinghub/client/activity.py | 2 +- scrapinghub/client/collections.py | 5 +- scrapinghub/client/frontiers.py | 3 +- scrapinghub/client/items.py | 2 +- scrapinghub/client/jobs.py | 5 +- scrapinghub/client/logs.py | 2 +- scrapinghub/client/projects.py | 3 +- scrapinghub/client/proxy.py | 178 ++++++++++++++++++++++++++++++ scrapinghub/client/requests.py | 2 +- scrapinghub/client/samples.py | 2 +- scrapinghub/client/utils.py | 174 ----------------------------- tests/client/test_utils.py | 2 +- 12 files changed, 192 insertions(+), 188 deletions(-) create mode 100644 scrapinghub/client/proxy.py diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index bb492fed..b888a6e9 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _Proxy from .utils import parse_job_key diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index c135bd39..868fdc5d 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,9 +5,8 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .utils import ( - _Proxy, format_iter_filters, proxy_methods, wrap_kwargs, update_kwargs, -) +from .proxy import _Proxy, proxy_methods, wrap_kwargs, format_iter_filters +from .utils import update_kwargs class Collections(_Proxy): diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 052774ea..bf2ae721 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -7,7 +7,8 @@ from ..hubstorage.frontier import Frontier as _Frontier from ..hubstorage.utils import urlpathjoin -from .utils import _Proxy, update_kwargs +from .proxy import _Proxy +from .utils import update_kwargs class _HSFrontier(_Frontier): diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 45329e17..56069344 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _Proxy class Items(_Proxy): diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index b72feed2..a022e85e 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -11,9 +11,8 @@ from .requests import Requests from .samples import Samples from .exceptions import NotFound, BadRequest, DuplicateJobError -from .utils import ( - _MappingProxy, get_tags_for_update, parse_job_key, update_kwargs, -) +from .proxy import _MappingProxy +from .utils import get_tags_for_update, parse_job_key, update_kwargs class Jobs(object): diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 6771607c..61efecce 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import json -from .utils import _Proxy +from .proxy import _Proxy from .utils import LogLevel diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index cbb3e33f..35f93eae 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -8,8 +8,9 @@ from .collections import Collections from .frontiers import _HSFrontier, Frontiers from .jobs import Jobs +from .proxy import _MappingProxy 
from .spiders import Spiders -from .utils import _MappingProxy, parse_project_id +from .utils import parse_project_id class Projects(object): diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py new file mode 100644 index 00000000..e0e6a8ce --- /dev/null +++ b/scrapinghub/client/proxy.py @@ -0,0 +1,178 @@ +from __future__ import absolute_import + +import six +import json + +from ..hubstorage.resourcetype import DownloadableResource +from ..hubstorage.resourcetype import ItemsResourceType +from ..hubstorage.collectionsrt import Collections + +from .exceptions import wrap_value_too_large + + +class _Proxy(object): + """A helper to create a class instance and proxy its methods to origin. + + The internal proxy class is useful to link class attributes from its + origin depending on the origin base class as a part of init logic: + + - :class:`~scrapinghub.hubstorage.resourcetype.ItemsResourceType` provides + items-based attributes to access items in an arbitrary collection with + get/write/flush/close/stats/iter methods. + + - :class:`~scrapinghub.hubstorage.resourcetype.DownloadableResource` provides + download-based attributes to iter through collection with or without + msgpack support. + """ + + def __init__(self, cls, client, key): + self.key = key + self._client = client + self._origin = cls(client._hsclient, key) + + if issubclass(cls, ItemsResourceType): + self._proxy_methods(['get', 'write', 'flush', 'close', + 'stats', ('iter', 'list')]) + # redefine write method to wrap hubstorage.ValueTooLarge error + origin_method = getattr(self, 'write') + setattr(self, 'write', wrap_value_too_large(origin_method)) + + # DType iter_values() has more priority than IType list() + # plus Collections interface doesn't need the iter methods + if issubclass(cls, DownloadableResource) and cls is not Collections: + methods = [('iter', 'iter_values'), + ('iter_raw_msgpack', 'iter_msgpack'), + ('iter_raw_json', 'iter_json')] + self._proxy_methods(methods) + self._wrap_iter_methods([method[0] for method in methods]) + + def _proxy_methods(self, methods): + """A little helper for cleaner interface.""" + proxy_methods(self._origin, self, methods) + + def _wrap_iter_methods(self, methods): + """Modify kwargs for all passed self.iter* methods.""" + for method in methods: + wrapped = wrap_kwargs(getattr(self, method), + self._modify_iter_params) + setattr(self, method, wrapped) + + def _modify_iter_params(self, params): + """A helper to modify iter() params on-the-fly. + + The method is internal and should be redefined in subclasses. + + :param params: a dictionary with input parameters. + :return: an updated dictionary with parameters. + :rtype: :class:`dict` + """ + return format_iter_filters(params) + + def list(self, *args, **kwargs): + """Convenient shortcut to list iter results. + + Please note that :meth:`list` method can use a lot of memory and for a + large amount of elements it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for both + methods). + """ + return list(self.iter(*args, **kwargs)) + + +class _MappingProxy(_Proxy): + """A helper class to support basic get/set interface for dict-like + collections of elements. + """ + + def get(self, key): + """Get element value by key. + + :param key: a string key + """ + return next(self._origin.apiget(key)) + + def set(self, key, value): + """Set element value. 
+ + :param key: a string key + :param value: new value to set for the key + """ + self._origin.apipost(key, data=json.dumps(value), is_idempotent=True) + + def update(self, values): + """Update multiple elements at once. + + The method provides convenient interface for partial updates. + + :param values: a dictionary with key/values to update. + """ + if not isinstance(values, dict): + raise TypeError("values should be a dict") + data = next(self._origin.apiget()) + data.update(values) + self._origin.apipost(jl={k: v for k, v in six.iteritems(data) + if k not in self._origin.ignore_fields}, + is_idempotent=True) + + def delete(self, key): + """Delete element by key. + + :param key: a string key + """ + self._origin.apidelete(key) + + def iter(self): + """Iterate through key/value pairs. + + :return: an iterator over key/value pairs. + :rtype: :class:`collections.Iterable` + """ + return six.iteritems(next(self._origin.apiget())) + + +def proxy_methods(origin, successor, methods): + """A helper to proxy methods from origin to successor. + + Accepts a list with strings and tuples: + + - each string defines: + a successor method name to proxy 1:1 with origin method + - each tuple should consist of 2 strings: + a successor method name and an origin method name + """ + for method in methods: + if isinstance(method, tuple): + successor_name, origin_name = method + else: + successor_name, origin_name = method, method + if not hasattr(successor, successor_name): + setattr(successor, successor_name, getattr(origin, origin_name)) + + +def format_iter_filters(params): + """Format iter() filter param on-the-fly. + + Support passing multiple filters at once as a list with tuples. + """ + filters = params.get('filter') + if filters and isinstance(filters, list): + filter_data = [] + for elem in params.pop('filter'): + if isinstance(elem, six.string_types): + filter_data.append(elem) + elif isinstance(elem, (list, tuple)): + filter_data.append(json.dumps(elem)) + else: + raise ValueError( + "Filter condition must be string, tuple or list") + if filter_data: + params['filter'] = filter_data + return params + + +def wrap_kwargs(fn, kwargs_fn): + """Tiny wrapper to prepare modified version of function kwargs""" + def wrapped(*args, **kwargs): + kwargs = kwargs_fn(kwargs) + return fn(*args, **kwargs) + return wrapped diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index e07cd36e..2f9a217c 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _Proxy class Requests(_Proxy): diff --git a/scrapinghub/client/samples.py b/scrapinghub/client/samples.py index 828677d2..e966abfe 100644 --- a/scrapinghub/client/samples.py +++ b/scrapinghub/client/samples.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _Proxy class Samples(_Proxy): diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index 405eafde..78a51292 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -8,12 +8,6 @@ import six -from ..hubstorage.resourcetype import DownloadableResource -from ..hubstorage.resourcetype import ItemsResourceType -from ..hubstorage.collectionsrt import Collections - -from .exceptions import wrap_value_too_large - class LogLevel(object): DEBUG = logging.DEBUG @@ -84,174 +78,6 @@ def get_tags_for_update(**kwargs): return params -class _Proxy(object): - """A helper to create a class 
instance and proxy its methods to origin. - - The internal proxy class is useful to link class attributes from its - origin depending on the origin base class as a part of init logic: - - - :class:`~scrapinghub.hubstorage.resourcetype.ItemsResourceType` provides - items-based attributes to access items in an arbitrary collection with - get/write/flush/close/stats/iter methods. - - - :class:`~scrapinghub.hubstorage.resourcetype.DownloadableResource` provides - download-based attributes to iter through collection with or without - msgpack support. - """ - - def __init__(self, cls, client, key): - self.key = key - self._client = client - self._origin = cls(client._hsclient, key) - - if issubclass(cls, ItemsResourceType): - self._proxy_methods(['get', 'write', 'flush', 'close', - 'stats', ('iter', 'list')]) - # redefine write method to wrap hubstorage.ValueTooLarge error - origin_method = getattr(self, 'write') - setattr(self, 'write', wrap_value_too_large(origin_method)) - - # DType iter_values() has more priority than IType list() - # plus Collections interface doesn't need the iter methods - if issubclass(cls, DownloadableResource) and cls is not Collections: - methods = [('iter', 'iter_values'), - ('iter_raw_msgpack', 'iter_msgpack'), - ('iter_raw_json', 'iter_json')] - self._proxy_methods(methods) - self._wrap_iter_methods([method[0] for method in methods]) - - def _proxy_methods(self, methods): - """A little helper for cleaner interface.""" - proxy_methods(self._origin, self, methods) - - def _wrap_iter_methods(self, methods): - """Modify kwargs for all passed self.iter* methods.""" - for method in methods: - wrapped = wrap_kwargs(getattr(self, method), - self._modify_iter_params) - setattr(self, method, wrapped) - - def _modify_iter_params(self, params): - """A helper to modify iter() params on-the-fly. - - The method is internal and should be redefined in subclasses. - - :param params: a dictionary with input parameters. - :return: an updated dictionary with parameters. - :rtype: :class:`dict` - """ - return format_iter_filters(params) - - def list(self, *args, **kwargs): - """Convenient shortcut to list iter results. - - Please note that :meth:`list` method can use a lot of memory and for a - large amount of elements it's recommended to iterate through it via - :meth:`iter` method (all params and available filters are same for both - methods). - """ - return list(self.iter(*args, **kwargs)) - - -class _MappingProxy(_Proxy): - """A helper class to support basic get/set interface for dict-like - collections of elements. - """ - - def get(self, key): - """Get element value by key. - - :param key: a string key - """ - return next(self._origin.apiget(key)) - - def set(self, key, value): - """Set element value. - - :param key: a string key - :param value: new value to set for the key - """ - self._origin.apipost(key, data=json.dumps(value), is_idempotent=True) - - def update(self, values): - """Update multiple elements at once. - - The method provides convenient interface for partial updates. - - :param values: a dictionary with key/values to update. - """ - if not isinstance(values, dict): - raise TypeError("values should be a dict") - data = next(self._origin.apiget()) - data.update(values) - self._origin.apipost(jl={k: v for k, v in six.iteritems(data) - if k not in self._origin.ignore_fields}, - is_idempotent=True) - - def delete(self, key): - """Delete element by key. 
- - :param key: a string key - """ - self._origin.apidelete(key) - - def iter(self): - """Iterate through key/value pairs. - - :return: an iterator over key/value pairs. - :rtype: :class:`collections.Iterable` - """ - return six.iteritems(next(self._origin.apiget())) - - -def wrap_kwargs(fn, kwargs_fn): - """Tiny wrapper to prepare modified version of function kwargs""" - def wrapped(*args, **kwargs): - kwargs = kwargs_fn(kwargs) - return fn(*args, **kwargs) - return wrapped - - -def proxy_methods(origin, successor, methods): - """A helper to proxy methods from origin to successor. - - Accepts a list with strings and tuples: - - - each string defines: - a successor method name to proxy 1:1 with origin method - - each tuple should consist of 2 strings: - a successor method name and an origin method name - """ - for method in methods: - if isinstance(method, tuple): - successor_name, origin_name = method - else: - successor_name, origin_name = method, method - if not hasattr(successor, successor_name): - setattr(successor, successor_name, getattr(origin, origin_name)) - - -def format_iter_filters(params): - """Format iter() filter param on-the-fly. - - Support passing multiple filters at once as a list with tuples. - """ - filters = params.get('filter') - if filters and isinstance(filters, list): - filter_data = [] - for elem in params.pop('filter'): - if isinstance(elem, six.string_types): - filter_data.append(elem) - elif isinstance(elem, (list, tuple)): - filter_data.append(json.dumps(elem)) - else: - raise ValueError( - "Filter condition must be string, tuple or list") - if filter_data: - params['filter'] = filter_data - return params - - def update_kwargs(kwargs, **params): """Update kwargs dict with non-empty params with json-encoded values.""" kwargs.update({k: json.dumps(v) if isinstance(v, dict) else v diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 03e4362c..57787ccd 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -5,7 +5,7 @@ import mock from scrapinghub.client.utils import parse_auth -from scrapinghub.client.utils import format_iter_filters +from scrapinghub.client.proxy import format_iter_filters def test_format_iter_filters(): From cc0453d7f4355ebdf83ccfbd9981e2dfbd7bd81d Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 17:30:49 +0300 Subject: [PATCH 02/15] Convert proxy logic to subclasses system --- scrapinghub/client/activity.py | 8 ++- scrapinghub/client/collections.py | 90 ++++++++++++++---------- scrapinghub/client/frontiers.py | 7 +- scrapinghub/client/items.py | 4 +- scrapinghub/client/logs.py | 27 +++++-- scrapinghub/client/proxy.py | 112 ++++++++++++------------------ scrapinghub/client/requests.py | 10 +-- scrapinghub/client/samples.py | 4 +- 8 files changed, 139 insertions(+), 123 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index b888a6e9..5cc6c3ee 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .proxy import _Proxy +from .proxy import _Proxy, format_iter_filters from .utils import parse_job_key @@ -46,8 +46,10 @@ class Activity(_Proxy): """ def __init__(self, *args, **kwargs): super(Activity, self).__init__(*args, **kwargs) - self._proxy_methods([('iter', 'list')]) - self._wrap_iter_methods(['iter']) + + def iter(self, **params): + params = format_iter_filters(params) + return self._origin.list(**params) def add(self, values, **kwargs): """Add new 
event to the project activity. diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 868fdc5d..521cc008 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,11 +5,11 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .proxy import _Proxy, proxy_methods, wrap_kwargs, format_iter_filters +from .proxy import _Proxy, _DownloadableProxyMixin, format_iter_filters from .utils import update_kwargs -class Collections(_Proxy): +class Collections(_Proxy, _DownloadableProxyMixin): """Access to project collections. Not a public constructor: use :class:`~scrapinghub.client.projects.Project` @@ -144,42 +144,6 @@ class Collection(object): def __init__(self, client, collections, type_, name): self._client = client self._origin = _Collection(type_, name, collections._origin) - proxy_methods(self._origin, self, [ - 'create_writer', 'count', - ('iter', 'iter_values'), - ('iter_raw_json', 'iter_json'), - ]) - # simplified version of _Proxy._wrap_iter_methods logic - # to provide better support for filter param in iter methods - for method in ['iter', 'iter_raw_json']: - wrapped = wrap_kwargs(getattr(self, method), format_iter_filters) - setattr(self, method, wrapped) - - def list(self, key=None, prefix=None, prefixcount=None, startts=None, - endts=None, requests_params=None, **params): - """Convenient shortcut to list iter results. - - Please note that :meth:`list` method can use a lot of memory and for a - large amount of logs it's recommended to iterate through it - via :meth:`iter` method (all params and available filters are same for - both methods). - - :param key: a string key or a list of keys to filter with. - :param prefix: a string prefix to filter items. - :param prefixcount: maximum number of values to return per prefix. - :param startts: UNIX timestamp at which to begin results. - :param endts: UNIX timestamp at which to end results. - :param requests_params: (optional) a dict with optional requests params. - :param \*\*params: (optional) additional query params for the request. - :return: a list of items where each item is represented with a dict. - :rtype: :class:`list[dict]` - """ - # FIXME there should be similar docstrings for iter/iter_raw_json - # but as we proxy them as-is, it's not in place, should be improved - update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, - startts=startts, endts=endts, - requests_params=requests_params) - return list(self.iter(requests_params=None, **params)) def get(self, key, **params): """Get item from collection by key. 
@@ -215,6 +179,28 @@ def delete(self, keys): "object providing string keys") self._origin.delete(keys) + def count(self, *args, **kwargs): + return self._origin._collections.count( + self._origin.coltype, self._origin.colname, *args, **kwargs) + + def iter(self, key=None, prefix=None, prefixcount=None, startts=None, + endts=None, requests_params=None, **params): + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts, + requests_params=requests_params) + params = format_iter_filters(params) + return self._origin._collections.iter_values( + self._origin.coltype, self._origin.colname, **params) + + def iter_raw_json(self, key=None, prefix=None, prefixcount=None, + startts=None, endts=None, requests_params=None, **params): + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts, + requests_params=requests_params) + params = format_iter_filters(params) + return self._origin._collections.iter_json( + self._origin.coltype, self._origin.colname, **params) + def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): @@ -234,5 +220,33 @@ def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) + params = format_iter_filters(params) return self._origin._collections.iter_msgpack( self._origin.coltype, self._origin.colname, **params) + + def list(self, key=None, prefix=None, prefixcount=None, startts=None, + endts=None, requests_params=None, **params): + """Convenient shortcut to list iter results. + + Please note that :meth:`list` method can use a lot of memory and for a + large amount of logs it's recommended to iterate through it + via :meth:`iter` method (all params and available filters are same for + both methods). + + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. + :return: a list of items where each item is represented with a dict. + :rtype: :class:`list[dict]` + """ + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts) + return list(self.iter(requests_params=requests_params, **params)) + + def create_writer(self, **kwargs): + return self._origin._collections.create_writer( + self._origin.coltype, self._origin.colname, **kwargs) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index bf2ae721..e9d4db96 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -85,7 +85,6 @@ class Frontiers(_Proxy): """ def __init__(self, *args, **kwargs): super(Frontiers, self).__init__(*args, **kwargs) - self._proxy_methods(['close', 'flush']) def get(self, name): """Get a frontier by name. @@ -121,6 +120,12 @@ def newcount(self): """ return sum(self._origin.newcount.values()) + def flush(self): + self._origin.flush() + + def close(self): + self._origin.close() + class Frontier(object): """Representation of a frontier object. 
diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 56069344..45d0e7d1 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -1,9 +1,9 @@ from __future__ import absolute_import -from .proxy import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin -class Items(_Proxy): +class Items(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job items. Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instanc diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 61efecce..8b53356a 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -1,11 +1,13 @@ from __future__ import absolute_import + import json +import logging -from .proxy import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin from .utils import LogLevel -class Logs(_Proxy): +class Logs(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job logs. Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance @@ -47,11 +49,24 @@ class Logs(_Proxy): 'time': 1486375511188, }] """ + def log(self, message, level=logging.INFO, ts=None, **other): + self._origin.log(message, level=level, ts=ts, **other) + + def debug(self, message, **other): + self._origin.debug(message, **other) + + def info(self, message, **other): + self._origin.info(message, **other) + + def warn(self, message, **other): + self._origin.warn(message, **other) + warning = warn + + def error(self, message, **other): + self._origin.error(message, **other) - def __init__(self, *args, **kwargs): - super(Logs, self).__init__(*args, **kwargs) - self._proxy_methods(['log', 'debug', 'info', 'warning', 'warn', - 'error', 'batch_write_start']) + def batch_write_start(self): + return self._origin.batch_write_start() def _modify_iter_params(self, params): """Modify iter() filters on-the-fly. 
diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index e0e6a8ce..2d2e26da 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -3,10 +3,6 @@ import six import json -from ..hubstorage.resourcetype import DownloadableResource -from ..hubstorage.resourcetype import ItemsResourceType -from ..hubstorage.collectionsrt import Collections - from .exceptions import wrap_value_too_large @@ -30,35 +26,18 @@ def __init__(self, cls, client, key): self._client = client self._origin = cls(client._hsclient, key) - if issubclass(cls, ItemsResourceType): - self._proxy_methods(['get', 'write', 'flush', 'close', - 'stats', ('iter', 'list')]) - # redefine write method to wrap hubstorage.ValueTooLarge error - origin_method = getattr(self, 'write') - setattr(self, 'write', wrap_value_too_large(origin_method)) - - # DType iter_values() has more priority than IType list() - # plus Collections interface doesn't need the iter methods - if issubclass(cls, DownloadableResource) and cls is not Collections: - methods = [('iter', 'iter_values'), - ('iter_raw_msgpack', 'iter_msgpack'), - ('iter_raw_json', 'iter_json')] - self._proxy_methods(methods) - self._wrap_iter_methods([method[0] for method in methods]) - - def _proxy_methods(self, methods): - """A little helper for cleaner interface.""" - proxy_methods(self._origin, self, methods) - - def _wrap_iter_methods(self, methods): - """Modify kwargs for all passed self.iter* methods.""" - for method in methods: - wrapped = wrap_kwargs(getattr(self, method), - self._modify_iter_params) - setattr(self, method, wrapped) + def list(self, *args, **kwargs): + """Convenient shortcut to list iter results. + + Please note that :meth:`list` method can use a lot of memory and for a + large amount of elements it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for both + methods). + """ + return list(self.iter(*args, **kwargs)) def _modify_iter_params(self, params): - """A helper to modify iter() params on-the-fly. + """A helper to modify iter*() params on-the-fly. The method is internal and should be redefined in subclasses. @@ -68,15 +47,43 @@ def _modify_iter_params(self, params): """ return format_iter_filters(params) - def list(self, *args, **kwargs): - """Convenient shortcut to list iter results. - Please note that :meth:`list` method can use a lot of memory and for a - large amount of elements it's recommended to iterate through it via - :meth:`iter` method (all params and available filters are same for both - methods). 
- """ - return list(self.iter(*args, **kwargs)) +class _ItemsResourceProxy(_Proxy): + + def get(self, _key, **params): + return self._origin.get(_key, **params) + + @wrap_value_too_large + def write(self, item): + return self._origin.write(item) + + def iter(self, _key=None, **params): + params = self._modify_iter_params(params) + return self._origin.list(_key, **params) + + def flush(self): + self._origin.flush() + + def stats(self): + return self._origin.stats() + + def close(self, block=True): + self._origin.close(block) + + +class _DownloadableProxyMixin(object): + + def iter(self, _path=None, requests_params=None, **apiparams): + apiparams = self._modify_iter_params(apiparams) + return self._origin.iter_values(_path, requests_params, **apiparams) + + def iter_raw_json(self, _path=None, requests_params=None, **apiparams): + apiparams = self._modify_iter_params(apiparams) + return self._origin.iter_json(_path, requests_params, **apiparams) + + def iter_raw_msgpack(self, _path=None, requests_params=None, **apiparams): + apiparams = self._modify_iter_params(apiparams) + return self._origin.iter_msgpack(_path, requests_params, **apiparams) class _MappingProxy(_Proxy): @@ -130,25 +137,6 @@ def iter(self): return six.iteritems(next(self._origin.apiget())) -def proxy_methods(origin, successor, methods): - """A helper to proxy methods from origin to successor. - - Accepts a list with strings and tuples: - - - each string defines: - a successor method name to proxy 1:1 with origin method - - each tuple should consist of 2 strings: - a successor method name and an origin method name - """ - for method in methods: - if isinstance(method, tuple): - successor_name, origin_name = method - else: - successor_name, origin_name = method, method - if not hasattr(successor, successor_name): - setattr(successor, successor_name, getattr(origin, origin_name)) - - def format_iter_filters(params): """Format iter() filter param on-the-fly. @@ -168,11 +156,3 @@ def format_iter_filters(params): if filter_data: params['filter'] = filter_data return params - - -def wrap_kwargs(fn, kwargs_fn): - """Tiny wrapper to prepare modified version of function kwargs""" - def wrapped(*args, **kwargs): - kwargs = kwargs_fn(kwargs) - return fn(*args, **kwargs) - return wrapped diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 2f9a217c..8a6808f1 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -1,9 +1,9 @@ from __future__ import absolute_import -from .proxy import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin -class Requests(_Proxy): +class Requests(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job requests. 
Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance @@ -41,6 +41,6 @@ class Requests(_Proxy): 'url': 'https://example.com' }] """ - def __init__(self, *args, **kwargs): - super(Requests, self).__init__(*args, **kwargs) - self._proxy_methods(['add']) + def add(self, url, status, method, rs, parent, duration, ts, fp=None): + return self._origin.add( + url, status, method, rs, parent, duration, ts, fp=None) diff --git a/scrapinghub/client/samples.py b/scrapinghub/client/samples.py index e966abfe..87a8e9bc 100644 --- a/scrapinghub/client/samples.py +++ b/scrapinghub/client/samples.py @@ -1,9 +1,9 @@ from __future__ import absolute_import -from .proxy import _Proxy +from .proxy import _ItemsResourceProxy -class Samples(_Proxy): +class Samples(_ItemsResourceProxy): """Representation of collection of job samples. Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance From 54484ee7b24b75dac4212212090417f04b9ee6cd Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 17:44:53 +0300 Subject: [PATCH 03/15] Hide format_iter_filters method --- scrapinghub/client/activity.py | 4 ++-- scrapinghub/client/collections.py | 9 +++++---- scrapinghub/client/proxy.py | 4 ++-- tests/client/test_utils.py | 18 +++++++++--------- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 5cc6c3ee..8aff6f82 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .proxy import _Proxy, format_iter_filters +from .proxy import _Proxy from .utils import parse_job_key @@ -48,7 +48,7 @@ def __init__(self, *args, **kwargs): super(Activity, self).__init__(*args, **kwargs) def iter(self, **params): - params = format_iter_filters(params) + params = self._modify_iter_params(params) return self._origin.list(**params) def add(self, values, **kwargs): diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 521cc008..a7fc9a0d 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,7 +5,7 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .proxy import _Proxy, _DownloadableProxyMixin, format_iter_filters +from .proxy import _Proxy, _DownloadableProxyMixin from .utils import update_kwargs @@ -143,6 +143,7 @@ class Collection(object): def __init__(self, client, collections, type_, name): self._client = client + self._collections = collections self._origin = _Collection(type_, name, collections._origin) def get(self, key, **params): @@ -188,7 +189,7 @@ def iter(self, key=None, prefix=None, prefixcount=None, startts=None, update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) - params = format_iter_filters(params) + params = self._collections._modify_iter_params(params) return self._origin._collections.iter_values( self._origin.coltype, self._origin.colname, **params) @@ -197,7 +198,7 @@ def iter_raw_json(self, key=None, prefix=None, prefixcount=None, update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) - params = format_iter_filters(params) + params = self._collections._modify_iter_params(params) return self._origin._collections.iter_json( self._origin.coltype, self._origin.colname, **params) @@ -220,7 +221,7 @@ def iter_raw_msgpack(self, key=None, prefix=None, 
prefixcount=None, update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) - params = format_iter_filters(params) + params = self._collections._modify_iter_params(params) return self._origin._collections.iter_msgpack( self._origin.coltype, self._origin.colname, **params) diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index 2d2e26da..51708c05 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -45,7 +45,7 @@ def _modify_iter_params(self, params): :return: an updated dictionary with parameters. :rtype: :class:`dict` """ - return format_iter_filters(params) + return _format_iter_filters(params) class _ItemsResourceProxy(_Proxy): @@ -137,7 +137,7 @@ def iter(self): return six.iteritems(next(self._origin.apiget())) -def format_iter_filters(params): +def _format_iter_filters(params): """Format iter() filter param on-the-fly. Support passing multiple filters at once as a list with tuples. diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 57787ccd..201ee78d 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -5,42 +5,42 @@ import mock from scrapinghub.client.utils import parse_auth -from scrapinghub.client.proxy import format_iter_filters +from scrapinghub.client.proxy import _format_iter_filters def test_format_iter_filters(): # work with empty params - assert format_iter_filters({}) == {} + assert _format_iter_filters({}) == {} # doesn't affect other params params = {'a': 123, 'b': 456} - assert format_iter_filters(params) == params + assert _format_iter_filters(params) == params # pass filter as-is if not list params = {'filter': 'some-string'} - assert format_iter_filters(params) == params + assert _format_iter_filters(params) == params # work fine with empty filter params = {'filter': []} - assert format_iter_filters(params) == params + assert _format_iter_filters(params) == params # pass string filters as-is params = {'filter': ['str1', 'str2']} - assert format_iter_filters(params) == params + assert _format_iter_filters(params) == params # converts list-formatted filters params = {'filter': [['field', '>=', ['val']], 'filter2']} - assert (format_iter_filters(params) == + assert (_format_iter_filters(params) == {'filter': ['["field", ">=", ["val"]]', 'filter2']}) # works the same with tuple entries params = {'filter': [('field', '==', ['val'])]} - assert (format_iter_filters(params) == + assert (_format_iter_filters(params) == {'filter': ['["field", "==", ["val"]]']}) # exception if entry is not list/tuple or string with pytest.raises(ValueError): - format_iter_filters({'filter': ['test', 123]}) + _format_iter_filters({'filter': ['test', 123]}) def test_parse_auth_none(): From dc76cb7cb3eb2c35afe5d4e41130aa3132e61488 Mon Sep 17 00:00:00 2001 From: Alexander Chekunkov Date: Tue, 28 Mar 2017 15:56:47 +0100 Subject: [PATCH 04/15] Make exception wrappers private Make wrap_http_errors private and remove wrap_value_too_large because it's not needed any more. 
--- scrapinghub/client/__init__.py | 7 +++---- scrapinghub/client/exceptions.py | 14 +------------- scrapinghub/client/proxy.py | 9 ++++++--- scrapinghub/client/spiders.py | 7 +++---- 4 files changed, 13 insertions(+), 24 deletions(-) diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index a751075c..0cf5d080 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -1,9 +1,8 @@ from scrapinghub import Connection as _Connection from scrapinghub import HubstorageClient as _HubstorageClient +from .exceptions import _wrap_http_errors from .projects import Projects -from .exceptions import wrap_http_errors - from .utils import parse_auth from .utils import parse_project_id, parse_job_key @@ -13,14 +12,14 @@ class Connection(_Connection): - @wrap_http_errors + @_wrap_http_errors def _request(self, *args, **kwargs): return super(Connection, self)._request(*args, **kwargs) class HubstorageClient(_HubstorageClient): - @wrap_http_errors + @_wrap_http_errors def request(self, *args, **kwargs): return super(HubstorageClient, self).request(*args, **kwargs) diff --git a/scrapinghub/client/exceptions.py b/scrapinghub/client/exceptions.py index d79b2eac..6a4b405c 100644 --- a/scrapinghub/client/exceptions.py +++ b/scrapinghub/client/exceptions.py @@ -5,7 +5,6 @@ from requests import HTTPError from ..legacy import APIError -from ..hubstorage import ValueTooLarge as _ValueTooLarge def _get_http_error_msg(exc): @@ -57,7 +56,7 @@ class ServerError(ScrapinghubAPIError): """Indicates some server error: something unexpected has happened.""" -def wrap_http_errors(method): +def _wrap_http_errors(method): """Internal helper to handle exceptions gracefully.""" @wraps(method) def wrapped(*args, **kwargs): @@ -92,14 +91,3 @@ def wrapped(*args, **kwargs): raise ServerError(http_error=exc) raise ScrapinghubAPIError(msg) return wrapped - - -def wrap_value_too_large(method): - """Internal wrapper for ValueTooLarge exception.""" - @wraps(method) - def wrapped(*args, **kwargs): - try: - return method(*args, **kwargs) - except _ValueTooLarge as exc: - raise ValueTooLarge(str(exc)) - return wrapped diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index 51708c05..c83a0d85 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -3,7 +3,8 @@ import six import json -from .exceptions import wrap_value_too_large +from ..hubstorage import ValueTooLarge as _ValueTooLarge +from .exceptions import ValueTooLarge class _Proxy(object): @@ -53,9 +54,11 @@ class _ItemsResourceProxy(_Proxy): def get(self, _key, **params): return self._origin.get(_key, **params) - @wrap_value_too_large def write(self, item): - return self._origin.write(item) + try: + return self._origin.write(item) + except _ValueTooLarge as exc: + raise ValueTooLarge(str(exc)) def iter(self, _key=None, **params): params = self._modify_iter_params(params) diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py index 1d8e4e81..1d665801 100644 --- a/scrapinghub/client/spiders.py +++ b/scrapinghub/client/spiders.py @@ -2,9 +2,8 @@ from requests.compat import urljoin +from .exceptions import NotFound, _wrap_http_errors from .jobs import Jobs -from .exceptions import NotFound -from .exceptions import wrap_http_errors from .utils import get_tags_for_update @@ -104,7 +103,7 @@ def __init__(self, client, project_id, spider_id, spider): self.jobs = Jobs(client, project_id, self) self._client = client - @wrap_http_errors + @_wrap_http_errors def update_tags(self, 
add=None, remove=None): """Update tags for the spider. @@ -118,7 +117,7 @@ def update_tags(self, add=None, remove=None): response = self._client._connection._session.patch(url, json=params) response.raise_for_status() - @wrap_http_errors + @_wrap_http_errors def list_tags(self): """List spider tags. From ac2a05234f8ed42a59f5253ad00d1ad907073b10 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 18:12:59 +0300 Subject: [PATCH 05/15] Add docstrings for proxy module --- scrapinghub/client/proxy.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index c83a0d85..638f3bcb 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -55,36 +55,70 @@ def get(self, _key, **params): return self._origin.get(_key, **params) def write(self, item): + """Write new element to collection.""" try: return self._origin.write(item) except _ValueTooLarge as exc: raise ValueTooLarge(str(exc)) def iter(self, _key=None, **params): + """Iterate over elements in collection. + + :return: a generator object over a list of element dictionaries. + :rtype: :class:`types.GeneratorType[dict]` + """ + # TODO describe allowable params params = self._modify_iter_params(params) return self._origin.list(_key, **params) def flush(self): + """Flush data from writer threads.""" self._origin.flush() def stats(self): + """Get resource stats. + + :return: a dictionary with stats data. + :rtype: :class:`dict` + """ return self._origin.stats() def close(self, block=True): + """Close writers one-by-one.""" self._origin.close(block) class _DownloadableProxyMixin(object): def iter(self, _path=None, requests_params=None, **apiparams): + """A general method to iterate through elements. + + :return: an iterator over elements list. + :rtype: :class:`collections.Iterable` + """ + # TODO describe allowable params apiparams = self._modify_iter_params(apiparams) return self._origin.iter_values(_path, requests_params, **apiparams) def iter_raw_json(self, _path=None, requests_params=None, **apiparams): + """A method to iterate through raw json-packed elements. + Can be convenient if data is needed in raw json format. + + :return: an iterator over elements list packed with json. + :rtype: :class:`collections.Iterable[str]` + """ + # TODO describe allowable params apiparams = self._modify_iter_params(apiparams) return self._origin.iter_json(_path, requests_params, **apiparams) def iter_raw_msgpack(self, _path=None, requests_params=None, **apiparams): + """A method to iterate through raw msgpack-ed elements. + Can be convenient if data is needed in same msgpack format. + + :return: an iterator over elements list packed with msgpack. 
+ :rtype: :class:`collections.Iterable[bytes]` + """ + # TODO describe allowable params apiparams = self._modify_iter_params(apiparams) return self._origin.iter_msgpack(_path, requests_params, **apiparams) From 93582c3a8bdb082a2b98cab3d2d81365239b164b Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 18:38:01 +0300 Subject: [PATCH 06/15] Add missing docstrings --- scrapinghub/client/activity.py | 9 ++++--- scrapinghub/client/collections.py | 45 ++++++++++++++++++++++++++++++- scrapinghub/client/frontiers.py | 2 ++ scrapinghub/client/logs.py | 12 +++++++++ scrapinghub/client/requests.py | 11 ++++++++ 5 files changed, 75 insertions(+), 4 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 8aff6f82..41ef217b 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -44,10 +44,13 @@ class Activity(_Proxy): >>> project.activity.add(events) """ - def __init__(self, *args, **kwargs): - super(Activity, self).__init__(*args, **kwargs) - def iter(self, **params): + """Iterate over activity events. + + :return: a generator object over a list of activity event dicts. + :rtype: :class:`types.GeneratorType[dict]` + """ + # TODO describe allowable params params = self._modify_iter_params(params) return self._origin.list(**params) diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index a7fc9a0d..751f0213 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -181,11 +181,29 @@ def delete(self, keys): self._origin.delete(keys) def count(self, *args, **kwargs): + """Count collection items with a given filters. + + :return: amount of elements in collection. + :rtype: :class:`int` + """ + # TODO describe allowable params return self._origin._collections.count( self._origin.coltype, self._origin.colname, *args, **kwargs) def iter(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): + """A method to iterate through collection items. + + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. + :return: an iterator over items list. + :rtype: :class:`collections.Iterable[dict]` + """ update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) @@ -195,6 +213,19 @@ def iter(self, key=None, prefix=None, prefixcount=None, startts=None, def iter_raw_json(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): + """A method to iterate through json pack-ed items. + Can be convenient if data is needed in the json format. + + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. 
+ :return: an iterator over items list packed with json. + :rtype: :class:`collections.Iterable[str]` + """ update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) @@ -248,6 +279,18 @@ def list(self, key=None, prefix=None, prefixcount=None, startts=None, startts=startts, endts=endts) return list(self.iter(requests_params=requests_params, **params)) - def create_writer(self, **kwargs): + def create_writer(self, start=0, auth=None, size=1000, interval=15, + qsize=None, content_encoding='identity', + maxitemsize=1024 ** 2, callback=None): + """Create a new writer for a collection. + + :return: a new writer object. + :rtype: :class:`~scrapinghub.hubstorage.batchuploader._BatchWriter` + """ + # TODO describe allowable params + kwargs = {} + update_kwargs(start=start, auth=auth, size=size, interval=interval, + qsize=qsize, content_encoding=content_encoding, + maxitemsize=maxitemsize, callback=callback) return self._origin._collections.create_writer( self._origin.coltype, self._origin.colname, **kwargs) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index e9d4db96..a6c2f30e 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -121,9 +121,11 @@ def newcount(self): return sum(self._origin.newcount.values()) def flush(self): + """Flush data in all frontiers writer threads.""" self._origin.flush() def close(self): + """Close frontier writer threads one-by-one.""" self._origin.close() diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 8b53356a..130e5720 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -50,22 +50,34 @@ class Logs(_ItemsResourceProxy, _DownloadableProxyMixin): }] """ def log(self, message, level=logging.INFO, ts=None, **other): + """Base method to write a log entry. + + :param message: a string message. + :param level: (optional) logging level, default to INFO. + :param ts: (optional) unix timestamp in milliseconds. + :param \*\*other: other optional kwargs. + """ self._origin.log(message, level=level, ts=ts, **other) def debug(self, message, **other): + """Log a message with DEBUG level.""" self._origin.debug(message, **other) def info(self, message, **other): + """Log a message with INFO level.""" self._origin.info(message, **other) def warn(self, message, **other): + """Log a message with WARN level.""" self._origin.warn(message, **other) warning = warn def error(self, message, **other): + """Log a message with ERROR level.""" self._origin.error(message, **other) def batch_write_start(self): + """Override to set a start parameter when commencing writing.""" return self._origin.batch_write_start() def _modify_iter_params(self, params): diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 8a6808f1..13acf6e6 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -42,5 +42,16 @@ class Requests(_ItemsResourceProxy, _DownloadableProxyMixin): }] """ def add(self, url, status, method, rs, parent, duration, ts, fp=None): + """ Add a new requests. + + :param url: string url for the request. + :param status: HTTP status of the request. + :param method: stringified request method. + :param rs: response body length. + :param parent: parent request id or ``None``. + :param duration: request duration in milliseconds. + :param ts: unix timestamp in milliseconds. + :param fp: (optional) string fingerprint for the request. 
+ """ return self._origin.add( url, status, method, rs, parent, duration, ts, fp=None) From 22f646f19e59aff98a6b668361ecf7bb7b4359fb Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 19:35:56 +0300 Subject: [PATCH 07/15] Add count param for iter* methods --- scrapinghub/client/activity.py | 7 ++++--- scrapinghub/client/collections.py | 14 ++++++++++++-- scrapinghub/client/proxy.py | 23 +++++++++++++++-------- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 41ef217b..665ee3b6 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from .proxy import _Proxy -from .utils import parse_job_key +from .utils import parse_job_key, update_kwargs class Activity(_Proxy): @@ -44,13 +44,14 @@ class Activity(_Proxy): >>> project.activity.add(events) """ - def iter(self, **params): + def iter(self, count=None, **params): """Iterate over activity events. + :param count: limit amount of elements. :return: a generator object over a list of activity event dicts. :rtype: :class:`types.GeneratorType[dict]` """ - # TODO describe allowable params + update_kwargs(params, count=count) params = self._modify_iter_params(params) return self._origin.list(**params) diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 751f0213..ee7c4bd1 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -284,12 +284,22 @@ def create_writer(self, start=0, auth=None, size=1000, interval=15, maxitemsize=1024 ** 2, callback=None): """Create a new writer for a collection. + :param start: (optional) initial offset for writer thread. + :param auth: (optional) set auth credentials for the request. + :param size: (optional) set initial queue size. + :param interval: (optional) set interval for writer thread. + :param qsize: (optional) setup max queue size for the writer. + :param content_encoding: (optional) set different Content-Encoding header. + :param maxitemsize: (optional) max item size in bytes. + :param callback: (optional) some callback function. :return: a new writer object. :rtype: :class:`~scrapinghub.hubstorage.batchuploader._BatchWriter` + + If provided - calllback shouldn't try to inject more items in the queue, + otherwise it can lead to deadlocks. """ - # TODO describe allowable params kwargs = {} - update_kwargs(start=start, auth=auth, size=size, interval=interval, + update_kwargs(kwargs, start=start, auth=auth, size=size, interval=interval, qsize=qsize, content_encoding=content_encoding, maxitemsize=maxitemsize, callback=callback) return self._origin._collections.create_writer( diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index 638f3bcb..b15af190 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -4,6 +4,7 @@ import json from ..hubstorage import ValueTooLarge as _ValueTooLarge +from .utils import update_kwargs from .exceptions import ValueTooLarge @@ -61,13 +62,14 @@ def write(self, item): except _ValueTooLarge as exc: raise ValueTooLarge(str(exc)) - def iter(self, _key=None, **params): + def iter(self, _key=None, count=None, **params): """Iterate over elements in collection. + :param count: limit amount of elements. :return: a generator object over a list of element dictionaries. 
:rtype: :class:`types.GeneratorType[dict]` """ - # TODO describe allowable params + update_kwargs(params or {}, count=count) params = self._modify_iter_params(params) return self._origin.list(_key, **params) @@ -90,35 +92,40 @@ def close(self, block=True): class _DownloadableProxyMixin(object): - def iter(self, _path=None, requests_params=None, **apiparams): + def iter(self, _path=None, count=None, requests_params=None, **apiparams): """A general method to iterate through elements. + :param count: limit amount of elements. :return: an iterator over elements list. :rtype: :class:`collections.Iterable` """ - # TODO describe allowable params + update_kwargs(apiparams, count=count) apiparams = self._modify_iter_params(apiparams) return self._origin.iter_values(_path, requests_params, **apiparams) - def iter_raw_json(self, _path=None, requests_params=None, **apiparams): + def iter_raw_json(self, _path=None, count=None, requests_params=None, + **apiparams): """A method to iterate through raw json-packed elements. Can be convenient if data is needed in raw json format. + :param count: limit amount of elements. :return: an iterator over elements list packed with json. :rtype: :class:`collections.Iterable[str]` """ - # TODO describe allowable params + update_kwargs(apiparams, count=count) apiparams = self._modify_iter_params(apiparams) return self._origin.iter_json(_path, requests_params, **apiparams) - def iter_raw_msgpack(self, _path=None, requests_params=None, **apiparams): + def iter_raw_msgpack(self, _path=None, count=None, requests_params=None, + **apiparams): """A method to iterate through raw msgpack-ed elements. Can be convenient if data is needed in same msgpack format. + :param count: limit amount of elements. :return: an iterator over elements list packed with msgpack. 
:rtype: :class:`collections.Iterable[bytes]` """ - # TODO describe allowable params + update_kwargs(apiparams, count=count) apiparams = self._modify_iter_params(apiparams) return self._origin.iter_msgpack(_path, requests_params, **apiparams) From 5cd822189cd577526d3a3ff0d88d3fb83f7a57a6 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 19:37:32 +0300 Subject: [PATCH 08/15] Mv proxy tests to a separate module --- tests/client/test_proxy.py | 38 ++++++++++++++++++++++++++++++++++++++ tests/client/test_utils.py | 36 ------------------------------------ 2 files changed, 38 insertions(+), 36 deletions(-) create mode 100644 tests/client/test_proxy.py diff --git a/tests/client/test_proxy.py b/tests/client/test_proxy.py new file mode 100644 index 00000000..7fd4f272 --- /dev/null +++ b/tests/client/test_proxy.py @@ -0,0 +1,38 @@ +import pytest + +from scrapinghub.client.proxy import _format_iter_filters + + +def test_format_iter_filters(): + # work with empty params + assert _format_iter_filters({}) == {} + + # doesn't affect other params + params = {'a': 123, 'b': 456} + assert _format_iter_filters(params) == params + + # pass filter as-is if not list + params = {'filter': 'some-string'} + assert _format_iter_filters(params) == params + + # work fine with empty filter + params = {'filter': []} + assert _format_iter_filters(params) == params + + # pass string filters as-is + params = {'filter': ['str1', 'str2']} + assert _format_iter_filters(params) == params + + # converts list-formatted filters + params = {'filter': [['field', '>=', ['val']], 'filter2']} + assert (_format_iter_filters(params) == + {'filter': ['["field", ">=", ["val"]]', 'filter2']}) + + # works the same with tuple entries + params = {'filter': [('field', '==', ['val'])]} + assert (_format_iter_filters(params) == + {'filter': ['["field", "==", ["val"]]']}) + + # exception if entry is not list/tuple or string + with pytest.raises(ValueError): + _format_iter_filters({'filter': ['test', 123]}) diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 201ee78d..f109894c 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -5,42 +5,6 @@ import mock from scrapinghub.client.utils import parse_auth -from scrapinghub.client.proxy import _format_iter_filters - - -def test_format_iter_filters(): - # work with empty params - assert _format_iter_filters({}) == {} - - # doesn't affect other params - params = {'a': 123, 'b': 456} - assert _format_iter_filters(params) == params - - # pass filter as-is if not list - params = {'filter': 'some-string'} - assert _format_iter_filters(params) == params - - # work fine with empty filter - params = {'filter': []} - assert _format_iter_filters(params) == params - - # pass string filters as-is - params = {'filter': ['str1', 'str2']} - assert _format_iter_filters(params) == params - - # converts list-formatted filters - params = {'filter': [['field', '>=', ['val']], 'filter2']} - assert (_format_iter_filters(params) == - {'filter': ['["field", ">=", ["val"]]', 'filter2']}) - - # works the same with tuple entries - params = {'filter': [('field', '==', ['val'])]} - assert (_format_iter_filters(params) == - {'filter': ['["field", "==", ["val"]]']}) - - # exception if entry is not list/tuple or string - with pytest.raises(ValueError): - _format_iter_filters({'filter': ['test', 123]}) def test_parse_auth_none(): From 290664c0a6ed3f3f16cc044ed4488ff8365813e0 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 10:21:44 +0300 Subject: 
[PATCH 09/15] Add missing docstring for _ItemsResourceProxy.get --- scrapinghub/client/proxy.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index b15af190..ffe99837 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -52,8 +52,14 @@ def _modify_iter_params(self, params): class _ItemsResourceProxy(_Proxy): - def get(self, _key, **params): - return self._origin.get(_key, **params) + def get(self, key, **params): + """Get element from collection. + + :param key: element key. + :return: a dictionary with element data. + :rtype: :class:`dict` + """ + return self._origin.get(key, **params) def write(self, item): """Write new element to collection.""" From 401064a63914520b0ea2322409e24b1b91c80d37 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 10:31:22 +0300 Subject: [PATCH 10/15] Minor fixes, move project settings section down in overview --- docs/client/overview.rst | 52 ++++++++++++++++++------------------- docs/quickstart.rst | 2 +- scrapinghub/client/proxy.py | 5 +++- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index 2b32314f..bc195c3c 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -20,7 +20,8 @@ for access to client projects. Projects -------- -You can list the projects available to your account:: +You can list the :class:`~scrapinghub.client.projects.Projects` available to your +account:: >>> client.projects.list() [123, 456] @@ -67,31 +68,6 @@ For example, to schedule a spider run (it returns a > -Settings --------- - -You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. - -To get a list of the project settings:: - - >>> project.settings.list() - [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] - -To get a project setting value by name:: - - >>> project.settings.get('job_runtime_limit') - 24 - -To update a project setting value by name:: - - >>> project.settings.set('job_runtime_limit', 20) - -Or update a few project settings at once:: - - >>> project.settings.update({'default_job_units': 1, - ... 'job_runtime_limit': 20}) - - Spiders ------- @@ -428,6 +404,30 @@ Or post multiple events at once:: >>> project.activity.add(events) +Settings +-------- + +You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. + +To get a list of the project settings:: + + >>> project.settings.list() + [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] + +To get a project setting value by name:: + + >>> project.settings.get('job_runtime_limit') + 24 + +To update a project setting value by name:: + + >>> project.settings.set('job_runtime_limit', 20) + +Or update a few project settings at once:: + + >>> project.settings.update({'default_job_units': 1, + ... 'job_runtime_limit': 20}) + Collections ----------- diff --git a/docs/quickstart.rst b/docs/quickstart.rst index f484bd35..3fced7e0 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -69,7 +69,7 @@ By default, tests use VCR.py ``once`` mode to: It means that if you add new integration tests and run all tests as usual, only new cassettes will be created, all existing cassettes will stay unmodified. 
-To ignore existing cassettes and use real service, please provide a flag:: +To ignore existing cassettes and use real services, please provide a flag:: py.test --ignore-cassettes diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index ffe99837..7d399643 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -62,7 +62,10 @@ def get(self, key, **params): return self._origin.get(key, **params) def write(self, item): - """Write new element to collection.""" + """Write new element to collection. + + :param item: element data dict to write. + """ try: return self._origin.write(item) except _ValueTooLarge as exc: From c1399be162f7a22ff026b104f4a937b599bb59a9 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 10:46:17 +0300 Subject: [PATCH 11/15] Refactor and minor fixes for overview --- docs/client/overview.rst | 93 +++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 44 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index bc195c3c..6a9c2c28 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -136,12 +136,12 @@ Use ``run`` method to run a new job for project/spider:: Scheduling logic supports different options, like -- job_args to provide arguments for the job -- units to specify amount of units to run the job -- job_settings to pass additional settings for the job -- priority to set higher/lower priority of the job -- add_tag to create a job with a set of initial tags -- meta to pass additional custom metadata +- **job_args** to provide arguments for the job +- **units** to specify amount of units to run the job +- **job_settings** to pass additional settings for the job +- **priority** to set higher/lower priority of the job +- **add_tag** to create a job with a set of initial tags +- **meta** to pass additional custom metadata For example, to run a new job for a given spider with custom params:: @@ -211,8 +211,9 @@ To get jobs filtered by tags:: >>> jobs_summary = project.jobs.iter(has_tag=['new', 'verified'], lacks_tag='obsolete') -List of tags has ``OR`` power, so in the case above jobs with 'new' or -'verified' tag are expected. +List of tags in **has_tag** has ``OR`` power, so in the case above jobs with +``new`` or ``verified`` tag are expected (while list of tags in **lacks_tag** +has ``AND`` power). To get certain number of last finished jobs per some spider:: @@ -227,7 +228,7 @@ for filtering by state: - deleted Dict entries returned by ``iter`` method contain some additional meta, -but can be easily converted to ``Job`` instances with:: +but can be easily converted to :class:`~scrapinghub.client.jobs.Job` instances with:: >>> [Job(x['key']) for x in jobs] [ @@ -266,6 +267,25 @@ It's also possible to get last jobs summary (for each spider):: Note that there can be a lot of spiders, so the method above returns an iterator. + +update_tags +^^^^^^^^^^^ + +Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). + + +To mark all spider jobs with tag ``consumed``:: + + >>> spider.jobs.update_tags(add=['consumed']) + +To remove existing tag ``existing`` for all spider jobs:: + + >>> spider.jobs.update_tags(remove=['existing']) + +Modifying tags is available on :class:`~scrapinghub.client.spiders.Spider`/ +:class:`~scrapinghub.client.jobs.Job` levels. + + Job --- @@ -286,6 +306,10 @@ To delete a job:: >>> job.delete() +To mark a job with tag ``consumed``:: + + >>> job.update_tags(add=['consumed']) + .. 
_job-metadata: Metadata @@ -404,31 +428,6 @@ Or post multiple events at once:: >>> project.activity.add(events) -Settings --------- - -You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. - -To get a list of the project settings:: - - >>> project.settings.list() - [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] - -To get a project setting value by name:: - - >>> project.settings.get('job_runtime_limit') - 24 - -To update a project setting value by name:: - - >>> project.settings.set('job_runtime_limit', 20) - -Or update a few project settings at once:: - - >>> project.settings.update({'default_job_units': 1, - ... 'job_runtime_limit': 20}) - - Collections ----------- @@ -559,24 +558,30 @@ Frontiers are available on project level only. .. _job-tags: -Tags ----- -Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). +Settings +-------- -To mark a job with tag ``consumed``:: +You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. - >>> job.update_tags(add=['consumed']) +To get a list of the project settings:: -To mark all spider jobs with tag ``consumed``:: + >>> project.settings.list() + [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] - >>> spider.jobs.update_tags(add=['consumed']) +To get a project setting value by name:: -To remove existing tag ``existing`` for all spider jobs:: + >>> project.settings.get('job_runtime_limit') + 24 - >>> spider.jobs.update_tags(remove=['existing']) +To update a project setting value by name:: + + >>> project.settings.set('job_runtime_limit', 20) -Modifying tags is available on spider/job levels. +Or update a few project settings at once:: + + >>> project.settings.update({'default_job_units': 1, + ... 'job_runtime_limit': 20}) Exceptions From b06ae7059d0b9a4628e044867e11de87c7186a7e Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 11:05:19 +0300 Subject: [PATCH 12/15] Drop iter_raw_* methods --- scrapinghub/client/collections.py | 45 ------------------- scrapinghub/client/proxy.py | 26 ----------- .../test_logs/test_logs_iter_raw_json.gz | 1 - .../test_logs/test_logs_iter_raw_msgpack.gz | 1 - .../test_requests_iter_raw_json.gz | 1 - tests/client/test_items.py | 16 ------- tests/client/test_logs.py | 34 -------------- tests/client/test_requests.py | 17 ------- 8 files changed, 141 deletions(-) delete mode 100644 tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz delete mode 100644 tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz delete mode 100644 tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index ee7c4bd1..960f0bfd 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -211,51 +211,6 @@ def iter(self, key=None, prefix=None, prefixcount=None, startts=None, return self._origin._collections.iter_values( self._origin.coltype, self._origin.colname, **params) - def iter_raw_json(self, key=None, prefix=None, prefixcount=None, - startts=None, endts=None, requests_params=None, **params): - """A method to iterate through json pack-ed items. - Can be convenient if data is needed in the json format. - - :param key: a string key or a list of keys to filter with. - :param prefix: a string prefix to filter items. - :param prefixcount: maximum number of values to return per prefix. - :param startts: UNIX timestamp at which to begin results. 
- :param endts: UNIX timestamp at which to end results. - :param requests_params: (optional) a dict with optional requests params. - :param \*\*params: (optional) additional query params for the request. - :return: an iterator over items list packed with json. - :rtype: :class:`collections.Iterable[str]` - """ - update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, - startts=startts, endts=endts, - requests_params=requests_params) - params = self._collections._modify_iter_params(params) - return self._origin._collections.iter_json( - self._origin.coltype, self._origin.colname, **params) - - def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, - startts=None, endts=None, requests_params=None, - **params): - """A method to iterate through raw msgpack-ed items. - Can be convenient if data is needed in same msgpack format. - - :param key: a string key or a list of keys to filter with. - :param prefix: a string prefix to filter items. - :param prefixcount: maximum number of values to return per prefix. - :param startts: UNIX timestamp at which to begin results. - :param endts: UNIX timestamp at which to end results. - :param requests_params: (optional) a dict with optional requests params. - :param \*\*params: (optional) additional query params for the request. - :return: an iterator over items list packed with msgpack. - :rtype: :class:`collections.Iterable[bytes]` - """ - update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, - startts=startts, endts=endts, - requests_params=requests_params) - params = self._collections._modify_iter_params(params) - return self._origin._collections.iter_msgpack( - self._origin.coltype, self._origin.colname, **params) - def list(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): """Convenient shortcut to list iter results. diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index 7d399643..6f247b0d 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -112,32 +112,6 @@ def iter(self, _path=None, count=None, requests_params=None, **apiparams): apiparams = self._modify_iter_params(apiparams) return self._origin.iter_values(_path, requests_params, **apiparams) - def iter_raw_json(self, _path=None, count=None, requests_params=None, - **apiparams): - """A method to iterate through raw json-packed elements. - Can be convenient if data is needed in raw json format. - - :param count: limit amount of elements. - :return: an iterator over elements list packed with json. - :rtype: :class:`collections.Iterable[str]` - """ - update_kwargs(apiparams, count=count) - apiparams = self._modify_iter_params(apiparams) - return self._origin.iter_json(_path, requests_params, **apiparams) - - def iter_raw_msgpack(self, _path=None, count=None, requests_params=None, - **apiparams): - """A method to iterate through raw msgpack-ed elements. - Can be convenient if data is needed in same msgpack format. - - :param count: limit amount of elements. - :return: an iterator over elements list packed with msgpack. 
- :rtype: :class:`collections.Iterable[bytes]` - """ - update_kwargs(apiparams, count=count) - apiparams = self._modify_iter_params(apiparams) - return self._origin.iter_msgpack(_path, requests_params, **apiparams) - class _MappingProxy(_Proxy): """A helper class to support basic get/set interface for dict-like diff --git a/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz b/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz deleted file mode 100644 index bce71e44..00000000 --- a/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz +++ /dev/null @@ -1 +0,0 @@ -eJyll/1XE2cWx0HrG+tbt12VVbcBDY1K3g1ENCoQ3kQEJOhoO7pD8pAZCJncmQkvulGKFS2K5VWsrgiV2tquortlPSurnnOfX/av2f9gz9lnngSxLnvadZNMcu7NnTz3uffzvTPpXZaCLJuwNisrS4kbRJPChqLGdcgWYZktBcttwmr2lUb0BPMSeC8FK2zCKuaSiRQhmg4rU7DKJrzHPEHJILBahDXCdmZVakqhxbPXUidpFo/LXWxxe0u8/hKXy1JVF4IcSVjJgpqI1kk0+JUIa4VtzD7ZVFWT9jldDo+lvKHHkNW40+vY6yiCdRJf5oSk9cB6ETYI65lVGg6ThFFoKVfVdoXARklYYXpjMbUL3hfh18I6ZjbUN4UKLfUNoZr6Y03wgcS3W66y/cYNe6gnQeBDEX4jbGReKZGIKWHJLIKzTVfjsElK8mVb1EgPbE7BFp64bmhKPAq5wsfMOJ+vG5KR1PNL8tX2/ML8NrVFiTDDk3443U5fkS8/Bb/VM+eawbA1BdvSpewgui5FCWwXljGrvhZ+x1cMqxECH9X+I5nkURqBJNENsKQgzyZvFqzMl9DUNhI2ApmVCvSEwpoSkHW7wULtaRPyZdakHTZeitIkq6imnOM7hJ0iWAUPc5dJuhK2nO74pc9TRVAgCTnszGadaPbSKKskfCyCjfc+wdtm18OalGBlkpMtTo/DxZ4R0umCXZKw4XXn7BVxtk+zlrtF2MNTjJ5TEoWWCGmNmUAVpknJ9NcugkOoNYuja62G2k7inkBLUArWdRc1x6uUruLiU+XdSijS1tJVU+U2fDHDr4aPlEKoo12PNJeHaoqrG4/G1NZOzZWoIKQoFPM3gHMJIFwiuIW8t4Dotnd1ddlbVa3DntRixMycRMCTLgQ7PU64esArwl7uayckYZdiSicBn8R5XVjkKIlHDRmKRCjmXfcWgz+903RdYJ8IJcJyZu927ob9DMKVnBRW2Agc4HyYVEOAxyQ1BQ4KO01ZGkaixOn0eh3pl89d4nf5XU7WCKeWjDs404eSyRQctslMzaU2E44y21LJlYsQFLKZ3wUVkrA1k5uu2804TY3Zuczs9ZoSVeJQKUIVj94N1UsUtEaEI0Iu8xqk23AmYpIS328Jy5KmEyPQHKq0+6FWSspMYUflXIHFZUGdLjORHLPJr4VRL3M9yEwDDUwD4bNmB8J6TroVOdAo/JMFfoTXVmfty6PjwjL6Cr+hw3Qwh/a51mbRKTqPky105gBOXzToOD7Dr7bU4jR9hDP4iP7FRf9KH3fiBD4kZXQ4fOEgjuBQLo7AavwWb+LADjqNIzUVESfObyo8Jus4icPb/Hgd6FD71iC9H9+0wdGWU+vFV914bxd+W1aDj/F7P52nM+wYpQ9P0hsX6fTmA/QencPv2xP4N3xOn9IJfEpv0j/TvoPYT6dwkE7QV/TJ6T05eC0Pr0v0koqX8XN3aQ+dpZP7cYI+WIMT2bS/oYDVBY5zNJhYlLgbmvohdByazaaesL0t0JMiCL9AoKckmQ2G0+86GD6RZKblT5fQsijJTBpnZJ8kM/DPcnTdvn3we4nP3nCGF7IwEiQRWjjq5s9AWJKZKCIyE4P8hgKIfFA4sEi+bqgaG6aOzg7HG9tzhNUOZ0yN6s6fDOVDbBZrRsAFraYiolwRMleEsqQi2kRoT2dd7IZYOuuFkNeDrEOE+GLW6hJaSIgAQsF/DBdTmzElTvS3haH9vPp0EQy5Oi2gpJwrNwr/WlBC+oHjdKjOlMHGFQyxF/QK3kpalQv4AJ8y4IdxILymCB/tbMRhOoLzn0CYflZgMOi/pFfolXCl/wx9uMKHV+ksix9R6NjRw9l0uAT78HlolaMRp+i0IGVvD+B3eKfpDPbi/Y14uxMfH8Yn9DGDe5hO4uUTtA9/3Mo0Oap90IMP6CjL4wX7zR/Y6QPY25JNb9OXHpwJ4F2mihnad74IX+KzVGt+vICOnmFbeErHaN8OOrsK762kgxsEOoi3CnG8VDlxYcuuIN6ik3Qg2oWPvKYwOmWmhq7j0M3nSM8bc+Tc4hw5z+bIMbPjf7CZcKZMOJlOLshpFVx8VxX0chV8Jouc2r4MtSY7VRUhuMSgPf3/QZvxWT2VbnYwP3v3FLArhBQ420564HMT6csc6X6O9JUlkb4qwhe8LPu8MPBfiL4mwvVFogeXIPqGCF/+L0QP/TzRwyKMLBA9yonu/ynRVjpQmoO9Fgb0EM7Tv+McnZZLGLIM5014KXgEp94PGvQZ9jEqntO71XjnU7yKTw514I9RnF3+Id7AuRz8Grw4GOvJX49/aqAzOMbCv6vG6SN0tpa+zGEXhjl8sQ7nogeceQE6htMN5qVpzERr/Djc5GhNvIHWrUW0vlpA6zZH604GrT9m0Lr7rmhNcrTuZdCaegutaYaW/q5ovcanoFWJsb8DAauvzOrxxEgniVlN2Mr3mB/eCqs3uGD7yva6rL4ge8HXySThN6vsDl4374Tu12YnHf8GcstbzQ== \ No newline at end of file diff --git a/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz b/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz deleted file mode 100644 index dc115941..00000000 --- a/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz +++ /dev/null @@ -1 +0,0 @@ 
-eJyllv1TFGcSx5GoIfieixoNXlaSxT1l32UhGEyQd1F5W5LRc9Rh92FnYNnZnpflRTeGGGIiEhNEPSWK5IQ7zouVGMvEnFWp6qeu6v6i+/l6ngVBi1Qsb3Z2t7q3Z55+uj/f3vk4Pwt5HmltXl6elrKYocQsTU+ZsEKGfE8WXvJIBfSTwcw0eRmszMIqj/QyuVSmxJlhwuosvOyRVpKnVrEYFMjwirSDrHpDK3WF9roOK4YrFAiWu4LhynBFZbDM1XA4CoWKtJqCOpiRYQaskWGtVET2hx0NTTmfP+ALuWpaBy1VT/nDvr2+CKxTxDIfKMYgrJdhg7SerOpYjKWtUleNrvdqDDYq0irHm0zq/bBJhleldWS2tnRES10trdGmliMd8AdFbLdGp/2mLG90MM3gNRk2SxvJq6TTSS2mOEXw95h6CrYotli2S48PwtYsvC4SNy1DSyVgm7SLjNPFpqVYtllcWaz3FpcW9+hdWpyMUO7wB/1lkUhxFrab89c6wfBGFopypexjpqkkGOyQ8slqaYY/ihVjepzBm83/sW0RZTCwmWmBKws7PepWyU2+tKH3sJhVNb9SiZnWqClVqum1KNSbM6FYpSa95RGlqLapooY2JHYIb8vglkLkPqCYWsx1rO95X0cjUKJIhXRlp8kMb3WCKgm7ZPCI3qdF27xmzFDSVCbV7vKHfAF6xVkmAH9SpA1POuetS9E+nVrulmGPSDExpKVLXXHWnXSAKs2RMt9frww+qdkpjml0W3ovS4WqumqV2sMDkc5Ug9ZfXn60ZkCLxnu6+psaglZZ0qrQYwerIdrXa8Y7a6JN5Y1th5J6d8YIpOsYi0STFa3gXwaIgAxBaeczQAx4+/v7vd260ee1jSRzMmdxCOUKQZenmFAPhGXYK3y9jKW9SlLLMChTBK8LixxiqYSlQkSGctH1cDlU5Haaqwu8I0Ol9BLZu/27YR9BuFqQQoWNw7uCD4dqqBIxtqHBfultR5aWla70+8NhX+4sC1ZWBCoCfmqE37BTPsH0e7adhfc9Kqm52uPAccCzXHI1MtRKK8gfgDpFemM+N9P0OnGGnvQKmXlbDC2hpaBehgYRvRsalylokwwHpW3ktdiA5U8nFS21zxVTFcNkVlVntN5bAc2KrZLCDqnbJIrLg8OmSiI54lGfCKNFFXpQSQOtpIHYSacDMbMw14pCaJP+S4Fv4mhBXuNOfkXK57/iDP+ajxXyc661efz2aZzCb/kdfIzD2/ld/k9+C2djeL2gB89n+CRO8wf8Jr92gj98i3+3McMv8ksNr4b4J6eq+DD+wGfxPsO/46c9OFnPp/He6oZ+wzyB37/Dv8eJ9gRejTTyqyd38O+yoSKc7TnDf8RzfHwtPuIThfamrXzqTLaST9HyP/MRPrPhI5zll/lYKf6ykv8N7/LJNJ9r9nbzh5TxyCm69BGex2vH8PpRvIA/4CU/f9z2umcdfsWn8F8U9SXODeWfLcGRTAnVB9oFIiQaLRWEjs8g2g6dTnM/8Dwr1A9lkJ5DqEcVlQbEsRcdEH9WVNL08WU0LSsqSeSEWqaoJICTAuFgJAynFDGDY/PcsIXRoMjQJZB3bgMxRSVxxFUShbpECUzdL727qADT0g0aqr5Mn2/J9nwxvc+f1BOm/6nh/B7NZMOqCkC3o4yEUIYqlKEtq4weGXpzWYeDkMxlvRDyZKD1yZBazFpfRhNpGUCo9ukh02cm0kqs91l5GL+vQVMGS23MychWt6lt0r8X9JA7ek+04zhex8khvIXDOBzEGwT9j2f5T3wcv+ETeJ9PFOE9PoOPcI4gHcEv+BzO4V0cDZtrjHbbxFF+Gcdwsi3KH+JtnDtOdxvHG1G8sb4dP+/EmwWH+E28wC+Sxh6coXte4Hf47S14bdcWPsP4LJ+ObMR7xqpG9xG2Pc1/wpktm/nMVpzGf6zBXzfh433r+Vi2+WA7jjbn0boPPHhlM40VyKjEdH87DIipMLhkKgwtToXTNBWOOH0743EQyzqIEe0fqTmWz74oyx8LlodVWbD3ibT5t5oG5wSVDhsNdVH4lKA89v9BOe9zh+qD9CY/fYZK6J9AqTrZywZhxEH2M4HseYHs58si+4UMF0TBIgEY/Q1iL8owtkjsl8sQe0mGr56f2K9/n9hxGS4vEDshiD3+nMReWkrs2LPE7unAUaLwPo5aAtkV+4v4yDqP87dyxQHpajtcEyD9ZQlI1xdBurEA0qQA6Zt5kG7Og3TrRUGaEiDdngdpWn0al28JF/NFcXmCREm3lqRH+Sp32QF3KJRkGZZ0OwDV7HG+wnXucO2CXXZgb8BdVksn/NW2mXjQpKdv03mKudO8wvb9D/2gOpI= \ No newline at end of file diff --git a/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz b/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz deleted file mode 100644 index 85e51ebd..00000000 --- a/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz +++ /dev/null @@ -1 +0,0 @@ 
-eJyllvt3FFUSx4cgD2PAALKIBpgMCEPMPPMaw0bJOyGEBDLBDtILnZmbdCeT7ql+5AGOxABRBOUVyHFXObIGc6K7sj4OKiCeU/WLf9ZW94THYs7R407PD1M11ffWrfp+qnuyIAe+oFTk8/k03RamkrI1Q7dgmQwFwRwsD0qr+S9TWFn2CngmByuC0ip2qUJJC9OClTlYFZSeYU+TYgtYLcOz0ha2Wkyt3B+v9Hcqpj8ejdX4YxW1Fa/VxmL+1s4kFCrSSg7qEeaoMOE5GYqkErbf7Gltz/si0XDc39g9YauGHqkIV4arYY3ibXNYMSdgrQzPS2vZqk+lRNYu9zcaxrAmoFiRVrjeTMYYg3UyrJfWsNnd1ZMs93d1J9u7DvTABsU7bqPB59XtUHIiK+AFGTZKxexVstmMllLcIkSGLEOHvyiOt22/kZ6ATTl40Uvcsk1NH4TN0i42TgYsW7EdK1AbMIYD5YEho19LsxHPfyKxSFUiEcjBS9bivW4wvJyDknwpR4RlKYMCtkgFbHV1wFZvx5SRFrCt41fH8aJMAY6wbPDnoDSobpJeYV/WNIZEyq5b3GmnldW4KXWqFbI5NJQ3IaByk7YHvVLUO1xRUzvhnRB2yPCKFGd3g2JpKf+RkT969VXDTkUq5Dt7LWGG6ge5krBLhqDX+6zXtpCVMpUsl0l1+iPxcJSvtBiNwm5Fev5R50LNOp/TrWWZDK96KQ6e0LLl/rQYyLiCKs8rZbG/IRnCUodbHMscsI1hocfr+puUps7x6l69VRurqelrHNeS6aH+sfbWmF2VsRNGal89JEeGrXRvY7K9pu3g/owxMGpGs81CVCcziW6ILCGIqAwxqfQpQYyHxsbGQgOGORJyzIxwMxdpiOcLwbfrwqMHKmSo9HzDQmRDSkYbFVCleHp9uMl+oQ/aKlTLUON1vaIGEvmT5usCr8lQKy1nuyxSBntYhCs9pXBh0/BXTx+uqqHOi3FMDV6XdrhY2na2NhKpqAjnv1Wx2kQ0EY1wIyKmo4c9Tb/hODnYG1SZ5vqgK46G4FLJNcrQJC1jfxSaFenlxdwsK+TGmUYm5GEW6jK1QU2HFhlavegyaFuioO0y7JM2s9cW43Ykm1E0fY8/pSqmJey63mRLKAEdiqMyYfvVzRLH+aDTUhmSA0H1ERhdqseDygx0MwOpY24HUlZhvhWFcFB6jjPYhudX+3CylK5JBfQLXqOPOtbiVLR4JX2GP+O8H7+rjuMt9s/SDfyRZprqfHj5pTa1VODdAN2gT2mhBc/tw3/iPfqJ5vGaUTCEH+J0fH1Zh4UzDXQLF+gCza4r4jWu4vXx4zmaBbqDX9PnvjV0uZumY7SAl6vdDegMTW6vwemTuyvwa4NO01c454by6nfwB/qEPjZLcCGw7fCGJvyopYeu8k03af6dMZyme0N0d+du+mL90YEdXavC2+lMCd6kG8X0Hp3W2vnnf1rxLN1P4mQbzuHUqRKcpclxH02KU/Q5znIp4JCnGyZJ02PQMw3JQ9Drdvxw8Gl635RB+gP09ikqT40jf3ZqvKWoDPrRJUCXFZW5+ZtapahMxTFP17GaBBxXvMGcWhSTeDgvFBn6PQ7cZSClqExMWmVS1CfwEOrr0t7HWFi2YfKkDY+OhJ84XjhljEQWx6sV+Z+p/QYPa9Oui8KAi8ygh4zqIaMticyQDMP5zBNVkMln/jDk0aQbkUF/nLmxBCxZGUDa+Zvp48Kb0XRhPU2O+ft4WjLYalueMEfdrB6Utj5EJf9xOaHzhTi19wUGJYcX6Ru8je/RF/jlAfwOL+L1FXStdscunKFvcWrrplJ8cHL5uo1ButxHD2ie7qxJyPTzlrX0Zd9R+pbePxKmyb3NtEA3ca69/1X8vrKaruP1UvqK/oEz+AHexSk8/2LtPobpNpM5U1zEUM7jfU5jCm/Rh3S3GO9w7KUO/PcGnMYHDO/Zyl1DJbzZ3J4jeINRupI6WsQI/kK3aI5ub2zEq0zVg0CCvqFzeMlghn7qp3/xMvdwtpEulNIM3m/o3I0X6Me3gO7jObqk0Gd0Bc/gdJL+fvxtH10cxnN4Fq+49IyqjMzYIRj3JtHEE5PoxONJdJIn0QFXEm8HXQXnXAUzTO+oeVRO/VlUJj1U3lVlT9pTi9J2xdXanITTrOyG/0fZ/CxR6o4Niwk44zjCe8Xg9y7LfX6d7VjmhP8LWciWDQ== \ No newline at end of file diff --git a/tests/client/test_items.py b/tests/client/test_items.py index 38af756a..1dfaeaf1 100644 --- a/tests/client/test_items.py +++ b/tests/client/test_items.py @@ -1,10 +1,6 @@ -import json - import pytest from six.moves import range -from scrapinghub.hubstorage.serialization import mpdecode - def _add_test_items(job): for i in range(3): @@ -29,18 +25,6 @@ def test_items_iter(spider): with pytest.raises(StopIteration): next(o) - o = job.items.iter_raw_json(offset=2) - item = json.loads(next(o)) - assert item['id'] == 2 - assert item['data'] == 'data2' - with pytest.raises(StopIteration): - next(o) - - msgpacked_o = job.items.iter_raw_msgpack(offset=2) - o = mpdecode(msgpacked_o) - assert item['id'] == 2 - assert item['data'] == 'data2' - def test_items_list(spider): job = spider.jobs.run(meta={'state': 'running'}) diff --git a/tests/client/test_logs.py b/tests/client/test_logs.py index 52b42ec3..88cb4b85 100644 --- a/tests/client/test_logs.py +++ b/tests/client/test_logs.py @@ -1,11 +1,9 @@ -import json import types from numbers import Integral import pytest from scrapinghub.client.utils import LogLevel -from scrapinghub.hubstorage.serialization import mpdecode from .conftest import TEST_TS @@ -103,35 +101,3 @@ def test_logs_list_filter(spider): logs3 = 
job.logs.list(filter=[('message', 'contains', ['simple'])]) assert len(logs3) == 3 - - -def test_logs_iter_raw_json(spider): - job = spider.jobs.run() - _add_test_logs(job) - - logs0 = job.logs.iter_raw_json(offset=2) - raw_log0 = next(logs0) - log0 = json.loads(raw_log0) - assert log0.get('message') == 'simple-msg3' - assert log0.get('_key') - assert isinstance(log0.get('time'), Integral) - assert log0.get('level') == 10 - - logs1 = job.logs.iter_raw_json(level='ERROR') - raw_log1 = next(logs1) - log1 = json.loads(raw_log1) - assert log1.get('message') == 'error-msg' - - -def test_logs_iter_raw_msgpack(spider): - job = spider.jobs.run() - _add_test_logs(job) - - logs1 = job.logs.iter_raw_msgpack(offset=2) - assert isinstance(logs1, types.GeneratorType) - unpacked_logs1 = list(mpdecode(logs1)) - assert unpacked_logs1[0].get('message') == 'simple-msg3' - - logs2 = job.logs.iter_raw_msgpack(level='ERROR') - unpacked_logs2 = list(mpdecode(logs2)) - assert unpacked_logs2[0].get('message') == 'error-msg' diff --git a/tests/client/test_requests.py b/tests/client/test_requests.py index 1d2e3bca..a71b3820 100644 --- a/tests/client/test_requests.py +++ b/tests/client/test_requests.py @@ -1,5 +1,3 @@ -import json - import pytest from .conftest import TEST_TS @@ -39,18 +37,3 @@ def test_requests_iter(spider): } with pytest.raises(StopIteration): next(rr) - - -def test_requests_iter_raw_json(spider): - job = spider.jobs.run() - _add_test_requests(job) - job.requests.close() - - rr = job.requests.iter_raw_json() - raw_req = next(rr) - req = json.loads(raw_req) - assert req.get('url') == 'http://test.com/' - assert req.get('status') == 200 - next(rr), next(rr) - with pytest.raises(StopIteration): - next(rr) From efa665ae4a1921d1eba65ede3803110b23ee7867 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 11:52:26 +0300 Subject: [PATCH 13/15] Minor docstring fixes: links and formatting --- scrapinghub/client/__init__.py | 4 ++-- scrapinghub/client/collections.py | 4 ++-- scrapinghub/client/frontiers.py | 20 +++++--------------- scrapinghub/client/items.py | 6 +++--- scrapinghub/client/jobs.py | 21 +++++++++++---------- scrapinghub/client/logs.py | 2 +- scrapinghub/client/requests.py | 2 +- 7 files changed, 25 insertions(+), 34 deletions(-) diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index 0cf5d080..5e9fbafa 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -70,9 +70,9 @@ def get_project(self, project_id): return self.projects.get(parse_project_id(project_id)) def get_job(self, job_key): - """Get Job with a given job key. + """Get :class:`~scrapinghub.client.jobs.Job` with a given job key. - :param job_key: job key string in format 'project_id/spider_id/job_id', + :param job_key: job key string in format ``project_id/spider_id/job_id``, where all the components are integers. :return: a job instance. :rtype: :class:`~scrapinghub.client.jobs.Job` diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 960f0bfd..afa72917 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -27,7 +27,7 @@ class Collections(_Proxy, _DownloadableProxyMixin): def get(self, type_, name): """Base method to get a collection with a given type and name. - :param type_: a collection type string. + :param `type_`: a collection type string. :param name: a collection name string. :return: a collection object. 
:rtype: :class:`Collection` @@ -248,7 +248,7 @@ def create_writer(self, start=0, auth=None, size=1000, interval=15, :param maxitemsize: (optional) max item size in bytes. :param callback: (optional) some callback function. :return: a new writer object. - :rtype: :class:`~scrapinghub.hubstorage.batchuploader._BatchWriter` + :rtype: :class:`scrapinghub.hubstorage.batchuploader._BatchWriter` If provided - calllback shouldn't try to inject more items in the queue, otherwise it can lead to deadlocks. diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index a6c2f30e..f4d63887 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -113,11 +113,7 @@ def list(self): @property def newcount(self): - """Amount of new entries added to all frontiers. - - :return: amount of new entries. - :rtype: :class:`int` - """ + """Integer amount of new entries added to all frontiers.""" return sum(self._origin.newcount.values()) def flush(self): @@ -199,11 +195,7 @@ def flush(self): @property def newcount(self): - """Amount of new entries added to frontier. - - :return: amount of new entries. - :rtype: :class:`int` - """ + """Integer amount of new entries added to frontier.""" newcount_values = self._frontiers._origin.newcount return sum(v for (frontier, _), v in newcount_values.items() if frontier == self.key) @@ -298,16 +290,13 @@ def flush(self): @property def newcount(self): - """Amount of new entries added to slot. - - :return: amount of new entries. - :rtype: :class:`int` - """ + """Integer amount of new entries added to slot.""" newcount_values = self._frontier._frontiers._origin.newcount return newcount_values.get((self._frontier.key, self.key), 0) class FrontierSlotFingerprints(object): + """Representation of request fingerprints collection stored in slot.""" def __init__(self, slot): self.key = slot.key @@ -350,6 +339,7 @@ def list(self, **params): class FrontierSlotQueue(object): + """Representation of request batches queue stored in slot.""" def __init__(self, slot): self.key = slot.key diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 45d0e7d1..0ffeedfb 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -6,9 +6,9 @@ class Items(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job items. - Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instanc - e to get a :class:`Items` instance. - See :attr:`~scrapinghub.client.jobs.Job.items` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` + instance to get a :class:`Items` instance. See + :attr:`~scrapinghub.client.jobs.Job.items` attribute. Please note that :meth:`list` method can use a lot of memory and for a large amount of logs it's recommended to iterate through it via diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index a022e85e..1996264e 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -24,7 +24,7 @@ class Jobs(object): and :attr:`scrapinghub.client.spiders.Spider.jobs` attributes. :ivar project_id: a string project id. - :ivar spider: :class:`Spider` object if defined. + :ivar spider: :class:`~scrapinghub.client.spiders.Spider` object if defined. Usage:: @@ -112,16 +112,16 @@ def iter(self, count=None, start=None, spider=None, state=None, >>> [job['key'] for job in jobs_summary] ['123/1/3', '123/1/2', '123/1/1'] - - job summary fieldset is less detailed than job.metadata but contains - few new fields as well. 
Additional fields can be requested using - ``meta`` parameter. If it's used, then it's up to the user to list - all the required fields, so only few default fields would be added - except requested ones:: + - job summary fieldset is less detailed than :class:`JobMeta` but + contains a few new fields as well. Additional fields can be requested + using ``meta`` parameter. If it's used, then it's up to the user to + list all the required fields, so only few default fields would be + added except requested ones:: >>> jobs_summary = project.jobs.iter(meta=['scheduled_by', ]) - by default :meth:`Jobs.iter` returns maximum last 1000 results. - Pagination is available using start parameter:: + Pagination is available using start parameter:: >>> jobs_summary = spider.jobs.iter(start=1000) @@ -227,13 +227,14 @@ def run(self, spider=None, units=None, priority=None, meta=None, return Job(self._client, response['jobid']) def get(self, job_key): - """Get a Job with a given job_key. + """Get a :class:`Job` with a given job_key. :param job_key: a string job key. job_key's project component should match the project used to get :class:`Jobs` instance, and job_key's spider component should match - the spider (if :attr:`Spider.jobs` was used). + the spider (if :class:`~scrapinghub.client.spiders.Spider` was used + to get :class:`Jobs` instance). :return: a job object. :rtype: :class:`Job` @@ -509,7 +510,7 @@ class JobMeta(_MappingProxy): """Class representing job metadata. Not a public constructor: use :class:`Job` instance to get a - :class:`JobMeta` instance. See :attr:`Job.metadata` attribute. + :class:`JobMeta` instance. See :attr:`~Job.metadata` attribute. Usage: diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 130e5720..261934a9 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -54,7 +54,7 @@ def log(self, message, level=logging.INFO, ts=None, **other): :param message: a string message. :param level: (optional) logging level, default to INFO. - :param ts: (optional) unix timestamp in milliseconds. + :param ts: (optional) UNIX timestamp in milliseconds. :param \*\*other: other optional kwargs. """ self._origin.log(message, level=level, ts=ts, **other) diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 13acf6e6..7f5428ef 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -50,7 +50,7 @@ def add(self, url, status, method, rs, parent, duration, ts, fp=None): :param rs: response body length. :param parent: parent request id or ``None``. :param duration: request duration in milliseconds. - :param ts: unix timestamp in milliseconds. + :param ts: UNIX timestamp in milliseconds. :param fp: (optional) string fingerprint for the request. 
""" return self._origin.add( From 4acc40efcb5fd6fc6a952082de0b334163a72a10 Mon Sep 17 00:00:00 2001 From: Alexander Chekunkov Date: Wed, 29 Mar 2017 11:35:14 +0100 Subject: [PATCH 14/15] Fix Python examples in documentation --- docs/client/overview.rst | 16 ++++++++-------- docs/legacy/hubstorage.rst | 10 +++++----- docs/quickstart.rst | 2 +- scrapinghub/client/activity.py | 10 +++++----- scrapinghub/client/collections.py | 4 ++-- scrapinghub/client/items.py | 2 +- scrapinghub/client/jobs.py | 2 +- scrapinghub/client/logs.py | 2 +- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index 6a9c2c28..126e4fee 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -145,8 +145,8 @@ Scheduling logic supports different options, like For example, to run a new job for a given spider with custom params:: - >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, - priority=1, add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) + >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, priority=1, + ... add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) Note that if you run a job on project level, spider name is required:: @@ -192,7 +192,7 @@ ones:: >>> job_summary = next(project.jobs.iter()) >>> job_summary.get('spider', 'missing') 'foo' - >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by', ]) + >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by']) >>> job_summary = next(jobs_summary) >>> job_summary.get('scheduled_by', 'missing') 'John' @@ -227,10 +227,10 @@ for filtering by state: - finished - deleted -Dict entries returned by ``iter`` method contain some additional meta, +Dictionary entries returned by ``iter`` method contain some additional meta, but can be easily converted to :class:`~scrapinghub.client.jobs.Job` instances with:: - >>> [Job(x['key']) for x in jobs] + >>> [Job(client, x['key']) for x in jobs] [ , , @@ -422,9 +422,9 @@ To post a new activity event:: Or post multiple events at once:: >>> events = [ - {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'}, - {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, - ] + ... {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'}, + ... {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, + ... ] >>> project.activity.add(events) diff --git a/docs/legacy/hubstorage.rst b/docs/legacy/hubstorage.rst index 5a024faa..d1e06ea4 100644 --- a/docs/legacy/hubstorage.rst +++ b/docs/legacy/hubstorage.rst @@ -130,7 +130,7 @@ If it used, then it's up to the user to list all the required fields, so only fe >>> metadata = next(project.jobq.list()) >>> metadata.get('spider', 'missing') u'foo' - >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by', ]) + >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by']) >>> metadata = next(jobs_metadata) >>> metadata.get('scheduled_by', 'missing') u'John' @@ -150,7 +150,7 @@ List of tags has ``OR`` power, so in the case above jobs with 'new' or 'verified To get certain number of last finished jobs per some spider:: - >>> jobs_metadata = project.jobq.list(spider='foo', state='finished' count=3) + >>> jobs_metadata = project.jobq.list(spider='foo', state='finished', count=3) There are 4 possible job states, which can be used as values for filtering by state: @@ -167,7 +167,7 @@ To iterate through items:: >>> items = job.items.iter_values() >>> for item in items: - # do something, item is just a dict + ... 
# do something, item is just a dict Logs ^^^^ @@ -176,7 +176,7 @@ To iterate through 10 first logs for example:: >>> logs = job.logs.iter_values(count=10) >>> for log in logs: - # do something, log is a dict with log level, message and time keys + ... # do something, log is a dict with log level, message and time keys Collections ^^^^^^^^^^^ @@ -246,4 +246,4 @@ Module contents :undoc-members: :show-inheritance: -.. _scrapinghub.ScrapinghubClient: ../client/overview.html +.. _scrapinghub.ScrapinghubClient: ../client/overview.html diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 3fced7e0..426e0475 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -36,7 +36,7 @@ Work with your projects:: Run new jobs from the client:: >>> project = client.get_project(123) - >>> project.jobs.run('spider1', job_args={'arg1':'val1'}) + >>> project.jobs.run('spider1', job_args={'arg1': 'val1'}) > Access your jobs data:: diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 665ee3b6..b5d1777f 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -31,16 +31,16 @@ class Activity(_Proxy): - post a new event:: >>> event = {'event': 'job:completed', - 'job': '123/2/4', - 'user': 'jobrunner'} + ... 'job': '123/2/4', + ... 'user': 'jobrunner'} >>> project.activity.add(event) - post multiple events at once:: >>> events = [ - {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'}, - {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, - ] + ... {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'}, + ... {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, + ... ] >>> project.activity.add(events) """ diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index afa72917..10f78e2e 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -108,7 +108,7 @@ class Collection(object): - add a new item to collection:: >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', - 'value': '1447221694537'}) + ... 'value': '1447221694537'}) - count items in collection:: @@ -128,7 +128,7 @@ class Collection(object): - iterate iterate over _key & value pair:: >>> for elem in foo_store.iter(count=1)): - >>> ... print(elem) + ... print(elem) [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}] - filter by multiple keys, only values for keys that exist will be returned:: diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 0ffeedfb..c3d5828a 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -25,7 +25,7 @@ class Items(_ItemsResourceProxy, _DownloadableProxyMixin): - iterate through first 100 items and print them:: >>> for log in job.logs.iter(count=100): - >>> ... print(log) + ... 
print(log) - retrieve items with timestamp greater or equal to given timestamp (item here is an arbitrary dictionary depending on your code):: diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index 1996264e..f6813976 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -540,7 +540,7 @@ class JobMeta(_MappingProxy): - update multiple meta fields at once - >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2}) + >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2'}) - delete meta field by name:: diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 261934a9..2c68d800 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -28,7 +28,7 @@ class Logs(_ItemsResourceProxy, _DownloadableProxyMixin): - iterate through first 100 log entries and print them:: >>> for log in job.logs.iter(count=100): - >>> ... print(log) + ... print(log) - retrieve a single log entry from a job:: From 350472e456cfc2c8e578f237e82e8892f0d255ce Mon Sep 17 00:00:00 2001 From: Alexander Chekunkov Date: Wed, 29 Mar 2017 11:44:53 +0100 Subject: [PATCH 15/15] Fix typo in Frontier.iter() rtype --- scrapinghub/client/frontiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index f4d63887..72f4edd4 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -174,7 +174,7 @@ def iter(self): """Iterate through slots. :return: an iterator over frontier slots names. - :rtype: :class:`collections.Iterate[str]` + :rtype: :class:`collections.Iterable[str]` """ return iter(self.list())
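
As a rough end-to-end check of the behaviour this series settles on (``count``-limited
iteration on the downloadable proxies, ``update_tags`` on the spider/job level, and the
corrected doctest style), a session along the following lines should work. It is only a
sketch: the project id ``123``, the spider name ``'spider1'`` and the job arguments are
placeholders, and ``client`` is assumed to be an already constructed Scrapinghub client
as in the quickstart examples::

    >>> project = client.get_project(123)        # placeholder project id
    >>> job = project.jobs.run('spider1', job_args={'arg1': 'val1'})
    >>> job.update_tags(add=['consumed'])
    >>> # once the job has finished, iterate a bounded slice of its output;
    >>> # the count parameter is the one introduced earlier in this series
    >>> for item in job.items.iter(count=10):
    ...     print(item)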