diff --git a/docs/client/overview.rst b/docs/client/overview.rst
index 2b32314f..126e4fee 100644
--- a/docs/client/overview.rst
+++ b/docs/client/overview.rst
@@ -20,7 +20,8 @@ for access to client projects.
 Projects
 --------
 
-You can list the projects available to your account::
+You can list the projects available to your account via
+:class:`~scrapinghub.client.projects.Projects`::
 
     >>> client.projects.list()
     [123, 456]
@@ -67,31 +68,6 @@ For example, to schedule a spider run (it returns a
     <scrapinghub.client.jobs.Job at 0x...>
 
-Settings
---------
-
-You can work with project settings via :class:`~scrapinghub.client.projects.Settings`.
-
-To get a list of the project settings::
-
-    >>> project.settings.list()
-    [(u'default_job_units', 2), (u'job_runtime_limit', 24)]]
-
-To get a project setting value by name::
-
-    >>> project.settings.get('job_runtime_limit')
-    24
-
-To update a project setting value by name::
-
-    >>> project.settings.set('job_runtime_limit', 20)
-
-Or update a few project settings at once::
-
-    >>> project.settings.update({'default_job_units': 1,
-    ...                          'job_runtime_limit': 20})
-
-
 Spiders
 -------
 
@@ -160,17 +136,17 @@ Use ``run`` method to run a new job for project/spider::
 
 Scheduling logic supports different options, like
 
-- job_args to provide arguments for the job
-- units to specify amount of units to run the job
-- job_settings to pass additional settings for the job
-- priority to set higher/lower priority of the job
-- add_tag to create a job with a set of initial tags
-- meta to pass additional custom metadata
+- **job_args** to provide arguments for the job
+- **units** to specify amount of units to run the job
+- **job_settings** to pass additional settings for the job
+- **priority** to set higher/lower priority of the job
+- **add_tag** to create a job with a set of initial tags
+- **meta** to pass additional custom metadata
 
 For example, to run a new job for a given spider with custom params::
 
-    >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'},
-        priority=1, add_tag=['tagA','tagB'], meta={'custom-data': 'val1'})
+    >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, priority=1,
+    ...                       add_tag=['tagA','tagB'], meta={'custom-data': 'val1'})
 
 Note that if you run a job on project level, spider name is required::
 
@@ -216,7 +192,7 @@ ones::
 
     >>> job_summary = next(project.jobs.iter())
     >>> job_summary.get('spider', 'missing')
     'foo'
-    >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by', ])
+    >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by'])
     >>> job_summary = next(jobs_summary)
     >>> job_summary.get('scheduled_by', 'missing')
     'John'
@@ -235,8 +211,9 @@ To get jobs filtered by tags::
 
     >>> jobs_summary = project.jobs.iter(has_tag=['new', 'verified'], lacks_tag='obsolete')
 
-List of tags has ``OR`` power, so in the case above jobs with 'new' or
-'verified' tag are expected.
+Tags in **has_tag** are combined with ``OR``, so in the case above jobs with
+either the ``new`` or the ``verified`` tag are expected (tags in **lacks_tag**
+are combined with ``AND``).
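+
+For example, to require *both* tags (``AND`` semantics over **has_tag**), one
+option is to filter client-side (a sketch, assuming the summary dicts expose a
+``tags`` list when requested via ``jobmeta``)::
+
+    >>> summaries = project.jobs.iter(has_tag='new', jobmeta=['key', 'tags'])
+    >>> both = [s for s in summaries if 'verified' in s.get('tags', [])]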
 To get certain number of last finished jobs per some spider::
 
@@ -250,10 +227,10 @@ for filtering by state:
 - finished
 - deleted
 
-Dict entries returned by ``iter`` method contain some additional meta,
-but can be easily converted to ``Job`` instances with::
+Dictionary entries returned by ``iter`` method contain some additional meta,
+but can be easily converted to :class:`~scrapinghub.client.jobs.Job` instances with::
 
-    >>> [Job(x['key']) for x in jobs]
+    >>> [Job(client, x['key']) for x in jobs]
     [
      <scrapinghub.client.jobs.Job at 0x...>,
      <scrapinghub.client.jobs.Job at 0x...>,
@@ -290,6 +267,25 @@ It's also possible to get last jobs summary (for each spider)::
 
 Note that there can be a lot of spiders, so the method above returns an iterator.
 
+
+update_tags
+^^^^^^^^^^^
+
+Tags are a convenient way to mark specific jobs (for better search, postprocessing, etc.).
+
+
+To mark all spider jobs with tag ``consumed``::
+
+    >>> spider.jobs.update_tags(add=['consumed'])
+
+To remove an existing tag ``existing`` from all spider jobs::
+
+    >>> spider.jobs.update_tags(remove=['existing'])
+
+Modifying tags is available at the :class:`~scrapinghub.client.spiders.Spider`/
+:class:`~scrapinghub.client.jobs.Job` levels.
+
+
 Job
 ---
 
@@ -310,6 +306,10 @@ To delete a job::
 
     >>> job.delete()
 
+To mark a job with tag ``consumed``::
+
+    >>> job.update_tags(add=['consumed'])
+
 .. _job-metadata:
 
 Metadata
@@ -422,13 +422,12 @@ To post a new activity event::
 
 Or post multiple events at once::
 
     >>> events = [
-        {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'},
-        {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'},
-    ]
+    ...     {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'},
+    ...     {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'},
+    ... ]
     >>> project.activity.add(events)
 
 
-
 Collections
 -----------
@@ -559,24 +558,30 @@ Frontiers are available on project level only.
 
 .. _job-tags:
 
-Tags
-----
 
-Tags is a convenient way to mark specific jobs (for better search, postprocessing etc).
+Settings
+--------
 
-To mark a job with tag ``consumed``::
+You can work with project settings via :class:`~scrapinghub.client.projects.Settings`.
 
-    >>> job.update_tags(add=['consumed'])
+To get a list of the project settings::
 
-To mark all spider jobs with tag ``consumed``::
+    >>> project.settings.list()
+    [(u'default_job_units', 2), (u'job_runtime_limit', 24)]
 
-    >>> spider.jobs.update_tags(add=['consumed'])
+To get a project setting value by name::
 
-To remove existing tag ``existing`` for all spider jobs::
+    >>> project.settings.get('job_runtime_limit')
+    24
 
-    >>> spider.jobs.update_tags(remove=['existing'])
+To update a project setting value by name::
+
+    >>> project.settings.set('job_runtime_limit', 20)
 
-Modifying tags is available on spider/job levels.
+Or update a few project settings at once::
+
+    >>> project.settings.update({'default_job_units': 1,
+    ...                          'job_runtime_limit': 20})
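+
+Or delete a setting by name (a sketch: ``delete`` is provided by the same
+mapping interface as ``get``/``set`` above)::
+
+    >>> project.settings.delete('job_runtime_limit')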
 
 
 Exceptions
diff --git a/docs/legacy/hubstorage.rst b/docs/legacy/hubstorage.rst
index 5a024faa..d1e06ea4 100644
--- a/docs/legacy/hubstorage.rst
+++ b/docs/legacy/hubstorage.rst
@@ -130,7 +130,7 @@ If it used, then it's up to the user to list all the required fields, so only fe
     >>> metadata = next(project.jobq.list())
     >>> metadata.get('spider', 'missing')
     u'foo'
-    >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by', ])
+    >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by'])
     >>> metadata = next(jobs_metadata)
     >>> metadata.get('scheduled_by', 'missing')
     u'John'
@@ -150,7 +150,7 @@ List of tags has ``OR`` power, so in the case above jobs with 'new' or 'verified
 
 To get certain number of last finished jobs per some spider::
 
-    >>> jobs_metadata = project.jobq.list(spider='foo', state='finished' count=3)
+    >>> jobs_metadata = project.jobq.list(spider='foo', state='finished', count=3)
 
 There are 4 possible job states, which can be used as values
 for filtering by state:
@@ -167,7 +167,7 @@ To iterate through items::
 
     >>> items = job.items.iter_values()
     >>> for item in items:
-        # do something, item is just a dict
+    ...     # do something, item is just a dict
 
 Logs
 ^^^^
@@ -176,7 +176,7 @@ To iterate through 10 first logs for example::
 
     >>> logs = job.logs.iter_values(count=10)
     >>> for log in logs:
-        # do something, log is a dict with log level, message and time keys
+    ...     # do something, log is a dict with log level, message and time keys
 
 Collections
 ^^^^^^^^^^^
@@ -246,4 +246,4 @@ Module contents
    :undoc-members:
    :show-inheritance:
 
-.. _scrapinghub.ScrapinghubClient: ../client/overview.html
+.. _scrapinghub.ScrapinghubClient: ../client/overview.html
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
index f484bd35..426e0475 100644
--- a/docs/quickstart.rst
+++ b/docs/quickstart.rst
@@ -36,7 +36,7 @@ Work with your projects::
 Run new jobs from the client::
 
     >>> project = client.get_project(123)
-    >>> project.jobs.run('spider1', job_args={'arg1':'val1'})
+    >>> project.jobs.run('spider1', job_args={'arg1': 'val1'})
     <scrapinghub.client.jobs.Job at 0x...>
 
 Access your jobs data::
@@ -69,7 +69,7 @@ By default, tests use VCR.py ``once`` mode to:
 It means that if you add new integration tests and run all tests as usual,
 only new cassettes will be created, all existing cassettes will stay unmodified.
 
-To ignore existing cassettes and use real service, please provide a flag::
+To ignore existing cassettes and use real services, please provide a flag::
 
     py.test --ignore-cassettes
diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py
index a751075c..5e9fbafa 100644
--- a/scrapinghub/client/__init__.py
+++ b/scrapinghub/client/__init__.py
@@ -1,9 +1,8 @@
 from scrapinghub import Connection as _Connection
 from scrapinghub import HubstorageClient as _HubstorageClient
 
+from .exceptions import _wrap_http_errors
 from .projects import Projects
-from .exceptions import wrap_http_errors
-
 from .utils import parse_auth
 from .utils import parse_project_id, parse_job_key
 
@@ -13,14 +12,14 @@ class Connection(_Connection):
 
-    @wrap_http_errors
+    @_wrap_http_errors
     def _request(self, *args, **kwargs):
         return super(Connection, self)._request(*args, **kwargs)
 
 
 class HubstorageClient(_HubstorageClient):
 
-    @wrap_http_errors
+    @_wrap_http_errors
     def request(self, *args, **kwargs):
         return super(HubstorageClient, self).request(*args, **kwargs)
 
@@ -71,9 +70,9 @@ def get_project(self, project_id):
         return self.projects.get(parse_project_id(project_id))
 
     def get_job(self, job_key):
-        """Get Job with a given job key.
+ """Get :class:`~scrapinghub.client.jobs.Job` with a given job key. - :param job_key: job key string in format 'project_id/spider_id/job_id', + :param job_key: job key string in format ``project_id/spider_id/job_id``, where all the components are integers. :return: a job instance. :rtype: :class:`~scrapinghub.client.jobs.Job` diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index bb492fed..b5d1777f 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,7 +1,7 @@ from __future__ import absolute_import -from .utils import _Proxy -from .utils import parse_job_key +from .proxy import _Proxy +from .utils import parse_job_key, update_kwargs class Activity(_Proxy): @@ -31,23 +31,29 @@ class Activity(_Proxy): - post a new event:: >>> event = {'event': 'job:completed', - 'job': '123/2/4', - 'user': 'jobrunner'} + ... 'job': '123/2/4', + ... 'user': 'jobrunner'} >>> project.activity.add(event) - post multiple events at once:: >>> events = [ - {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'}, - {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, - ] + ... {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'}, + ... {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, + ... ] >>> project.activity.add(events) """ - def __init__(self, *args, **kwargs): - super(Activity, self).__init__(*args, **kwargs) - self._proxy_methods([('iter', 'list')]) - self._wrap_iter_methods(['iter']) + def iter(self, count=None, **params): + """Iterate over activity events. + + :param count: limit amount of elements. + :return: a generator object over a list of activity event dicts. + :rtype: :class:`types.GeneratorType[dict]` + """ + update_kwargs(params, count=count) + params = self._modify_iter_params(params) + return self._origin.list(**params) def add(self, values, **kwargs): """Add new event to the project activity. diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index c135bd39..10f78e2e 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,12 +5,11 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .utils import ( - _Proxy, format_iter_filters, proxy_methods, wrap_kwargs, update_kwargs, -) +from .proxy import _Proxy, _DownloadableProxyMixin +from .utils import update_kwargs -class Collections(_Proxy): +class Collections(_Proxy, _DownloadableProxyMixin): """Access to project collections. Not a public constructor: use :class:`~scrapinghub.client.projects.Project` @@ -28,7 +27,7 @@ class Collections(_Proxy): def get(self, type_, name): """Base method to get a collection with a given type and name. - :param type_: a collection type string. + :param `type_`: a collection type string. :param name: a collection name string. :return: a collection object. :rtype: :class:`Collection` @@ -109,7 +108,7 @@ class Collection(object): - add a new item to collection:: >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', - 'value': '1447221694537'}) + ... 'value': '1447221694537'}) - count items in collection:: @@ -129,7 +128,7 @@ class Collection(object): - iterate iterate over _key & value pair:: >>> for elem in foo_store.iter(count=1)): - >>> ... print(elem) + ... 
print(elem) [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}] - filter by multiple keys, only values for keys that exist will be returned:: @@ -144,43 +143,8 @@ class Collection(object): def __init__(self, client, collections, type_, name): self._client = client + self._collections = collections self._origin = _Collection(type_, name, collections._origin) - proxy_methods(self._origin, self, [ - 'create_writer', 'count', - ('iter', 'iter_values'), - ('iter_raw_json', 'iter_json'), - ]) - # simplified version of _Proxy._wrap_iter_methods logic - # to provide better support for filter param in iter methods - for method in ['iter', 'iter_raw_json']: - wrapped = wrap_kwargs(getattr(self, method), format_iter_filters) - setattr(self, method, wrapped) - - def list(self, key=None, prefix=None, prefixcount=None, startts=None, - endts=None, requests_params=None, **params): - """Convenient shortcut to list iter results. - - Please note that :meth:`list` method can use a lot of memory and for a - large amount of logs it's recommended to iterate through it - via :meth:`iter` method (all params and available filters are same for - both methods). - - :param key: a string key or a list of keys to filter with. - :param prefix: a string prefix to filter items. - :param prefixcount: maximum number of values to return per prefix. - :param startts: UNIX timestamp at which to begin results. - :param endts: UNIX timestamp at which to end results. - :param requests_params: (optional) a dict with optional requests params. - :param \*\*params: (optional) additional query params for the request. - :return: a list of items where each item is represented with a dict. - :rtype: :class:`list[dict]` - """ - # FIXME there should be similar docstrings for iter/iter_raw_json - # but as we proxy them as-is, it's not in place, should be improved - update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, - startts=startts, endts=endts, - requests_params=requests_params) - return list(self.iter(requests_params=None, **params)) def get(self, key, **params): """Get item from collection by key. @@ -216,11 +180,19 @@ def delete(self, keys): "object providing string keys") self._origin.delete(keys) - def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, - startts=None, endts=None, requests_params=None, - **params): - """A method to iterate through raw msgpack-ed items. - Can be convenient if data is needed in same msgpack format. + def count(self, *args, **kwargs): + """Count collection items with a given filters. + + :return: amount of elements in collection. + :rtype: :class:`int` + """ + # TODO describe allowable params + return self._origin._collections.count( + self._origin.coltype, self._origin.colname, *args, **kwargs) + + def iter(self, key=None, prefix=None, prefixcount=None, startts=None, + endts=None, requests_params=None, **params): + """A method to iterate through collection items. :param key: a string key or a list of keys to filter with. :param prefix: a string prefix to filter items. @@ -229,11 +201,61 @@ def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, :param endts: UNIX timestamp at which to end results. :param requests_params: (optional) a dict with optional requests params. :param \*\*params: (optional) additional query params for the request. - :return: an iterator over items list packed with msgpack. - :rtype: :class:`collections.Iterable[bytes]` + :return: an iterator over items list. 
+        :rtype: :class:`collections.Iterable[dict]`
         """
         update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount,
                       startts=startts, endts=endts,
                       requests_params=requests_params)
-        return self._origin._collections.iter_msgpack(
+        params = self._collections._modify_iter_params(params)
+        return self._origin._collections.iter_values(
             self._origin.coltype, self._origin.colname, **params)
+
+    def list(self, key=None, prefix=None, prefixcount=None, startts=None,
+             endts=None, requests_params=None, **params):
+        """Convenient shortcut to list iter results.
+
+        Please note that :meth:`list` method can use a lot of memory and for a
+        large amount of items it's recommended to iterate through them
+        via :meth:`iter` method (all params and available filters are same for
+        both methods).
+
+        :param key: a string key or a list of keys to filter with.
+        :param prefix: a string prefix to filter items.
+        :param prefixcount: maximum number of values to return per prefix.
+        :param startts: UNIX timestamp at which to begin results.
+        :param endts: UNIX timestamp at which to end results.
+        :param requests_params: (optional) a dict with optional requests params.
+        :param \*\*params: (optional) additional query params for the request.
+        :return: a list of items where each item is represented with a dict.
+        :rtype: :class:`list[dict]`
+        """
+        update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount,
+                      startts=startts, endts=endts)
+        return list(self.iter(requests_params=requests_params, **params))
+
+    def create_writer(self, start=0, auth=None, size=1000, interval=15,
+                      qsize=None, content_encoding='identity',
+                      maxitemsize=1024 ** 2, callback=None):
+        """Create a new writer for a collection.
+
+        :param start: (optional) initial offset for writer thread.
+        :param auth: (optional) set auth credentials for the request.
+        :param size: (optional) set initial queue size.
+        :param interval: (optional) set interval for writer thread.
+        :param qsize: (optional) setup max queue size for the writer.
+        :param content_encoding: (optional) set different Content-Encoding header.
+        :param maxitemsize: (optional) max item size in bytes.
+        :param callback: (optional) some callback function.
+        :return: a new writer object.
+        :rtype: :class:`scrapinghub.hubstorage.batchuploader._BatchWriter`
+
+        If provided, the callback shouldn't try to inject more items into the
+        queue, otherwise it can lead to deadlocks.
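+
+        Usage sketch (``foo_store`` is a hypothetical collection here, and the
+        returned writer is assumed to expose at least ``write()`` and
+        ``close()``)::
+
+            >>> writer = foo_store.create_writer(size=500)
+            >>> writer.write({'_key': 'key1', 'value': 'data1'})
+            >>> writer.close()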
+ """ + kwargs = {} + update_kwargs(kwargs, start=start, auth=auth, size=size, interval=interval, + qsize=qsize, content_encoding=content_encoding, + maxitemsize=maxitemsize, callback=callback) + return self._origin._collections.create_writer( + self._origin.coltype, self._origin.colname, **kwargs) diff --git a/scrapinghub/client/exceptions.py b/scrapinghub/client/exceptions.py index d79b2eac..6a4b405c 100644 --- a/scrapinghub/client/exceptions.py +++ b/scrapinghub/client/exceptions.py @@ -5,7 +5,6 @@ from requests import HTTPError from ..legacy import APIError -from ..hubstorage import ValueTooLarge as _ValueTooLarge def _get_http_error_msg(exc): @@ -57,7 +56,7 @@ class ServerError(ScrapinghubAPIError): """Indicates some server error: something unexpected has happened.""" -def wrap_http_errors(method): +def _wrap_http_errors(method): """Internal helper to handle exceptions gracefully.""" @wraps(method) def wrapped(*args, **kwargs): @@ -92,14 +91,3 @@ def wrapped(*args, **kwargs): raise ServerError(http_error=exc) raise ScrapinghubAPIError(msg) return wrapped - - -def wrap_value_too_large(method): - """Internal wrapper for ValueTooLarge exception.""" - @wraps(method) - def wrapped(*args, **kwargs): - try: - return method(*args, **kwargs) - except _ValueTooLarge as exc: - raise ValueTooLarge(str(exc)) - return wrapped diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 052774ea..72f4edd4 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -7,7 +7,8 @@ from ..hubstorage.frontier import Frontier as _Frontier from ..hubstorage.utils import urlpathjoin -from .utils import _Proxy, update_kwargs +from .proxy import _Proxy +from .utils import update_kwargs class _HSFrontier(_Frontier): @@ -84,7 +85,6 @@ class Frontiers(_Proxy): """ def __init__(self, *args, **kwargs): super(Frontiers, self).__init__(*args, **kwargs) - self._proxy_methods(['close', 'flush']) def get(self, name): """Get a frontier by name. @@ -113,13 +113,17 @@ def list(self): @property def newcount(self): - """Amount of new entries added to all frontiers. - - :return: amount of new entries. - :rtype: :class:`int` - """ + """Integer amount of new entries added to all frontiers.""" return sum(self._origin.newcount.values()) + def flush(self): + """Flush data in all frontiers writer threads.""" + self._origin.flush() + + def close(self): + """Close frontier writer threads one-by-one.""" + self._origin.close() + class Frontier(object): """Representation of a frontier object. @@ -170,7 +174,7 @@ def iter(self): """Iterate through slots. :return: an iterator over frontier slots names. - :rtype: :class:`collections.Iterate[str]` + :rtype: :class:`collections.Iterable[str]` """ return iter(self.list()) @@ -191,11 +195,7 @@ def flush(self): @property def newcount(self): - """Amount of new entries added to frontier. - - :return: amount of new entries. - :rtype: :class:`int` - """ + """Integer amount of new entries added to frontier.""" newcount_values = self._frontiers._origin.newcount return sum(v for (frontier, _), v in newcount_values.items() if frontier == self.key) @@ -290,16 +290,13 @@ def flush(self): @property def newcount(self): - """Amount of new entries added to slot. - - :return: amount of new entries. 
- :rtype: :class:`int` - """ + """Integer amount of new entries added to slot.""" newcount_values = self._frontier._frontiers._origin.newcount return newcount_values.get((self._frontier.key, self.key), 0) class FrontierSlotFingerprints(object): + """Representation of request fingerprints collection stored in slot.""" def __init__(self, slot): self.key = slot.key @@ -342,6 +339,7 @@ def list(self, **params): class FrontierSlotQueue(object): + """Representation of request batches queue stored in slot.""" def __init__(self, slot): self.key = slot.key diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 45329e17..c3d5828a 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -1,14 +1,14 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin -class Items(_Proxy): +class Items(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job items. - Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instanc - e to get a :class:`Items` instance. - See :attr:`~scrapinghub.client.jobs.Job.items` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` + instance to get a :class:`Items` instance. See + :attr:`~scrapinghub.client.jobs.Job.items` attribute. Please note that :meth:`list` method can use a lot of memory and for a large amount of logs it's recommended to iterate through it via @@ -25,7 +25,7 @@ class Items(_Proxy): - iterate through first 100 items and print them:: >>> for log in job.logs.iter(count=100): - >>> ... print(log) + ... print(log) - retrieve items with timestamp greater or equal to given timestamp (item here is an arbitrary dictionary depending on your code):: diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index b72feed2..f6813976 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -11,9 +11,8 @@ from .requests import Requests from .samples import Samples from .exceptions import NotFound, BadRequest, DuplicateJobError -from .utils import ( - _MappingProxy, get_tags_for_update, parse_job_key, update_kwargs, -) +from .proxy import _MappingProxy +from .utils import get_tags_for_update, parse_job_key, update_kwargs class Jobs(object): @@ -25,7 +24,7 @@ class Jobs(object): and :attr:`scrapinghub.client.spiders.Spider.jobs` attributes. :ivar project_id: a string project id. - :ivar spider: :class:`Spider` object if defined. + :ivar spider: :class:`~scrapinghub.client.spiders.Spider` object if defined. Usage:: @@ -113,16 +112,16 @@ def iter(self, count=None, start=None, spider=None, state=None, >>> [job['key'] for job in jobs_summary] ['123/1/3', '123/1/2', '123/1/1'] - - job summary fieldset is less detailed than job.metadata but contains - few new fields as well. Additional fields can be requested using - ``meta`` parameter. If it's used, then it's up to the user to list - all the required fields, so only few default fields would be added - except requested ones:: + - job summary fieldset is less detailed than :class:`JobMeta` but + contains a few new fields as well. Additional fields can be requested + using ``meta`` parameter. If it's used, then it's up to the user to + list all the required fields, so only few default fields would be + added except requested ones:: >>> jobs_summary = project.jobs.iter(meta=['scheduled_by', ]) - by default :meth:`Jobs.iter` returns maximum last 1000 results. 
- Pagination is available using start parameter:: + Pagination is available using start parameter:: >>> jobs_summary = spider.jobs.iter(start=1000) @@ -228,13 +227,14 @@ def run(self, spider=None, units=None, priority=None, meta=None, return Job(self._client, response['jobid']) def get(self, job_key): - """Get a Job with a given job_key. + """Get a :class:`Job` with a given job_key. :param job_key: a string job key. job_key's project component should match the project used to get :class:`Jobs` instance, and job_key's spider component should match - the spider (if :attr:`Spider.jobs` was used). + the spider (if :class:`~scrapinghub.client.spiders.Spider` was used + to get :class:`Jobs` instance). :return: a job object. :rtype: :class:`Job` @@ -510,7 +510,7 @@ class JobMeta(_MappingProxy): """Class representing job metadata. Not a public constructor: use :class:`Job` instance to get a - :class:`JobMeta` instance. See :attr:`Job.metadata` attribute. + :class:`JobMeta` instance. See :attr:`~Job.metadata` attribute. Usage: @@ -540,7 +540,7 @@ class JobMeta(_MappingProxy): - update multiple meta fields at once - >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2}) + >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2'}) - delete meta field by name:: diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 6771607c..2c68d800 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -1,11 +1,13 @@ from __future__ import absolute_import + import json +import logging -from .utils import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin from .utils import LogLevel -class Logs(_Proxy): +class Logs(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job logs. Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance @@ -26,7 +28,7 @@ class Logs(_Proxy): - iterate through first 100 log entries and print them:: >>> for log in job.logs.iter(count=100): - >>> ... print(log) + ... print(log) - retrieve a single log entry from a job:: @@ -47,11 +49,36 @@ class Logs(_Proxy): 'time': 1486375511188, }] """ + def log(self, message, level=logging.INFO, ts=None, **other): + """Base method to write a log entry. + + :param message: a string message. + :param level: (optional) logging level, default to INFO. + :param ts: (optional) UNIX timestamp in milliseconds. + :param \*\*other: other optional kwargs. + """ + self._origin.log(message, level=level, ts=ts, **other) + + def debug(self, message, **other): + """Log a message with DEBUG level.""" + self._origin.debug(message, **other) + + def info(self, message, **other): + """Log a message with INFO level.""" + self._origin.info(message, **other) + + def warn(self, message, **other): + """Log a message with WARN level.""" + self._origin.warn(message, **other) + warning = warn + + def error(self, message, **other): + """Log a message with ERROR level.""" + self._origin.error(message, **other) - def __init__(self, *args, **kwargs): - super(Logs, self).__init__(*args, **kwargs) - self._proxy_methods(['log', 'debug', 'info', 'warning', 'warn', - 'error', 'batch_write_start']) + def batch_write_start(self): + """Override to set a start parameter when commencing writing.""" + return self._origin.batch_write_start() def _modify_iter_params(self, params): """Modify iter() filters on-the-fly. 
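The ``Logs`` methods above delegate to the hubstorage origin object. A usage
sketch (assuming ``job`` is a job instance and that levels follow the stdlib
``logging`` constants, as in ``scrapinghub.client.utils.LogLevel``)::

    >>> import logging
    >>> job.logs.log('custom message', level=logging.WARNING)
    >>> job.logs.info('all good')
    >>> job.logs.error('something went wrong')
    >>> job.logs.flush()
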
diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py
index cbb3e33f..35f93eae 100644
--- a/scrapinghub/client/projects.py
+++ b/scrapinghub/client/projects.py
@@ -8,8 +8,9 @@
 from .collections import Collections
 from .frontiers import _HSFrontier, Frontiers
 from .jobs import Jobs
+from .proxy import _MappingProxy
 from .spiders import Spiders
-from .utils import _MappingProxy, parse_project_id
+from .utils import parse_project_id
 
 
 class Projects(object):
diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py
new file mode 100644
index 00000000..6f247b0d
--- /dev/null
+++ b/scrapinghub/client/proxy.py
@@ -0,0 +1,185 @@
+from __future__ import absolute_import
+
+import six
+import json
+
+from ..hubstorage import ValueTooLarge as _ValueTooLarge
+from .utils import update_kwargs
+from .exceptions import ValueTooLarge
+
+
+class _Proxy(object):
+    """A helper to create a class instance and proxy its methods to origin.
+
+    The internal proxy class is useful to link class attributes from its
+    origin depending on the origin base class as a part of init logic:
+
+    - :class:`~scrapinghub.hubstorage.resourcetype.ItemsResourceType` provides
+      items-based attributes to access items in an arbitrary collection with
+      get/write/flush/close/stats/iter methods.
+
+    - :class:`~scrapinghub.hubstorage.resourcetype.DownloadableResource` provides
+      download-based attributes to iter through collection with or without
+      msgpack support.
+    """
+
+    def __init__(self, cls, client, key):
+        self.key = key
+        self._client = client
+        self._origin = cls(client._hsclient, key)
+
+    def list(self, *args, **kwargs):
+        """Convenient shortcut to list iter results.
+
+        Please note that :meth:`list` method can use a lot of memory and for a
+        large amount of elements it's recommended to iterate through it via
+        :meth:`iter` method (all params and available filters are same for both
+        methods).
+        """
+        return list(self.iter(*args, **kwargs))
+
+    def _modify_iter_params(self, params):
+        """A helper to modify iter*() params on-the-fly.
+
+        The method is internal and should be redefined in subclasses.
+
+        :param params: a dictionary with input parameters.
+        :return: an updated dictionary with parameters.
+        :rtype: :class:`dict`
+        """
+        return _format_iter_filters(params)
+
+
+class _ItemsResourceProxy(_Proxy):
+
+    def get(self, key, **params):
+        """Get element from collection.
+
+        :param key: element key.
+        :return: a dictionary with element data.
+        :rtype: :class:`dict`
+        """
+        return self._origin.get(key, **params)
+
+    def write(self, item):
+        """Write new element to collection.
+
+        :param item: element data dict to write.
+        """
+        try:
+            return self._origin.write(item)
+        except _ValueTooLarge as exc:
+            raise ValueTooLarge(str(exc))
+
+    def iter(self, _key=None, count=None, **params):
+        """Iterate over elements in collection.
+
+        :param count: limit amount of elements.
+        :return: a generator object over a list of element dictionaries.
+        :rtype: :class:`types.GeneratorType[dict]`
+        """
+        update_kwargs(params, count=count)
+        params = self._modify_iter_params(params)
+        return self._origin.list(_key, **params)
+
+    def flush(self):
+        """Flush data from writer threads."""
+        self._origin.flush()
+
+    def stats(self):
+        """Get resource stats.
+
+        :return: a dictionary with stats data.
+        :rtype: :class:`dict`
+        """
+        return self._origin.stats()
+
+    def close(self, block=True):
+        """Close writers one-by-one."""
+        self._origin.close(block)
+
+
+class _DownloadableProxyMixin(object):
+
+    def iter(self, _path=None, count=None, requests_params=None, **apiparams):
+        """A general method to iterate through elements.
+
+        :param count: limit amount of elements.
+        :return: an iterator over elements list.
+        :rtype: :class:`collections.Iterable`
+        """
+        update_kwargs(apiparams, count=count)
+        apiparams = self._modify_iter_params(apiparams)
+        return self._origin.iter_values(_path, requests_params, **apiparams)
+
+
+class _MappingProxy(_Proxy):
+    """A helper class to support basic get/set interface for dict-like
+    collections of elements.
+    """
+
+    def get(self, key):
+        """Get element value by key.
+
+        :param key: a string key
+        """
+        return next(self._origin.apiget(key))
+
+    def set(self, key, value):
+        """Set element value.
+
+        :param key: a string key
+        :param value: new value to set for the key
+        """
+        self._origin.apipost(key, data=json.dumps(value), is_idempotent=True)
+
+    def update(self, values):
+        """Update multiple elements at once.
+
+        The method provides convenient interface for partial updates.
+
+        :param values: a dictionary with key/values to update.
+        """
+        if not isinstance(values, dict):
+            raise TypeError("values should be a dict")
+        data = next(self._origin.apiget())
+        data.update(values)
+        self._origin.apipost(jl={k: v for k, v in six.iteritems(data)
+                                 if k not in self._origin.ignore_fields},
+                             is_idempotent=True)
+
+    def delete(self, key):
+        """Delete element by key.
+
+        :param key: a string key
+        """
+        self._origin.apidelete(key)
+
+    def iter(self):
+        """Iterate through key/value pairs.
+
+        :return: an iterator over key/value pairs.
+        :rtype: :class:`collections.Iterable`
+        """
+        return six.iteritems(next(self._origin.apiget()))
+
+
+def _format_iter_filters(params):
+    """Format iter() filter param on-the-fly.
+
+    Support passing multiple filters at once as a list with tuples.
+    """
+    filters = params.get('filter')
+    if filters and isinstance(filters, list):
+        filter_data = []
+        for elem in params.pop('filter'):
+            if isinstance(elem, six.string_types):
+                filter_data.append(elem)
+            elif isinstance(elem, (list, tuple)):
+                filter_data.append(json.dumps(elem))
+            else:
+                raise ValueError(
+                    "Filter condition must be string, tuple or list")
+        if filter_data:
+            params['filter'] = filter_data
+    return params
diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py
index e07cd36e..7f5428ef 100644
--- a/scrapinghub/client/requests.py
+++ b/scrapinghub/client/requests.py
@@ -1,9 +1,9 @@
 from __future__ import absolute_import
 
-from .utils import _Proxy
+from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin
 
 
-class Requests(_Proxy):
+class Requests(_ItemsResourceProxy, _DownloadableProxyMixin):
     """Representation of collection of job requests.
 
     Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance
@@ -41,6 +41,17 @@ class Requests(_Proxy):
         'url': 'https://example.com'
     }]
     """
-    def __init__(self, *args, **kwargs):
-        super(Requests, self).__init__(*args, **kwargs)
-        self._proxy_methods(['add'])
+    def add(self, url, status, method, rs, parent, duration, ts, fp=None):
+        """Add a new request.
+
+        :param url: string url for the request.
+        :param status: HTTP status of the request.
+        :param method: stringified request method.
+        :param rs: response body length.
+        :param parent: parent request id or ``None``.
+        :param duration: request duration in milliseconds.
+        :param ts: UNIX timestamp in milliseconds.
+        :param fp: (optional) string fingerprint for the request.
+        """
+        return self._origin.add(
+            url, status, method, rs, parent, duration, ts, fp=fp)
diff --git a/scrapinghub/client/samples.py b/scrapinghub/client/samples.py
index 828677d2..87a8e9bc 100644
--- a/scrapinghub/client/samples.py
+++ b/scrapinghub/client/samples.py
@@ -1,9 +1,9 @@
 from __future__ import absolute_import
 
-from .utils import _Proxy
+from .proxy import _ItemsResourceProxy
 
 
-class Samples(_Proxy):
+class Samples(_ItemsResourceProxy):
     """Representation of collection of job samples.
 
     Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance
diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py
index 1d8e4e81..1d665801 100644
--- a/scrapinghub/client/spiders.py
+++ b/scrapinghub/client/spiders.py
@@ -2,9 +2,8 @@
 
 from requests.compat import urljoin
 
+from .exceptions import NotFound, _wrap_http_errors
 from .jobs import Jobs
-from .exceptions import NotFound
-from .exceptions import wrap_http_errors
 from .utils import get_tags_for_update
 
 
@@ -104,7 +103,7 @@ def __init__(self, client, project_id, spider_id, spider):
         self.jobs = Jobs(client, project_id, self)
         self._client = client
 
-    @wrap_http_errors
+    @_wrap_http_errors
     def update_tags(self, add=None, remove=None):
         """Update tags for the spider.
 
@@ -118,7 +117,7 @@ def update_tags(self, add=None, remove=None):
         response = self._client._connection._session.patch(url, json=params)
         response.raise_for_status()
 
-    @wrap_http_errors
+    @_wrap_http_errors
     def list_tags(self):
         """List spider tags.
 
diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py
index 405eafde..78a51292 100644
--- a/scrapinghub/client/utils.py
+++ b/scrapinghub/client/utils.py
@@ -8,12 +8,6 @@
 
 import six
 
-from ..hubstorage.resourcetype import DownloadableResource
-from ..hubstorage.resourcetype import ItemsResourceType
-from ..hubstorage.collectionsrt import Collections
-
-from .exceptions import wrap_value_too_large
-
 
 class LogLevel(object):
     DEBUG = logging.DEBUG
@@ -84,174 +78,6 @@ def get_tags_for_update(**kwargs):
     return params
 
 
-class _Proxy(object):
-    """A helper to create a class instance and proxy its methods to origin.
-
-    The internal proxy class is useful to link class attributes from its
-    origin depending on the origin base class as a part of init logic:
-
-    - :class:`~scrapinghub.hubstorage.resourcetype.ItemsResourceType` provides
-      items-based attributes to access items in an arbitrary collection with
-      get/write/flush/close/stats/iter methods.
-
-    - :class:`~scrapinghub.hubstorage.resourcetype.DownloadableResource` provides
-      download-based attributes to iter through collection with or without
-      msgpack support.
- """ - - def __init__(self, cls, client, key): - self.key = key - self._client = client - self._origin = cls(client._hsclient, key) - - if issubclass(cls, ItemsResourceType): - self._proxy_methods(['get', 'write', 'flush', 'close', - 'stats', ('iter', 'list')]) - # redefine write method to wrap hubstorage.ValueTooLarge error - origin_method = getattr(self, 'write') - setattr(self, 'write', wrap_value_too_large(origin_method)) - - # DType iter_values() has more priority than IType list() - # plus Collections interface doesn't need the iter methods - if issubclass(cls, DownloadableResource) and cls is not Collections: - methods = [('iter', 'iter_values'), - ('iter_raw_msgpack', 'iter_msgpack'), - ('iter_raw_json', 'iter_json')] - self._proxy_methods(methods) - self._wrap_iter_methods([method[0] for method in methods]) - - def _proxy_methods(self, methods): - """A little helper for cleaner interface.""" - proxy_methods(self._origin, self, methods) - - def _wrap_iter_methods(self, methods): - """Modify kwargs for all passed self.iter* methods.""" - for method in methods: - wrapped = wrap_kwargs(getattr(self, method), - self._modify_iter_params) - setattr(self, method, wrapped) - - def _modify_iter_params(self, params): - """A helper to modify iter() params on-the-fly. - - The method is internal and should be redefined in subclasses. - - :param params: a dictionary with input parameters. - :return: an updated dictionary with parameters. - :rtype: :class:`dict` - """ - return format_iter_filters(params) - - def list(self, *args, **kwargs): - """Convenient shortcut to list iter results. - - Please note that :meth:`list` method can use a lot of memory and for a - large amount of elements it's recommended to iterate through it via - :meth:`iter` method (all params and available filters are same for both - methods). - """ - return list(self.iter(*args, **kwargs)) - - -class _MappingProxy(_Proxy): - """A helper class to support basic get/set interface for dict-like - collections of elements. - """ - - def get(self, key): - """Get element value by key. - - :param key: a string key - """ - return next(self._origin.apiget(key)) - - def set(self, key, value): - """Set element value. - - :param key: a string key - :param value: new value to set for the key - """ - self._origin.apipost(key, data=json.dumps(value), is_idempotent=True) - - def update(self, values): - """Update multiple elements at once. - - The method provides convenient interface for partial updates. - - :param values: a dictionary with key/values to update. - """ - if not isinstance(values, dict): - raise TypeError("values should be a dict") - data = next(self._origin.apiget()) - data.update(values) - self._origin.apipost(jl={k: v for k, v in six.iteritems(data) - if k not in self._origin.ignore_fields}, - is_idempotent=True) - - def delete(self, key): - """Delete element by key. - - :param key: a string key - """ - self._origin.apidelete(key) - - def iter(self): - """Iterate through key/value pairs. - - :return: an iterator over key/value pairs. - :rtype: :class:`collections.Iterable` - """ - return six.iteritems(next(self._origin.apiget())) - - -def wrap_kwargs(fn, kwargs_fn): - """Tiny wrapper to prepare modified version of function kwargs""" - def wrapped(*args, **kwargs): - kwargs = kwargs_fn(kwargs) - return fn(*args, **kwargs) - return wrapped - - -def proxy_methods(origin, successor, methods): - """A helper to proxy methods from origin to successor. 
- - Accepts a list with strings and tuples: - - - each string defines: - a successor method name to proxy 1:1 with origin method - - each tuple should consist of 2 strings: - a successor method name and an origin method name - """ - for method in methods: - if isinstance(method, tuple): - successor_name, origin_name = method - else: - successor_name, origin_name = method, method - if not hasattr(successor, successor_name): - setattr(successor, successor_name, getattr(origin, origin_name)) - - -def format_iter_filters(params): - """Format iter() filter param on-the-fly. - - Support passing multiple filters at once as a list with tuples. - """ - filters = params.get('filter') - if filters and isinstance(filters, list): - filter_data = [] - for elem in params.pop('filter'): - if isinstance(elem, six.string_types): - filter_data.append(elem) - elif isinstance(elem, (list, tuple)): - filter_data.append(json.dumps(elem)) - else: - raise ValueError( - "Filter condition must be string, tuple or list") - if filter_data: - params['filter'] = filter_data - return params - - def update_kwargs(kwargs, **params): """Update kwargs dict with non-empty params with json-encoded values.""" kwargs.update({k: json.dumps(v) if isinstance(v, dict) else v diff --git a/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz b/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz deleted file mode 100644 index bce71e44..00000000 --- a/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz +++ /dev/null @@ -1 +0,0 @@ -eJyll/1XE2cWx0HrG+tbt12VVbcBDY1K3g1ENCoQ3kQEJOhoO7pD8pAZCJncmQkvulGKFS2K5VWsrgiV2tquortlPSurnnOfX/av2f9gz9lnngSxLnvadZNMcu7NnTz3uffzvTPpXZaCLJuwNisrS4kbRJPChqLGdcgWYZktBcttwmr2lUb0BPMSeC8FK2zCKuaSiRQhmg4rU7DKJrzHPEHJILBahDXCdmZVakqhxbPXUidpFo/LXWxxe0u8/hKXy1JVF4IcSVjJgpqI1kk0+JUIa4VtzD7ZVFWT9jldDo+lvKHHkNW40+vY6yiCdRJf5oSk9cB6ETYI65lVGg6ThFFoKVfVdoXARklYYXpjMbUL3hfh18I6ZjbUN4UKLfUNoZr6Y03wgcS3W66y/cYNe6gnQeBDEX4jbGReKZGIKWHJLIKzTVfjsElK8mVb1EgPbE7BFp64bmhKPAq5wsfMOJ+vG5KR1PNL8tX2/ML8NrVFiTDDk3443U5fkS8/Bb/VM+eawbA1BdvSpewgui5FCWwXljGrvhZ+x1cMqxECH9X+I5nkURqBJNENsKQgzyZvFqzMl9DUNhI2ApmVCvSEwpoSkHW7wULtaRPyZdakHTZeitIkq6imnOM7hJ0iWAUPc5dJuhK2nO74pc9TRVAgCTnszGadaPbSKKskfCyCjfc+wdtm18OalGBlkpMtTo/DxZ4R0umCXZKw4XXn7BVxtk+zlrtF2MNTjJ5TEoWWCGmNmUAVpknJ9NcugkOoNYuja62G2k7inkBLUArWdRc1x6uUruLiU+XdSijS1tJVU+U2fDHDr4aPlEKoo12PNJeHaoqrG4/G1NZOzZWoIKQoFPM3gHMJIFwiuIW8t4Dotnd1ddlbVa3DntRixMycRMCTLgQ7PU64esArwl7uayckYZdiSicBn8R5XVjkKIlHDRmKRCjmXfcWgz+903RdYJ8IJcJyZu927ob9DMKVnBRW2Agc4HyYVEOAxyQ1BQ4KO01ZGkaixOn0eh3pl89d4nf5XU7WCKeWjDs404eSyRQctslMzaU2E44y21LJlYsQFLKZ3wUVkrA1k5uu2804TY3Zuczs9ZoSVeJQKUIVj94N1UsUtEaEI0Iu8xqk23AmYpIS328Jy5KmEyPQHKq0+6FWSspMYUflXIHFZUGdLjORHLPJr4VRL3M9yEwDDUwD4bNmB8J6TroVOdAo/JMFfoTXVmfty6PjwjL6Cr+hw3Qwh/a51mbRKTqPky105gBOXzToOD7Dr7bU4jR9hDP4iP7FRf9KH3fiBD4kZXQ4fOEgjuBQLo7AavwWb+LADjqNIzUVESfObyo8Jus4icPb/Hgd6FD71iC9H9+0wdGWU+vFV914bxd+W1aDj/F7P52nM+wYpQ9P0hsX6fTmA/QencPv2xP4N3xOn9IJfEpv0j/TvoPYT6dwkE7QV/TJ6T05eC0Pr0v0koqX8XN3aQ+dpZP7cYI+WIMT2bS/oYDVBY5zNJhYlLgbmvohdByazaaesL0t0JMiCL9AoKckmQ2G0+86GD6RZKblT5fQsijJTBpnZJ8kM/DPcnTdvn3we4nP3nCGF7IwEiQRWjjq5s9AWJKZKCIyE4P8hgKIfFA4sEi+bqgaG6aOzg7HG9tzhNUOZ0yN6s6fDOVDbBZrRsAFraYiolwRMleEsqQi2kRoT2dd7IZYOuuFkNeDrEOE+GLW6hJaSIgAQsF/DBdTmzElTvS3haH9vPp0EQy5Oi2gpJwrNwr/WlBC+oHjdKjOlMHGFQyxF/QK3kpalQv4AJ8y4IdxILymCB/tbMRhOoLzn0CYflZgMOi/pFfolXCl/wx9uMKHV+ksix9R6NjRw9l0uAT78HlolaMRp+i0IGVvD+B3eKfpDPbi/Y14uxMfH8Yn9DGDe5hO4uUTtA9/3Mo0Oap90IMP6CjL4wX7zR/Y6QPY25JNb9OXHpwJ4F2mihnad74IX+KzVGt+vICOnmFbeErHaN8OOrsK762kgxsEOoi3CnG8VDlxYcuuIN6ik3Qg2o
WPvKYwOmWmhq7j0M3nSM8bc+Tc4hw5z+bIMbPjf7CZcKZMOJlOLshpFVx8VxX0chV8Jouc2r4MtSY7VRUhuMSgPf3/QZvxWT2VbnYwP3v3FLArhBQ420564HMT6csc6X6O9JUlkb4qwhe8LPu8MPBfiL4mwvVFogeXIPqGCF/+L0QP/TzRwyKMLBA9yonu/ynRVjpQmoO9Fgb0EM7Tv+McnZZLGLIM5014KXgEp94PGvQZ9jEqntO71XjnU7yKTw514I9RnF3+Id7AuRz8Grw4GOvJX49/aqAzOMbCv6vG6SN0tpa+zGEXhjl8sQ7nogeceQE6htMN5qVpzERr/Djc5GhNvIHWrUW0vlpA6zZH604GrT9m0Lr7rmhNcrTuZdCaegutaYaW/q5ovcanoFWJsb8DAauvzOrxxEgniVlN2Mr3mB/eCqs3uGD7yva6rL4ge8HXySThN6vsDl4374Tu12YnHf8GcstbzQ== \ No newline at end of file diff --git a/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz b/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz deleted file mode 100644 index dc115941..00000000 --- a/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz +++ /dev/null @@ -1 +0,0 @@ -eJyllv1TFGcSx5GoIfieixoNXlaSxT1l32UhGEyQd1F5W5LRc9Rh92FnYNnZnpflRTeGGGIiEhNEPSWK5IQ7zouVGMvEnFWp6qeu6v6i+/l6ngVBi1Qsb3Z2t7q3Z55+uj/f3vk4Pwt5HmltXl6elrKYocQsTU+ZsEKGfE8WXvJIBfSTwcw0eRmszMIqj/QyuVSmxJlhwuosvOyRVpKnVrEYFMjwirSDrHpDK3WF9roOK4YrFAiWu4LhynBFZbDM1XA4CoWKtJqCOpiRYQaskWGtVET2hx0NTTmfP+ALuWpaBy1VT/nDvr2+CKxTxDIfKMYgrJdhg7SerOpYjKWtUleNrvdqDDYq0irHm0zq/bBJhleldWS2tnRES10trdGmliMd8AdFbLdGp/2mLG90MM3gNRk2SxvJq6TTSS2mOEXw95h6CrYotli2S48PwtYsvC4SNy1DSyVgm7SLjNPFpqVYtllcWaz3FpcW9+hdWpyMUO7wB/1lkUhxFrab89c6wfBGFopypexjpqkkGOyQ8slqaYY/ihVjepzBm83/sW0RZTCwmWmBKws7PepWyU2+tKH3sJhVNb9SiZnWqClVqum1KNSbM6FYpSa95RGlqLapooY2JHYIb8vglkLkPqCYWsx1rO95X0cjUKJIhXRlp8kMb3WCKgm7ZPCI3qdF27xmzFDSVCbV7vKHfAF6xVkmAH9SpA1POuetS9E+nVrulmGPSDExpKVLXXHWnXSAKs2RMt9frww+qdkpjml0W3ovS4WqumqV2sMDkc5Ug9ZfXn60ZkCLxnu6+psaglZZ0qrQYwerIdrXa8Y7a6JN5Y1th5J6d8YIpOsYi0STFa3gXwaIgAxBaeczQAx4+/v7vd260ee1jSRzMmdxCOUKQZenmFAPhGXYK3y9jKW9SlLLMChTBK8LixxiqYSlQkSGctH1cDlU5Haaqwu8I0Ol9BLZu/27YR9BuFqQQoWNw7uCD4dqqBIxtqHBfultR5aWla70+8NhX+4sC1ZWBCoCfmqE37BTPsH0e7adhfc9Kqm52uPAccCzXHI1MtRKK8gfgDpFemM+N9P0OnGGnvQKmXlbDC2hpaBehgYRvRsalylokwwHpW3ktdiA5U8nFS21zxVTFcNkVlVntN5bAc2KrZLCDqnbJIrLg8OmSiI54lGfCKNFFXpQSQOtpIHYSacDMbMw14pCaJP+S4Fv4mhBXuNOfkXK57/iDP+ajxXyc661efz2aZzCb/kdfIzD2/ld/k9+C2djeL2gB89n+CRO8wf8Jr92gj98i3+3McMv8ksNr4b4J6eq+DD+wGfxPsO/46c9OFnPp/He6oZ+wzyB37/Dv8eJ9gRejTTyqyd38O+yoSKc7TnDf8RzfHwtPuIThfamrXzqTLaST9HyP/MRPrPhI5zll/lYKf6ykv8N7/LJNJ9r9nbzh5TxyCm69BGex2vH8PpRvIA/4CU/f9z2umcdfsWn8F8U9SXODeWfLcGRTAnVB9oFIiQaLRWEjs8g2g6dTnM/8Dwr1A9lkJ5DqEcVlQbEsRcdEH9WVNL08WU0LSsqSeSEWqaoJICTAuFgJAynFDGDY/PcsIXRoMjQJZB3bgMxRSVxxFUShbpECUzdL727qADT0g0aqr5Mn2/J9nwxvc+f1BOm/6nh/B7NZMOqCkC3o4yEUIYqlKEtq4weGXpzWYeDkMxlvRDyZKD1yZBazFpfRhNpGUCo9ukh02cm0kqs91l5GL+vQVMGS23MychWt6lt0r8X9JA7ek+04zhex8khvIXDOBzEGwT9j2f5T3wcv+ETeJ9PFOE9PoOPcI4gHcEv+BzO4V0cDZtrjHbbxFF+Gcdwsi3KH+JtnDtOdxvHG1G8sb4dP+/EmwWH+E28wC+Sxh6coXte4Hf47S14bdcWPsP4LJ+ObMR7xqpG9xG2Pc1/wpktm/nMVpzGf6zBXzfh433r+Vi2+WA7jjbn0boPPHhlM40VyKjEdH87DIipMLhkKgwtToXTNBWOOH0743EQyzqIEe0fqTmWz74oyx8LlodVWbD3ibT5t5oG5wSVDhsNdVH4lKA89v9BOe9zh+qD9CY/fYZK6J9AqTrZywZhxEH2M4HseYHs58si+4UMF0TBIgEY/Q1iL8owtkjsl8sQe0mGr56f2K9/n9hxGS4vEDshiD3+nMReWkrs2LPE7unAUaLwPo5aAtkV+4v4yDqP87dyxQHpajtcEyD9ZQlI1xdBurEA0qQA6Zt5kG7Og3TrRUGaEiDdngdpWn0al28JF/NFcXmCREm3lqRH+Sp32QF3KJRkGZZ0OwDV7HG+wnXucO2CXXZgb8BdVksn/NW2mXjQpKdv03mKudO8wvb9D/2gOpI= \ No newline at end of file diff --git a/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz b/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz deleted file mode 100644 index 85e51ebd..00000000 --- a/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz +++ /dev/null @@ -1 +0,0 @@ 
-eJyllvt3FFUSx4cgD2PAALKIBpgMCEPMPPMaw0bJOyGEBDLBDtILnZmbdCeT7ql+5AGOxABRBOUVyHFXObIGc6K7sj4OKiCeU/WLf9ZW94THYs7R407PD1M11ffWrfp+qnuyIAe+oFTk8/k03RamkrI1Q7dgmQwFwRwsD0qr+S9TWFn2CngmByuC0ip2qUJJC9OClTlYFZSeYU+TYgtYLcOz0ha2Wkyt3B+v9Hcqpj8ejdX4YxW1Fa/VxmL+1s4kFCrSSg7qEeaoMOE5GYqkErbf7Gltz/si0XDc39g9YauGHqkIV4arYY3ibXNYMSdgrQzPS2vZqk+lRNYu9zcaxrAmoFiRVrjeTMYYg3UyrJfWsNnd1ZMs93d1J9u7DvTABsU7bqPB59XtUHIiK+AFGTZKxexVstmMllLcIkSGLEOHvyiOt22/kZ6ATTl40Uvcsk1NH4TN0i42TgYsW7EdK1AbMIYD5YEho19LsxHPfyKxSFUiEcjBS9bivW4wvJyDknwpR4RlKYMCtkgFbHV1wFZvx5SRFrCt41fH8aJMAY6wbPDnoDSobpJeYV/WNIZEyq5b3GmnldW4KXWqFbI5NJQ3IaByk7YHvVLUO1xRUzvhnRB2yPCKFGd3g2JpKf+RkT969VXDTkUq5Dt7LWGG6ge5krBLhqDX+6zXtpCVMpUsl0l1+iPxcJSvtBiNwm5Fev5R50LNOp/TrWWZDK96KQ6e0LLl/rQYyLiCKs8rZbG/IRnCUodbHMscsI1hocfr+puUps7x6l69VRurqelrHNeS6aH+sfbWmF2VsRNGal89JEeGrXRvY7K9pu3g/owxMGpGs81CVCcziW6ILCGIqAwxqfQpQYyHxsbGQgOGORJyzIxwMxdpiOcLwbfrwqMHKmSo9HzDQmRDSkYbFVCleHp9uMl+oQ/aKlTLUON1vaIGEvmT5usCr8lQKy1nuyxSBntYhCs9pXBh0/BXTx+uqqHOi3FMDV6XdrhY2na2NhKpqAjnv1Wx2kQ0EY1wIyKmo4c9Tb/hODnYG1SZ5vqgK46G4FLJNcrQJC1jfxSaFenlxdwsK+TGmUYm5GEW6jK1QU2HFhlavegyaFuioO0y7JM2s9cW43Ykm1E0fY8/pSqmJey63mRLKAEdiqMyYfvVzRLH+aDTUhmSA0H1ERhdqseDygx0MwOpY24HUlZhvhWFcFB6jjPYhudX+3CylK5JBfQLXqOPOtbiVLR4JX2GP+O8H7+rjuMt9s/SDfyRZprqfHj5pTa1VODdAN2gT2mhBc/tw3/iPfqJ5vGaUTCEH+J0fH1Zh4UzDXQLF+gCza4r4jWu4vXx4zmaBbqDX9PnvjV0uZumY7SAl6vdDegMTW6vwemTuyvwa4NO01c454by6nfwB/qEPjZLcCGw7fCGJvyopYeu8k03af6dMZyme0N0d+du+mL90YEdXavC2+lMCd6kG8X0Hp3W2vnnf1rxLN1P4mQbzuHUqRKcpclxH02KU/Q5znIp4JCnGyZJ02PQMw3JQ9Drdvxw8Gl635RB+gP09ikqT40jf3ZqvKWoDPrRJUCXFZW5+ZtapahMxTFP17GaBBxXvMGcWhSTeDgvFBn6PQ7cZSClqExMWmVS1CfwEOrr0t7HWFi2YfKkDY+OhJ84XjhljEQWx6sV+Z+p/QYPa9Oui8KAi8ygh4zqIaMticyQDMP5zBNVkMln/jDk0aQbkUF/nLmxBCxZGUDa+Zvp48Kb0XRhPU2O+ft4WjLYalueMEfdrB6Utj5EJf9xOaHzhTi19wUGJYcX6Ru8je/RF/jlAfwOL+L1FXStdscunKFvcWrrplJ8cHL5uo1ButxHD2ie7qxJyPTzlrX0Zd9R+pbePxKmyb3NtEA3ca69/1X8vrKaruP1UvqK/oEz+AHexSk8/2LtPobpNpM5U1zEUM7jfU5jCm/Rh3S3GO9w7KUO/PcGnMYHDO/Zyl1DJbzZ3J4jeINRupI6WsQI/kK3aI5ub2zEq0zVg0CCvqFzeMlghn7qp3/xMvdwtpEulNIM3m/o3I0X6Me3gO7jObqk0Gd0Bc/gdJL+fvxtH10cxnN4Fq+49IyqjMzYIRj3JtHEE5PoxONJdJIn0QFXEm8HXQXnXAUzTO+oeVRO/VlUJj1U3lVlT9pTi9J2xdXanITTrOyG/0fZ/CxR6o4Niwk44zjCe8Xg9y7LfX6d7VjmhP8LWciWDQ== \ No newline at end of file diff --git a/tests/client/test_items.py b/tests/client/test_items.py index 38af756a..1dfaeaf1 100644 --- a/tests/client/test_items.py +++ b/tests/client/test_items.py @@ -1,10 +1,6 @@ -import json - import pytest from six.moves import range -from scrapinghub.hubstorage.serialization import mpdecode - def _add_test_items(job): for i in range(3): @@ -29,18 +25,6 @@ def test_items_iter(spider): with pytest.raises(StopIteration): next(o) - o = job.items.iter_raw_json(offset=2) - item = json.loads(next(o)) - assert item['id'] == 2 - assert item['data'] == 'data2' - with pytest.raises(StopIteration): - next(o) - - msgpacked_o = job.items.iter_raw_msgpack(offset=2) - o = mpdecode(msgpacked_o) - assert item['id'] == 2 - assert item['data'] == 'data2' - def test_items_list(spider): job = spider.jobs.run(meta={'state': 'running'}) diff --git a/tests/client/test_logs.py b/tests/client/test_logs.py index 52b42ec3..88cb4b85 100644 --- a/tests/client/test_logs.py +++ b/tests/client/test_logs.py @@ -1,11 +1,9 @@ -import json import types from numbers import Integral import pytest from scrapinghub.client.utils import LogLevel -from scrapinghub.hubstorage.serialization import mpdecode from .conftest import TEST_TS @@ -103,35 +101,3 @@ def test_logs_list_filter(spider): logs3 = 
job.logs.list(filter=[('message', 'contains', ['simple'])]) assert len(logs3) == 3 - - -def test_logs_iter_raw_json(spider): - job = spider.jobs.run() - _add_test_logs(job) - - logs0 = job.logs.iter_raw_json(offset=2) - raw_log0 = next(logs0) - log0 = json.loads(raw_log0) - assert log0.get('message') == 'simple-msg3' - assert log0.get('_key') - assert isinstance(log0.get('time'), Integral) - assert log0.get('level') == 10 - - logs1 = job.logs.iter_raw_json(level='ERROR') - raw_log1 = next(logs1) - log1 = json.loads(raw_log1) - assert log1.get('message') == 'error-msg' - - -def test_logs_iter_raw_msgpack(spider): - job = spider.jobs.run() - _add_test_logs(job) - - logs1 = job.logs.iter_raw_msgpack(offset=2) - assert isinstance(logs1, types.GeneratorType) - unpacked_logs1 = list(mpdecode(logs1)) - assert unpacked_logs1[0].get('message') == 'simple-msg3' - - logs2 = job.logs.iter_raw_msgpack(level='ERROR') - unpacked_logs2 = list(mpdecode(logs2)) - assert unpacked_logs2[0].get('message') == 'error-msg' diff --git a/tests/client/test_proxy.py b/tests/client/test_proxy.py new file mode 100644 index 00000000..7fd4f272 --- /dev/null +++ b/tests/client/test_proxy.py @@ -0,0 +1,38 @@ +import pytest + +from scrapinghub.client.proxy import _format_iter_filters + + +def test_format_iter_filters(): + # work with empty params + assert _format_iter_filters({}) == {} + + # doesn't affect other params + params = {'a': 123, 'b': 456} + assert _format_iter_filters(params) == params + + # pass filter as-is if not list + params = {'filter': 'some-string'} + assert _format_iter_filters(params) == params + + # work fine with empty filter + params = {'filter': []} + assert _format_iter_filters(params) == params + + # pass string filters as-is + params = {'filter': ['str1', 'str2']} + assert _format_iter_filters(params) == params + + # converts list-formatted filters + params = {'filter': [['field', '>=', ['val']], 'filter2']} + assert (_format_iter_filters(params) == + {'filter': ['["field", ">=", ["val"]]', 'filter2']}) + + # works the same with tuple entries + params = {'filter': [('field', '==', ['val'])]} + assert (_format_iter_filters(params) == + {'filter': ['["field", "==", ["val"]]']}) + + # exception if entry is not list/tuple or string + with pytest.raises(ValueError): + _format_iter_filters({'filter': ['test', 123]}) diff --git a/tests/client/test_requests.py b/tests/client/test_requests.py index 1d2e3bca..a71b3820 100644 --- a/tests/client/test_requests.py +++ b/tests/client/test_requests.py @@ -1,5 +1,3 @@ -import json - import pytest from .conftest import TEST_TS @@ -39,18 +37,3 @@ def test_requests_iter(spider): } with pytest.raises(StopIteration): next(rr) - - -def test_requests_iter_raw_json(spider): - job = spider.jobs.run() - _add_test_requests(job) - job.requests.close() - - rr = job.requests.iter_raw_json() - raw_req = next(rr) - req = json.loads(raw_req) - assert req.get('url') == 'http://test.com/' - assert req.get('status') == 200 - next(rr), next(rr) - with pytest.raises(StopIteration): - next(rr) diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 03e4362c..f109894c 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -5,42 +5,6 @@ import mock from scrapinghub.client.utils import parse_auth -from scrapinghub.client.utils import format_iter_filters - - -def test_format_iter_filters(): - # work with empty params - assert format_iter_filters({}) == {} - - # doesn't affect other params - params = {'a': 123, 'b': 456} - assert 
format_iter_filters(params) == params - - # pass filter as-is if not list - params = {'filter': 'some-string'} - assert format_iter_filters(params) == params - - # work fine with empty filter - params = {'filter': []} - assert format_iter_filters(params) == params - - # pass string filters as-is - params = {'filter': ['str1', 'str2']} - assert format_iter_filters(params) == params - - # converts list-formatted filters - params = {'filter': [['field', '>=', ['val']], 'filter2']} - assert (format_iter_filters(params) == - {'filter': ['["field", ">=", ["val"]]', 'filter2']}) - - # works the same with tuple entries - params = {'filter': [('field', '==', ['val'])]} - assert (format_iter_filters(params) == - {'filter': ['["field", "==", ["val"]]']}) - - # exception if entry is not list/tuple or string - with pytest.raises(ValueError): - format_iter_filters({'filter': ['test', 123]}) def test_parse_auth_none():