From b6c86ff3da99e20ed6a5205674ffc95ee15e05a9 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 14 Mar 2017 12:31:36 +0300 Subject: [PATCH 1/2] Improve docstrings, add kwargs hints, unification Add return types in docstrings Add hints for kwargs, fix docstrings Use update_kwargs helper to unify logic Rename spider_args -> job_args Unify spider param for different methods Don't return count from job.update_tags --- README.rst | 6 +- scrapinghub/client/__init__.py | 27 ++- scrapinghub/client/activity.py | 9 +- scrapinghub/client/collections.py | 152 +++++++++++---- scrapinghub/client/frontiers.py | 166 ++++++++++++---- scrapinghub/client/items.py | 9 +- scrapinghub/client/jobs.py | 312 +++++++++++++++++++----------- scrapinghub/client/logs.py | 11 +- scrapinghub/client/projects.py | 71 +++---- scrapinghub/client/requests.py | 14 +- scrapinghub/client/spiders.py | 64 +++--- scrapinghub/client/utils.py | 83 ++++++-- tests/client/test_job.py | 21 +- tests/client/test_projects.py | 2 +- tests/client/test_spiders.py | 9 +- 15 files changed, 643 insertions(+), 313 deletions(-) diff --git a/README.rst b/README.rst index d6b5b595..7974d77d 100644 --- a/README.rst +++ b/README.rst @@ -88,7 +88,7 @@ Jobs instance is described well in ``Jobs`` section below. For example, to schedule a spider run (it returns a job object):: - >>> project.jobs.schedule('spider1', spider_args={'arg1':'val1'}) + >>> project.jobs.schedule('spider1', job_args={'arg1':'val1'}) > Project instance also has the following fields: @@ -151,7 +151,7 @@ Like project instance, spider instance has ``jobs`` field to work with the spide To schedule a spider run:: - >>> spider.jobs.schedule(spider_args={'arg1:'val1'}) + >>> spider.jobs.schedule(job_args={'arg1:'val1'}) > Note that you don't need to specify spider name explicitly. 
@@ -750,7 +750,7 @@ To see last jobs summaries:: To get job summary per spider:: - >>> summary = project.spiders.lastjobsummary(spiderid='1') + >>> summary = project.spiders.lastjobsummary(spider_id='1') Job --- diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index a0201aad..b201b12b 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -51,44 +51,43 @@ def __init__(self, auth=None, dash_endpoint=None, **kwargs): url=dash_endpoint) self._hsclient = HubstorageClient(auth=(login, password), **kwargs) - def get_project(self, projectid): + def get_project(self, project_id): """Get :class:`Project` instance with a given project id. The method is a shortcut for client.projects.get(). - :param projectid: integer or string numeric project id. + :param project_id: integer or string numeric project id. :return: :class:`Project` object. - :rtype: scrapinghub.client.Project. + :rtype: scrapinghub.client.projects.Project Usage:: >>> project = client.get_project(123) >>> project - + """ - return self.projects.get(parse_project_id(projectid)) + return self.projects.get(parse_project_id(project_id)) - def get_job(self, jobkey): - """Get Job with a given jobkey. + def get_job(self, job_key): + """Get Job with a given job key. - :param jobkey: job key string in format 'project/spider/job', + :param job_key: job key string in format 'project_id/spider_id/job_id', where all the components are integers. :return: :class:`Job` object. - :rtype: scrapinghub.client.Job. + :rtype: scrapinghub.client.jobs.Job Usage:: >>> job = client.get_job('123/1/1') >>> job - + """ - projectid = parse_job_key(jobkey).projectid - return self.projects.get(projectid).jobs.get(jobkey) + project_id = parse_job_key(job_key).project_id + return self.projects.get(project_id).jobs.get(job_key) def close(self, timeout=None): """Close client instance. - :param timeout: (optional) float timeout secs to stop everything - gracefully. 
+ :param timeout: (optional) float timeout secs to stop gracefully. """ self._hsclient.close(timeout=timeout) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index e561b23f..40f02fc1 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -49,12 +49,17 @@ def __init__(self, *args, **kwargs): self._wrap_iter_methods(['iter']) def add(self, values, **kwargs): + """Add new event to the project activity. + + :param values: a single event or a list of events, where event is + represented with a dictionary of ('event', 'job', 'user') keys. + """ if not isinstance(values, list): values = list(values) for activity in values: if not isinstance(activity, dict): raise ValueError("Please pass events as dictionaries") - jobkey = activity.get('job') - if jobkey and parse_job_key(jobkey).projectid != self.key: + job_key = activity.get('job') + if job_key and parse_job_key(job_key).project_id != self.key: raise ValueError('Please use same project id') self._origin.post(values, **kwargs) diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index aa56f0a0..a2d5e22f 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,10 +5,9 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .utils import _Proxy -from .utils import format_iter_filters -from .utils import proxy_methods -from .utils import wrap_kwargs +from .utils import ( + _Proxy, format_iter_filters, proxy_methods, wrap_kwargs, update_kwargs, +) class Collections(_Proxy): @@ -25,29 +24,75 @@ class Collections(_Proxy): >>> foo_store = collections.get_store('foo_store') """ - def get(self, coltype, colname): - """Base method to get a collection with a given type and name.""" - self._origin._validate_collection(coltype, colname) - return Collection(self._client, self, coltype, colname) + def get(self, type_, name): + """Base method to get a collection with a given type and name. 
- def get_store(self, colname): - return self.get('s', colname) + :param type_: a collection type string. + :param name: a collection name string. + :return: :class:`Collection` object. + :rtype: Collection + """ + self._origin._validate_collection(type_, name) + return Collection(self._client, self, type_, name) + + def get_store(self, name): + """Method to get a store collection by name. + + :param name: a collection name string. + :return: :class:`Collection` object. + :rtype: Collection + """ + return self.get('s', name) + + def get_cached_store(self, name): + """Method to get a cashed-store collection by name. + + The collection type means that items expire after a month. + + :param name: a collection name string. + :return: :class:`Collection` object. + :rtype: Collection + """ + return self.get('cs', name) + + def get_versioned_store(self, name): + """Method to get a versioned-store collection by name. + + The collection type retains up to 3 copies of each item. + + :param name: a collection name string. + :return: :class:`Collection` object. + :rtype: Collection + """ + return self.get('vs', name) - def get_cached_store(self, colname): - return self.get('cs', colname) + def get_versioned_cached_store(self, name): + """Method to get a versioned-cached-store collection by name. - def get_versioned_store(self, colname): - return self.get('vs', colname) + Multiple copies are retained, and each one expires after a month. - def get_versioned_cached_store(self, colname): - return self.get('vcs', colname) + :param name: a collection name string. + :return: :class:`Collection` object. + :rtype: Collection + """ + return self.get('vcs', name) def iter(self): - """Iterate through collections of a project.""" + """Iterate through collections of a project. + + :return: an iterator over collections list where each collection is + represented by a dictionary with ('name','type') fields. 
+ :rtype: collections.Iterable[dict] + """ return self._origin.apiget('list') def list(self): - """List collections of a project.""" + """List collections of a project. + + :return: a list of collections where each collection is + represented by a dictionary with ('name','type') fields. + :rtype: list[dict] + """ return list(self.iter()) @@ -56,7 +101,7 @@ class Collection(object): Not a public constructor: use :class:`Collections` instance to get a :class:`Collection` instance. See :meth:`Collections.get_store` and - similar methods. # noqa + similar methods. Usage: @@ -84,8 +129,7 @@ class Collection(object): >>> for elem in foo_store.iter(count=1)): >>> ... print(elem) - [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', - 'value': '1447221694537'}] + [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}] - filter by multiple keys, only values for keys that exist will be returned:: @@ -97,9 +141,9 @@ class Collection(object): >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') """ - def __init__(self, client, collections, coltype, colname): + def __init__(self, client, collections, type_, name): self._client = client - self._origin = _Collection(coltype, colname, collections._origin) + self._origin = _Collection(type_, name, collections._origin) proxy_methods(self._origin, self, [ 'create_writer', 'count', ('iter', 'iter_values'), @@ -111,35 +155,58 @@ def __init__(self, client, collections, coltype, colname): wrapped = wrap_kwargs(getattr(self, method), format_iter_filters) setattr(self, method, wrapped) - def list(self, *args, **kwargs): + def list(self, key=None, prefix=None, prefixcount=None, startts=None, + endts=None, requests_params=None, **params): """Convenient shortcut to list iter results. Please note that list() method can use a lot of memory and for a large amount of elements it's recommended to iterate through it via iter() method (all params and available filters are same for both methods). 
+ + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. + :return: a list of items where each item is represented with a dict. + :rtype: list[dict] + + # FIXME there should be similar docstrings for iter/iter_raw_json + # but as we proxy them as-is, it's not in place, should be improved """ - return list(self.iter(*args, **kwargs)) + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts, + requests_params=requests_params) + return list(self.iter(requests_params=None, **params)) - def get(self, key, *args, **kwargs): + def get(self, key, **params): """Get item from collection by key. - :param key: string item key - :return: an item dictionary if exists + :param key: string item key. + :param \*\*params: (optional) additional query params for the request. + :return: an item dictionary if exists. + :rtype: dict """ if key is None: raise ValueError("key cannot be None") - return self._origin.get(key, *args, **kwargs) + return self._origin.get(key, **params) - def set(self, *args, **kwargs): + def set(self, value): """Set item to collection by key. + :param value: a dict representing a collection item. + The method returns None (original method returns an empty generator). """ - self._origin.set(*args, **kwargs) + self._origin.set(value) def delete(self, keys): """Delete item(s) from collection by key(s). + :param keys: a single key or a list of keys. + The method returns None (original method returns an empty generator). 
""" if (not isinstance(keys, string_types) and @@ -148,7 +215,24 @@ def delete(self, keys): "object providing string keys") self._origin.delete(keys) - def iter_raw_msgpack(self, requests_params=None, **apiparams): + def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, + startts=None, endts=None, requests_params=None, + **params): + """A method to iterate through raw msgpack-ed items. + Can be convenient if data is needed in same msgpack format. + + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. + :return: an iterator over items list packed with msgpack. + :rtype: collections.Iterable[bytes] + """ + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts, + requests_params=requests_params) return self._origin._collections.iter_msgpack( - self._origin.coltype, self._origin.colname, - requests_params=requests_params, **apiparams) + self._origin.coltype, self._origin.colname, **params) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 684e1743..bf2b460b 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -7,7 +7,7 @@ from ..hubstorage.frontier import Frontier as _Frontier from ..hubstorage.utils import urlpathjoin -from .utils import _Proxy +from .utils import _Proxy, update_kwargs class _HSFrontier(_Frontier): @@ -18,6 +18,12 @@ def __init__(self, *args, **kwargs): self.newcount = defaultdict(int) def _get_writer(self, frontier, slot): + """Modified helper method to create a batchuploader writer with updated + callback to write newcount data per slot. 
+ + :return: a batchuploader writer instance. + :rtype: scrapinghub.hubstorage.batchuploader._BatchWriter + """ key = (frontier, slot) writer = self._writers.get(key) if not writer: @@ -47,25 +53,31 @@ class Frontiers(_Proxy): Usage: - get all frontiers from a project:: + >>> project.frontiers.iter() - - list all frontiers + - list all frontiers:: + >>> project.frontiers.list() ['test', 'test1', 'test2'] - - get a frontier by name + - get a frontier by name:: + >>> project.frontiers.get('test') - + + + - flush data of all frontiers of a project:: - - flush data of all frontiers of a project >>> project.frontiers.flush() - - show amount of new requests added for all frontiers + - show amount of new requests added for all frontiers:: + >>> project.frontiers.newcount 3 - - close batch writers of all frontiers of a project + - close batch writers of all frontiers of a project:: + >>> project.frontiers.close() """ def __init__(self, *args, **kwargs): @@ -73,15 +85,28 @@ def __init__(self, *args, **kwargs): self._proxy_methods(['close', 'flush']) def get(self, name): - """Get a frontier by name.""" + """Get a frontier by name. + + :param name: a frontier name string. + :return: class:`Frontier` instance. + :rtype: Frontier + """ return Frontier(self._client, self, name) def iter(self): - """Iterate through frontiers.""" + """Iterate through frontiers. + + :return: an iterator over frontiers names. + :rtype: collections.Iterable[str] + """ return iter(self.list()) def list(self): - """List frontiers.""" + """List frontiers names. + + :return: a list of frontiers names. 
+ :rtype: list[str] + """ return next(self._origin.apiget('list')) @property @@ -97,22 +122,27 @@ class Frontier(object): Usage: - - get iterator with all slots + - get iterator with all slots:: + >>> frontier.iter() - - list all slots + - list all slots:: + >>> frontier.list() ['example.com', 'example.com2'] - - get a slot by name + - get a slot by name:: + >>> frontier.get('example.com') - + + + - flush frontier data:: - - flush frontier data >>> frontier.flush() - - show amount of new requests added to frontier + - show amount of new requests added to frontier:: + >>> frontier.newcount 3 """ @@ -122,15 +152,27 @@ def __init__(self, client, frontiers, name): self._frontiers = frontiers def get(self, slot): - """Get a slot by name.""" + """Get a slot by name. + + :return: class:`FrontierSlot` instance. + :rtype: FrontierSlot + """ return FrontierSlot(self._client, self, slot) def iter(self): - """Iterate through slots.""" + """Iterate through slots. + + :return: an iterator over frontier slots names. + :rtype: collections.Iterate[str] + """ return iter(self.list()) def list(self): - """List all slots.""" + """List all slots. + + :return: a list of frontier slots names. 
+ :rtype: list[str] + """ return next(self._frontiers._origin.apiget((self.key, 'list'))) def flush(self): @@ -155,39 +197,46 @@ class FrontierSlot(object): Usage: - - add request to a queue + - add request to a queue:: + >>> data = [{'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}] >>> slot.q.add('example.com', data) - - add fingerprints to a slot + - add fingerprints to a slot:: + >>> slot.f.add(['fp1', 'fp2']) - - flush data for a slot + - flush data for a slot:: + >>> slot.flush() - - show amount of new requests added to a slot + - show amount of new requests added to a slot:: + >>> slot.newcount 2 - - read requests from a slot + - read requests from a slot:: + >>> slot.q.iter() >>> slot.q.list() [{'id': '0115a8579633600006', 'requests': [['page1.html', {'depth': 1}]]}] - - read fingerprints from a slot + - read fingerprints from a slot:: + >>> slot.f.iter() >>> slot.f.list() ['page1.html'] - - delete a batch with requests from a slot + - delete a batch with requests from a slot:: + >>> slot.q.delete('0115a8579633600006') - - delete a whole slot - >>> slot.delete() + - delete a whole slot:: + >>> slot.delete() """ def __init__(self, client, frontier, slot): self.key = slot @@ -198,10 +247,20 @@ def __init__(self, client, frontier, slot): @property def f(self): + """Shortcut to have quick access to slot fingerprints. + + :return: class:`FrontierSlotFingerprints` instance. + :rtype: FrontierSlotFingerprints + """ return self.fingerprints @property def q(self): + """Shortcut to have quick access to a slot queue. + + :return: class:`FrontierSlotQueue` instance. + :rtype: FrontierSlotQueue + """ return self.queue def delete(self): @@ -239,16 +298,26 @@ def add(self, fps): for fp in fps: writer.write({'fp': fp}) - def iter(self, **kwargs): - """Iterate through fingerprints in the slot.""" + def iter(self, **params): + """Iterate through fingerprints in the slot. + + :param \*\*params: (optional) additional query params for the request. 
+ :return: an iterator over fingerprints. + :rtype: collections.Iterable[str] + """ origin = self._frontier._frontiers._origin path = (self._frontier.key, 's', self.key, 'f') - for fp in origin.apiget(path, params=kwargs): + for fp in origin.apiget(path, params=params): yield fp.get('fp') - def list(self, **kwargs): - """List fingerprints in the slot.""" - return list(self.iter(**kwargs)) + def list(self, **params): + """List fingerprints in the slot. + + :param \*\*params: (optional) additional query params for the request. + :return: a list of fingerprints. + :rtype: list[str] + """ + return list(self.iter(**params)) class FrontierSlotQueue(object): @@ -263,15 +332,30 @@ def add(self, fps): origin = self._frontier._frontiers._origin return origin.add(self._frontier.key, self.key, fps) - def iter(self, **kwargs): - """Iterate through batches in the queue.""" + def iter(self, mincount=None, **params): + """Iterate through batches in the queue. + + :param mincount: (optional) limit results with min amount of requests. + :param \*\*params: (optional) additional query params for the request. + :return: an iterator over request batches in the queue where each + batch is represented with a dict with ('id', 'requests') field. + :rtype: collections.Iterable[dict] + """ origin = self._frontier._frontiers._origin path = (self._frontier.key, 's', self.key, 'q') - return origin.apiget(path, params=kwargs) - - def list(self, **kwargs): - """List request batches in the queue.""" - return list(self.iter(**kwargs)) + update_kwargs(params, mincount=mincount) + return origin.apiget(path, params=params) + + def list(self, mincount=None, **params): + """List request batches in the queue. + + :param mincount: (optional) limit results with min amount of requests. + :param \*\*params: (optional) additional query params for the request. + :return: a list of request batches in the queue where each batch + is represented with a dict with ('id', 'requests') field. 
+ :rtype: list[dict] + """ + return list(self.iter(mincount=mincount, **params)) def delete(self, ids): """Delete request batches from the queue.""" diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 3a2dea5c..a127e2e0 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -35,21 +35,22 @@ class Items(_Proxy): 'size': 100000, }] - - retrieve 1 item with multiple filters: + - retrieve 1 item with multiple filters:: + >>> filters = [("size", ">", [30000]), ("size", "<", [40000])] >>> job.items.list(count=1, filter=filters) [{ 'name': ['Some other item'], 'url': 'http://some-url/other-item.html', - 'size': 50000, + 'size': 35000, }] """ def _modify_iter_params(self, params): """Modify iter filter to convert offset to start parameter. - Returns: - dict: updated set of params + :return: a dict with updated set of params. + :rtype: dict """ params = super(Items, self)._modify_iter_params(params) offset = params.pop('offset', None) diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index 0203613b..fc17d1fc 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -12,7 +12,9 @@ from .requests import Requests from .samples import Samples from .exceptions import NotFound, BadRequest, DuplicateJobError -from .utils import _MappingProxy, get_tags_for_update, parse_job_key +from .utils import ( + _MappingProxy, get_tags_for_update, parse_job_key, update_kwargs, +) class Jobs(object): @@ -22,30 +24,42 @@ class Jobs(object): instance to get a :class:`Jobs` instance. See :attr:`Project.jobs` and :attr:`Spider.jobs` attributes. - :ivar projectid: an integer project id. + :ivar project_id: a string project id. :ivar spider: :class:`Spider` object if defined. 
Usage:: >>> project.jobs - + >>> spider = project.spiders.get('spider1') >>> spider.jobs - + """ - def __init__(self, client, projectid, spider=None): - self.projectid = projectid + def __init__(self, client, project_id, spider=None): + self.project_id = project_id self.spider = spider self._client = client - self._project = client._hsclient.get_project(projectid) + self._project = client._hsclient.get_project(project_id) + + def count(self, spider=None, state=None, has_tag=None, lacks_tag=None, + startts=None, endts=None, **params): + """Count jobs with a given set of filters. + + :param spider: (optional) filter by spider name. + :param state: (optional) a job state, a string or a list of strings. + :param has_tag: (optional) filter results by existing tag(s), a string + or a list of strings. + :param lacks_tag: (optional) filter results by missing tag(s), a string + or a list of strings. + :param startts: (optional) UNIX timestamp at which to begin results, + in millisecons. + :param endts: (optional) UNIX timestamp at which to end results, + in millisecons. + :param \*\*params: (optional) other filter params. - def count(self, **params): - """Count jobs for a given set of parameters. - - :param \*\*params: (optional) a set of filters to apply when counting - jobs (e.g. spider, state, has_tag, lacks_tag, startts and endts). :return: jobs count. + :rtype: int Usage:: @@ -55,17 +69,36 @@ def count(self, **params): >>> project.jobs.count(spider='spider2', state='finished') 2 """ + update_kwargs(params, spider=spider, state=state, has_tag=has_tag, + lacks_tag=lacks_tag, startts=startts, endts=endts) if self.spider: params['spider'] = self.spider.name return next(self._project.jobq.apiget(('count',), params=params)) - def iter(self, **params): + def iter(self, count=None, start=None, spider=None, state=None, + has_tag=None, lacks_tag=None, startts=None, endts=None, + meta=None, **params): """Iterate over jobs collection for a given set of params. 
- :param \*\*params: (optional) a set of filters to apply when counting - jobs (e.g. spider, state, has_tag, lacks_tag, startts and endts). + :param count: (optional) limit amount of returned jobs. + :param start: (optional) number of jobs to skip in the beginning. + :param spider: (optional) filter by spider name. + :param state: (optional) a job state, a string or a list of strings. + :param has_tag: (optional) filter results by existing tag(s), a string + or a list of strings. + :param lacks_tag: (optional) filter results by missing tag(s), a string + or a list of strings. + :param startts: (optional) UNIX timestamp at which to begin results, + in millisecons. + :param endts: (optional) UNIX timestamp at which to end results, + in millisecons. + :param meta: (optional) request for additional fields, a single + field name or a list of field names to return. + :param \*\*params: (optional) other filter params. + :return: a generator object over a list of dictionaries of jobs summary for a given filter params. + :rtype: types.GeneratorType[dict] Usage: @@ -80,13 +113,13 @@ def iter(self, **params): >>> [job['key'] for job in jobs_summary] ['123/1/3', '123/1/2', '123/1/1'] - - job summary fieldset is less detailed than job.metadata but contains - few new fields as well. Additional fields can be requested using - ``jobmeta`` parameter. If it's used, then it's up to the user to list - all the required fields, so only few default fields would be added - except requested ones:: + - job summary fieldset is less detailed than job.metadata but + contains few new fields as well. Additional fields can be requested + using ``meta`` parameter. If it's used, then it's up to the user + to list all the required fields, so only few default fields would + be added except requested ones:: - >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by', ]) + >>> jobs_summary = project.jobs.iter(meta=['scheduled_by', ]) - by default :meth:`Jobs.iter` returns maximum last 1000 results. 
Pagination is available using start parameter:: @@ -103,48 +136,88 @@ def iter(self, **params): >>> jobs_summary = project.jobs.iter( ... spider='spider2', state='finished', count=3) """ + update_kwargs(params, count=count, start=start, jobmeta=meta, + spider=spider, state=state, has_tag=has_tag, + lacks_tag=lacks_tag, startts=startts, endts=endts) if self.spider: params['spider'] = self.spider.name return self._project.jobq.list(**params) - def list(self, **params): + def list(self, count=None, start=None, spider=None, state=None, + has_tag=None, lacks_tag=None, startts=None, endts=None, + meta=None, **params): """Convenient shortcut to list iter results. + :param count: (optional) limit amount of returned jobs. + :param start: (optional) number of jobs to skip in the beginning. + :param spider: (optional) filter by spider name. + :param state: (optional) a job state, a string or a list of strings. + :param has_tag: (optional) filter results by existing tag(s), a string + or a list of strings. + :param lacks_tag: (optional) filter results by missing tag(s), a string + or a list of strings. + :param startts: (optional) UNIX timestamp at which to begin results, + in millisecons. + :param endts: (optional) UNIX timestamp at which to end results, + in millisecons. + :param meta: (optional) request for additional fields, a single + field name or a list of field names to return. + :param \*\*params: (optional) other filter params. + + :return: list of dictionaries of jobs summary for a given filter params + :rtype: list[dict] + Please note that list() method can use a lot of memory and for a large amount of jobs it's recommended to iterate through it via iter() method (all params and available filters are same for both methods). - """ + # FIXME we double-check the params here, is there a better way? + # Simpler way would be to keep **params only here and point to iter(), + # but then we loose hinting kwargs for list() method. 
+        update_kwargs(params, count=count, start=start, meta=meta,
+                      spider=spider, state=state, has_tag=has_tag,
+                      lacks_tag=lacks_tag, startts=startts, endts=endts)
         return list(self.iter(**params))
 
-    def schedule(self, spidername=None, **params):
-        """Schedule a new job and returns its jobkey.
+    def schedule(self, spider=None, units=None, priority=None, meta=None,
+                 add_tag=None, job_args=None, job_settings=None, cmd_args=None,
+                 **params):
+        """Schedule a new job and returns its job key.
 
-        :param spidername: a spider name string
+        :param spider: a spider name string
             (not needed if job is scheduled via :attr:`Spider.jobs`).
+        :param units: (optional) amount of units for the job.
+        :param priority: (optional) integer priority value.
+        :param meta: (optional) a dictionary with metadata.
+        :param add_tag: (optional) a string tag or a list of tags to add.
+        :param job_args: (optional) a dictionary with job arguments.
+        :param job_settings: (optional) a dictionary with job settings.
+        :param cmd_args: (optional) a string with script command args.
         :param \*\*params: (optional) additional keyword args.
-        :return: a jobkey string pointing to the new job.
+
+        :return: a job key string pointing to the new job.
+ :rtype: str Usage:: - >>> project.schedule('spider1', arg1='val1') + >>> project.jobs.schedule('spider1', job_args={'arg1': 'val1'}) '123/1/1' """ - if not spidername and not self.spider: - raise ValueError('Please provide spidername') - params['project'] = self.projectid - params['spider'] = spidername or self.spider.name - spider_args = params.pop('spider_args', None) - if spider_args: - if not isinstance(spider_args, dict): - raise ValueError("spider_args should be a dictionary") - cleaned_args = {k: v for k, v in spider_args.items() + if not spider and not self.spider: + raise ValueError('Please provide `spider` name') + if job_args: + if not isinstance(job_args, dict): + raise ValueError("job_args should be a dictionary") + cleaned_args = {k: v for k, v in job_args.items() if k not in params} params.update(cleaned_args) - if 'job_settings' in params: - params['job_settings'] = json.dumps(params['job_settings']) - if 'meta' in params: - params['meta'] = json.dumps(params['meta']) + + params['project'] = self.project_id + params['spider'] = spider or self.spider.name + + update_kwargs(params, units=units, priority=priority, add_tag=add_tag, + cmd_args=cmd_args, job_settings=job_settings, meta=meta) + # FIXME improve to schedule multiple jobs try: response = self._client._connection._post( @@ -155,17 +228,17 @@ def schedule(self, spidername=None, **params): raise return Job(self._client, response['jobid']) - def get(self, jobkey): - """Get a Job with a given jobkey. + def get(self, job_key): + """Get a Job with a given job_key. - :param jobkey: a string job key. + :param job_key: a string job key. - jobkey's project component should match the project used to get - :class:`Jobs` instance, and jobkey's spider component should match + job_key's project component should match the project used to get + :class:`Jobs` instance, and job_key's spider component should match the spider (if :attr:`Spider.jobs` was used). :return: :class:`Job` object. 
- :rtype: scrapinghub.client.Job. + :rtype: scrapinghub.client.jobs.Job Usage:: @@ -173,20 +246,23 @@ def get(self, jobkey): >>> job.key '123/1/2' """ - jobkey = parse_job_key(jobkey) - if jobkey.projectid != self.projectid: + job_key = parse_job_key(job_key) + if job_key.project_id != self.project_id: raise ValueError('Please use same project id') - if self.spider and jobkey.spiderid != self.spider._id: + if self.spider and job_key.spider_id != self.spider._id: raise ValueError('Please use same spider id') - return Job(self._client, str(jobkey)) + return Job(self._client, str(job_key)) - def summary(self, _queuename=None, **params): + def summary(self, state=None, spider=None, **params): """Get jobs summary (optionally by state). - :param _queuename: (optional) a string state to filter jobs. + :param state: (optional) a string state to filter jobs. + :param spider: (optional) a spider name + (not needed if instantiated with :cls:`Spider`). :param \*\*params: (optional) additional keyword args. - :return: a generator object over a list of dictionaries of jobs summary + :return: a list of dictionaries of jobs summary for a given filter params grouped by job state. + :rtype: list[dict] Usage:: @@ -198,16 +274,23 @@ def summary(self, _queuename=None, **params): >>> project.jobs.summary('pending') {'count': 0, 'name': 'pending', 'summary': []} """ - spiderid = self._extract_spider_id(params) + spider_id = self._extract_spider_id(spider) return self._project.jobq.summary( - _queuename, spiderid=spiderid, **params) + state, spiderid=spider_id, **params) - def iter_last(self, **params): + def iter_last(self, start=None, start_after=None, count=None, + spider=None, **params): """Iterate through last jobs for each spider. - :param \*\*params: (optional) keyword arguments to filter jobs. + :param start: (optional) + :param start_after: (optional) + :param count: (optional) + :param spider: (optional) a spider name + (not needed if instantiated with :cls:`Spider`). 
+ :param \*\*params: (optional) additional keyword args. :return: a generator object over a list of dictionaries of jobs summary for a given filter params. + :rtype: types.GeneratorType[dict] Usage: @@ -232,29 +315,35 @@ def iter_last(self, **params): 'ts': 1482911615830, 'version': 'some-version'}] """ - spiderid = self._extract_spider_id(params) - return self._project.spiders.lastjobsummary(spiderid, **params) + spider_id = self._extract_spider_id(spider) + update_kwargs(params, start=start, startafter=start_after, count=count) + return self._project.spiders.lastjobsummary(spider_id, **params) - def _extract_spider_id(self, params): - spiderid = params.pop('spiderid', None) - if not spiderid and self.spider: + def _extract_spider_id(self, spider): + if not spider and self.spider: return self.spider._id - elif spiderid and self.spider and str(spiderid) != self.spider._id: - raise ValueError('Please use same spider id') - return str(spiderid) if spiderid else None - - def update_tags(self, add=None, remove=None, spidername=None): + if spider: + project = self._client.get_project(self.project_id) + spider_id = project.spiders.get(spider)._id + if self.spider and spider_id != self.spider._id: + raise ValueError('Please use same spider') + return spider_id + return None + + def update_tags(self, add=None, remove=None, spider=None): """Update tags for all existing spider jobs. :param add: (optional) list of tags to add to selected jobs. :param remove: (optional) list of tags to remove from selected jobs. - :param spidername: spider name, must if used with :attr:`Project.jobs`. + :param spider: (optional) spider name, must if used with + :attr:`Project.jobs`. It's not allowed to update tags for all project jobs, so spider must be specified (it's done implicitly when using :attr:`Spider.jobs`, or you - have to specify ``spidername`` param when using :attr:`Project.jobs`). + have to specify ``spider`` param when using :attr:`Project.jobs`). 
:return: amount of jobs that were updated. + :rtype: int Usage: @@ -267,16 +356,16 @@ def update_tags(self, add=None, remove=None, spidername=None): - remove existing tag ``existing`` for all spider jobs:: >>> project.jobs.update_tags( - ... remove=['existing'], spidername='spider2') + ... remove=['existing'], spider='spider2') 2 """ - spidername = spidername or (self.spider.name if self.spider else None) - if not spidername: - raise ValueError('Please provide spidername') + spider = spider or (self.spider.name if self.spider else None) + if not spider: + raise ValueError('Please provide spider') params = get_tags_for_update(add_tag=add, remove_tag=remove) if not params: return - params.update({'project': self.projectid, 'spider': spidername}) + params.update({'project': self.project_id, 'spider': spider}) result = self._client._connection._post('jobs_update', 'json', params) return result['count'] @@ -288,7 +377,7 @@ class Job(object): :class:`Jobs` instance to get a :class:`Job` instance. See :meth:`ScrapinghubClient.get_job` and :meth:`Jobs.get` methods. - :ivar projectid: in integer project id. + :ivar project_id: integer project id. :ivar key: a job key. :ivar items: :class:`Items` resource object. :ivar logs: :class:`Logs` resource object. 
@@ -304,21 +393,21 @@ class Job(object): >>> job.metadata.get('state') 'finished' """ - def __init__(self, client, jobkey): - self.projectid = parse_job_key(jobkey).projectid - self.key = jobkey + def __init__(self, client, job_key): + self.project_id = parse_job_key(job_key).project_id + self.key = job_key self._client = client - self._project = client._hsclient.get_project(self.projectid) - self._job = client._hsclient.get_job(jobkey) + self._project = client._hsclient.get_project(self.project_id) + self._job = client._hsclient.get_job(job_key) # proxied sub-resources - self.items = Items(_Items, client, jobkey) - self.logs = Logs(_Logs, client, jobkey) - self.requests = Requests(_Requests, client, jobkey) - self.samples = Samples(_Samples, client, jobkey) + self.items = Items(_Items, client, job_key) + self.logs = Logs(_Logs, client, job_key) + self.requests = Requests(_Requests, client, job_key) + self.samples = Samples(_Samples, client, job_key) - self.metadata = JobMeta(_JobMeta, client, jobkey) + self.metadata = JobMeta(_JobMeta, client, job_key) def update_tags(self, add=None, remove=None): """Partially update job tags. @@ -326,18 +415,16 @@ def update_tags(self, add=None, remove=None): It provides a convenient way to mark specific jobs (for better search, postprocessing etc). - :param add: (optional) list of tags to add - :param remove: (optional) list of tags to remove - :return: amount of jobs that were updated + :param add: (optional) list of tags to add. + :param remove: (optional) list of tags to remove. 
Usage: to mark a job with tag ``consumed``:: >>> job.update_tags(add=['consumed']) """ params = get_tags_for_update(add_tag=add, remove_tag=remove) - params.update({'project': self.projectid, 'job': self.key}) - result = self._client._connection._post('jobs_update', 'json', params) - return result['count'] + params.update({'project': self.project_id, 'job': self.key}) + self._client._connection._post('jobs_update', 'json', params) def close_writers(self): """Stop job batch writers threads gracefully. @@ -349,8 +436,9 @@ def close_writers(self): def start(self, **params): """Move job to running state. - :param \*\*params: (optional) keyword meta parameters to update - :return: a previous string job state + :param \*\*params: (optional) keyword meta parameters to update. + :return: a previous string job state. + :rtype: str Usage:: @@ -362,8 +450,9 @@ def start(self, **params): def finish(self, **params): """Move running job to finished state. - :param \*\*params: (optional) keyword meta parameters to update - :return: a previous string job state + :param \*\*params: (optional) keyword meta parameters to update. + :return: a previous string job state. + :rtype: str Usage:: @@ -375,8 +464,9 @@ def finish(self, **params): def delete(self, **params): """Mark finished job for deletion. - :param \*\*params: (optional) keyword meta parameters to update - :return: a previous string job state + :param \*\*params: (optional) keyword meta parameters to update. + :return: a previous string job state. + :rtype: str Usage:: @@ -385,19 +475,21 @@ def delete(self, **params): """ return self.update(state='deleted', **params) - def update(self, **params): + def update(self, state, **params): """Update job state. - :param \*\*params: (optional) keyword meta parameters to update - :return: a previous string job state + :param state: a new job state. + :param \*\*params: (optional) keyword meta parameters to update. + :return: a previous string job state. 
+ :rtype: str Usage:: - >>> job.update(state='finished') + >>> job.update('finished') 'running' """ try: - job = next(self._project.jobq.update(self, **params)) + job = next(self._project.jobq.update(self, state=state, **params)) return job['prevstate'] except StopIteration: raise NotFound("Job {} doesn't exist".format(self.key)) @@ -420,29 +512,29 @@ class JobMeta(_MappingProxy): Not a public constructor: use :class:`Job` instance to get a :class:`Jobmeta` instance. See :attr:`Job.metadata` attribute. - Usage:: + Usage: - - get job metadata instance + - get job metadata instance:: >>> job.metadata - - iterate through job metadata + - iterate through job metadata:: >>> job.metadata.iter() - - list job metadata + - list job metadata:: >>> job.metadata.list() [('project', 123), ('units', 1), ('state', 'finished'), ...] - - get meta field value by name + - get meta field value by name:: >>> job.metadata.get('version') 'test' - - update job meta field value (some meta fields are read-only) + - update job meta field value (some meta fields are read-only):: >>> job.metadata.set('my-meta', 'test') @@ -450,7 +542,7 @@ class JobMeta(_MappingProxy): >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2}) - - delete meta field by name + - delete meta field by name:: >>> job.metadata.delete('my-meta') """ diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index f57b4b7e..ebfdfde7 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -36,13 +36,13 @@ class Logs(_Proxy): 'time': 1482233733976, }] - - retrive logs with a given log level and filter by a word + - retrive logs with a given log level and filter by a word:: - >>> filters = [("message", "contains", ["logger"])] + >>> filters = [("message", "contains", ["mymessage"])] >>> job.logs.list(level='WARNING', filter=filters) [{ 'level': 30, - 'message': 'Some warning message', + 'message': 'Some warning: mymessage', 'time': 1486375511188, }] """ @@ -58,8 +58,9 @@ def 
_modify_iter_params(self, params): - convert offset to start parameter - check log level and create a corresponding meta filter - :param params: an original dictionary with params - :return: a modified dictionary with params + :param params: an original dictionary with params. + :return: a modified dictionary with params. + :rtype: dict """ params = super(Logs, self)._modify_iter_params(params) offset = params.pop('offset', None) diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index a1e21db4..ff690b41 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -1,7 +1,5 @@ from __future__ import absolute_import -import six - from ..hubstorage.activity import Activity as _Activity from ..hubstorage.collectionsrt import Collections as _Collections from ..hubstorage.project import Settings as _Settings @@ -23,31 +21,32 @@ class Projects(object): Usage:: >>> client.projects - + """ def __init__(self, client): self._client = client - def get(self, projectid): + def get(self, project_id): """Get project for a given project id. - :param projectid: integer or string numeric project id. + :param project_id: integer or string numeric project id. :return: :class:`Project` object. - :rtype: scrapinghub.client.Project. + :rtype: scrapinghub.client.projects.Project Usage:: >>> project = client.projects.get(123) >>> project - + """ - return Project(self._client, parse_project_id(projectid)) + return Project(self._client, parse_project_id(project_id)) def list(self): """Get list of projects available to current user. - :return: a list of integer project ids. + :return: a list of project ids. + :rtype: list[int] Usage:: @@ -60,15 +59,20 @@ def iter(self): """Iterate through list of projects available to current user. Provided for the sake of API consistency. + + :return: an iterator over project ids list. 
+ :rtype: collections.Iterable[int] """ return iter(self.list()) - def summary(self, **params): + def summary(self, state=None, **params): """Get short summaries for all available user projects. + :param state: a string state or a list of states. :return: a list of dictionaries: each dictionary represents a project summary (amount of pending/running/finished jobs and a flag if it has a capacity to schedule new jobs). + :rtype: list[dict] Usage:: @@ -84,6 +88,8 @@ def summary(self, **params): 'project': 456, 'running': 2}] """ + if state: + params['state'] = state return self._client._hsclient.projects.jobsummaries(**params) @@ -92,12 +98,12 @@ class Project(object): Not a public constructor: use :class:`ScrapinghubClient` instance or :class:`Projects` instance to get a :class:`Project` instance. See - :meth:`Scrapinghub.get_project` or :meth:`Projects.get_project` methods. + :meth:`Scrapinghub.get_project` or :meth:`Projects.get` methods. - :ivar id: integer project id. + :ivar key: string project id. :ivar activity: :class:`Activity` resource object. :ivar collections: :class:`Collections` resource object. - :ivar frontier: :class:`Frontier` resource object. + :ivar frontiers: :class:`Frontiers` resource object. :ivar jobs: :class:`Jobs` resource object. :ivar settings: :class:`Settings` resource object. :ivar spiders: :class:`Spiders` resource object. 
@@ -106,24 +112,24 @@ class Project(object): >>> project = client.get_project(123) >>> project - + >>> project.key '123' """ - def __init__(self, client, projectid): - self.key = str(projectid) + def __init__(self, client, project_id): + self.key = str(project_id) self._client = client # sub-resources - self.jobs = Jobs(client, projectid) - self.spiders = Spiders(client, projectid) + self.jobs = Jobs(client, project_id) + self.spiders = Spiders(client, project_id) # proxied sub-resources - self.activity = Activity(_Activity, client, projectid) - self.collections = Collections(_Collections, client, projectid) - self.frontiers = Frontiers(_HSFrontier, client, projectid) - self.settings = Settings(_Settings, client, projectid) + self.activity = Activity(_Activity, client, project_id) + self.collections = Collections(_Collections, client, project_id) + self.frontiers = Frontiers(_HSFrontier, client, project_id) + self.settings = Settings(_Settings, client, project_id) class Settings(_MappingProxy): @@ -132,44 +138,41 @@ class Settings(_MappingProxy): Not a public constructor: use :class:`Project` instance to get a :class:`Settings` instance. See :attr:`Project.settings` attribute. 
- Usage:: + Usage: - - get project settings instance + - get project settings instance:: >>> project.settings - - iterate through project settings + - iterate through project settings:: >>> project.settings.iter() - - list project settings + - list project settings:: >>> project.settings.list() - [(u'default_job_units', 2), - (u'job_runtime_limit', 20)] + [(u'default_job_units', 2), (u'job_runtime_limit', 20)] - - get setting value by name + - get setting value by name:: >>> project.settings.get('default_job_units') 2 - - update setting value (some settings are read-only) + - update setting value (some settings are read-only):: >>> project.settings.set('default_job_units', 2) - - update multiple settings at once + - update multiple settings at once:: >>> project.settings.update({'default_job_units': 1, ... 'job_runtime_limit': 20}) - - delete project setting by name + - delete project setting by name:: >>> project.settings.delete('job_runtime_limit') """ def set(self, key, value): # FIXME drop the method when post-by-key is implemented on server side - if not isinstance(key, six.string_types): - raise TypeError("key should be a string") self.update({key: value}) diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 61dcf8e3..06ee1125 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -30,13 +30,13 @@ class Requests(_Proxy): >>> job.requests.list(count=1) [{ - 'duration': 354, - 'fp': '6d748741a927b10454c83ac285b002cd239964ea', - 'method': 'GET', - 'rs': 1270, - 'status': 200,a - 'time': 1482233733870, - 'url': 'https://example.com' + 'duration': 354, + 'fp': '6d748741a927b10454c83ac285b002cd239964ea', + 'method': 'GET', + 'rs': 1270, + 'status': 200,a + 'time': 1482233733870, + 'url': 'https://example.com' }] """ def __init__(self, *args, **kwargs): diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py index 93a89e0a..522ecb05 100644 --- a/scrapinghub/client/spiders.py +++ 
b/scrapinghub/client/spiders.py @@ -14,57 +14,62 @@ class Spiders(object): Not a public constructor: use :class:`Project` instance to get a :class:`Spiders` instance. See :attr:`Project.spiders` attribute. - :ivar projectid: integer project id. + :ivar project_id: string project id. Usage:: >>> project.spiders - + """ - def __init__(self, client, projectid): - self.projectid = projectid + def __init__(self, client, project_id): + self.project_id = project_id self._client = client - def get(self, spidername, **params): + def get(self, spider, **params): """Get a spider object for a given spider name. The method gets/sets spider id (and checks if spider exists). - :param spidername: a string spider name. + :param spider: a string spider name. :return: :class:`Spider` object. - :rtype: scrapinghub.client.Spider. + :rtype: scrapinghub.client.spiders.Spider Usage:: >>> project.spiders.get('spider2') - + >>> project.spiders.get('non-existing') NotFound: Spider non-existing doesn't exist. """ - project = self._client._hsclient.get_project(self.projectid) - spiderid = project.ids.spider(spidername, **params) - if spiderid is None: - raise NotFound("Spider {} doesn't exist.".format(spidername)) - return Spider(self._client, self.projectid, spiderid, spidername) + project = self._client._hsclient.get_project(self.project_id) + spider_id = project.ids.spider(spider, **params) + if spider_id is None: + raise NotFound("Spider {} doesn't exist.".format(spider)) + return Spider(self._client, self.project_id, spider_id, spider) def list(self): """Get a list of spiders for a project. :return: a list of dictionaries with spiders metadata. 
+ :rtype: list[dict] - Usage:: # noqa + Usage:: >>> project.spiders.list() [{'id': 'spider1', 'tags': [], 'type': 'manual', 'version': '123'}, {'id': 'spider2', 'tags': [], 'type': 'manual', 'version': '123'}] """ - project = self._client._connection[self.projectid] + project = self._client._connection[self.project_id] return project.spiders() def iter(self): """Iterate through a list of spiders for a project. + :return: an iterator over spiders list where each spider is represented + as a dict containing its metadata. + :rtype: collection.Iterable[dict] + Provided for the sake of API consistency. """ return iter(self.list()) @@ -76,7 +81,8 @@ class Spider(object): Not a public constructor: use :class:`Spiders` instance to get a :class:`Spider` instance. See :meth:`Spiders.get` method. - :ivar projectid: integer project id. + :ivar project_id: a string project id. + :ivar key: a string key in format 'project_id/spider_id'. :ivar name: a spider name string. :ivar jobs: a collection of jobs, :class:`Jobs` object. @@ -89,18 +95,23 @@ class Spider(object): 'spider1' """ - def __init__(self, client, projectid, spiderid, spidername): - self.projectid = projectid - self.key = '{}/{}'.format(str(projectid), str(spiderid)) - self._id = str(spiderid) - self.name = spidername - self.jobs = Jobs(client, projectid, self) + def __init__(self, client, project_id, spider_id, spider): + self.project_id = project_id + self.key = '{}/{}'.format(str(project_id), str(spider_id)) + self._id = str(spider_id) + self.name = spider + self.jobs = Jobs(client, project_id, self) self._client = client @wrap_http_errors def update_tags(self, add=None, remove=None): + """Update tags for the spider. + + :param add: (optional) a list of string tags to add. + :param remove: (optional) a list of string tags to remove. 
+ """ params = get_tags_for_update(add=add, remove=remove) - path = 'v2/projects/{}/spiders/{}/tags'.format(self.projectid, + path = 'v2/projects/{}/spiders/{}/tags'.format(self.project_id, self._id) url = urljoin(self._client._connection.url, path) response = self._client._connection._session.patch(url, json=params) @@ -108,7 +119,12 @@ def update_tags(self, add=None, remove=None): @wrap_http_errors def list_tags(self): - path = 'v2/projects/{}/spiders/{}'.format(self.projectid, self._id) + """List spider tags. + + :return: a list of spider tags. + :rtype: list[str] + """ + path = 'v2/projects/{}/spiders/{}'.format(self.project_id, self._id) url = urljoin(self._client._connection.url, path) response = self._client._connection._session.get(url) response.raise_for_status() diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index 395b34e9..c0ec0496 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -26,32 +26,33 @@ class LogLevel(object): class JobKey(object): - def __init__(self, projectid, spiderid, jobid): - self.projectid = projectid - self.spiderid = spiderid - self.jobid = jobid + def __init__(self, project_id, spider_id, job_id): + self.project_id = project_id + self.spider_id = spider_id + self.job_id = job_id def __str__(self): - return '{}/{}/{}'.format(self.projectid, self.spiderid, self.jobid) + return '{}/{}/{}'.format(self.project_id, self.spider_id, self.job_id) -def parse_project_id(projectid): +def parse_project_id(project_id): try: - int(projectid) + int(project_id) except ValueError: raise ValueError("Project id should be convertible to integer") - return str(projectid) + return str(project_id) -def parse_job_key(jobkey): - if isinstance(jobkey, tuple): - parts = jobkey - elif isinstance(jobkey, six.string_types): - parts = jobkey.split('/') +def parse_job_key(job_key): + if isinstance(job_key, tuple): + parts = job_key + elif isinstance(job_key, six.string_types): + parts = job_key.split('/') else: 
raise ValueError("Job key should be a string or a tuple") if len(parts) != 3: - raise ValueError("Job key should consist of projectid/spiderid/jobid") + raise ValueError( + "Job key should consist of project_id/spider_id/job_id") try: map(int, parts) except ValueError: @@ -78,11 +79,11 @@ class _Proxy(object): origin depending on the origin base class as a part of init logic: - :class:`ItemsResourceType` provides items-based attributes to access - items in an arbitrary collection with get/write/flush/close/stats/iter - methods. + items in an arbitrary collection with get/write/flush/close/stats/ + iter methods. - :class:`DownloadableResource` provides download-based attributes to - iter through collection with or without msgpack support. + iter through collection with or without msgpack support. """ def __init__(self, cls, client, key): @@ -118,22 +119,53 @@ def _wrap_iter_methods(self, methods): setattr(self, method, wrapped) def _modify_iter_params(self, params): - """Modify iter() params on-the-fly.""" + """A helper to modify iter() params on-the-fly. + + The method is internal and should be redefined in subclasses. + + :param params: a dictionary with input parameters. + :return: an updated dictionary with parameters. + :rtype: dict + """ return format_iter_filters(params) def list(self, *args, **kwargs): + """Convenient shortcut to list iter results. + + Please note that list() method can use a lot of memory and for a large + amount of elements it's recommended to iterate through it via iter() + method (all params and available filters are same for both methods). + """ return list(self.iter(*args, **kwargs)) class _MappingProxy(_Proxy): + """A helper class to support basic get/set interface for dict-like + collections of elements. + """ def get(self, key): + """Get element value by key. + + :param key: a string key + """ return next(self._origin.apiget(key)) def set(self, key, value): + """Set element value. 
+ + :param key: a string key + :param value: new value to set for the key + """ self._origin.apipost(key, data=json.dumps(value), is_idempotent=True) def update(self, values): + """Update multiple elements at once. + + The method provides convenient interface for partial updates. + + :param values: a dictionary with key/values to update. + """ if not isinstance(values, dict): raise TypeError("values should be a dict") data = next(self._origin.apiget()) @@ -143,9 +175,18 @@ def update(self, values): is_idempotent=True) def delete(self, key): + """Delete element by key. + + :param key: a string key + """ self._origin.apidelete(key) def iter(self): + """Iterate through key/value pairs. + + :return: an iterator over key/value pairs. + :rtype: collections.Iterable + """ return six.iteritems(next(self._origin.apiget())) @@ -161,6 +202,7 @@ def proxy_methods(origin, successor, methods): """A helper to proxy methods from origin to successor. Accepts a list with strings and tuples: + - each string defines: a successor method name to proxy 1:1 with origin method - each tuple should consist of 2 strings: @@ -196,6 +238,11 @@ def format_iter_filters(params): return params +def update_kwargs(kwargs, **params): + kwargs.update({k: json.dumps(v) if isinstance(v, dict) else v + for k, v in params.items() if v is not None}) + + def parse_auth(auth): """Parse authentification token. 
diff --git a/tests/client/test_job.py b/tests/client/test_job.py index fce0297d..54f69fd1 100644 --- a/tests/client/test_job.py +++ b/tests/client/test_job.py @@ -14,7 +14,7 @@ def test_job_base(client, spider): job = spider.jobs.schedule() assert isinstance(job, Job) - assert job.projectid == TEST_PROJECT_ID + assert job.project_id == TEST_PROJECT_ID assert job.key.startswith(spider.key) assert isinstance(job.items, Items) @@ -25,22 +25,21 @@ def test_job_base(client, spider): def test_job_update_tags(spider): - job1 = spider.jobs.schedule(spider_args={'subid': 'tags-1'}, + job1 = spider.jobs.schedule(job_args={'subid': 'tags-1'}, add_tag=['tag1']) - job2 = spider.jobs.schedule(spider_args={'subid': 'tags-2'}, + job2 = spider.jobs.schedule(job_args={'subid': 'tags-2'}, add_tag=['tag2']) - # FIXME the endpoint normalises tags so it's impossible to send tags - # having upper-cased symbols, let's add more tests when it's fixed - assert job1.update_tags(add=['tag11', 'tag12']) == 1 - assert job1.metadata.get('tags') == ['tag1', 'tag11', 'tag12'] + job1.update_tags(add=['tagA1', 'tagA2']) + assert job1.metadata.get('tags') == ['tag1', 'tagA1', 'tagA2'] - assert job1.update_tags(remove=['tag1', 'tagx']) == 1 - assert job1.metadata.get('tags') == ['tag11', 'tag12'] + job1.update_tags(remove=['tag1', 'tagx']) + assert job1.metadata.get('tags') == ['tagA1', 'tagA2'] + + job1.update_tags(add=['tagB'], remove=['tagA2']) + assert job1.metadata.get('tags') == ['tagA1', 'tagB'] # assert that 2nd job tags weren't changed assert job2.metadata.get('tags') == ['tag2'] - # FIXME adding and removing tags at the same time doesn't work neither: - # remove-tag field is ignored if there's non-void add-tag field def test_job_start(spider): diff --git a/tests/client/test_projects.py b/tests/client/test_projects.py index 855b4d12..a19d7f89 100644 --- a/tests/client/test_projects.py +++ b/tests/client/test_projects.py @@ -72,7 +72,7 @@ def test_project_base(project): def 
test_project_jobs(project): jobs = project.jobs - assert jobs.projectid == TEST_PROJECT_ID + assert jobs.project_id == TEST_PROJECT_ID assert jobs.spider is None diff --git a/tests/client/test_spiders.py b/tests/client/test_spiders.py index bc40a0aa..e412e0ae 100644 --- a/tests/client/test_spiders.py +++ b/tests/client/test_spiders.py @@ -34,14 +34,13 @@ def test_spiders_list(project): def test_spider_base(project, spider): assert isinstance(spider._id, string_types) assert isinstance(spider.key, string_types) - assert spider.key == spider.projectid + '/' + spider._id + assert spider.key == spider.project_id + '/' + spider._id assert spider.name == TEST_SPIDER_NAME - assert spider.projectid == TEST_PROJECT_ID + assert spider.project_id == TEST_PROJECT_ID assert isinstance(project.jobs, Jobs) def test_spider_list_update_tags(project, spider): - # FIXME empty update should fail with pytest.raises(BadRequest): spider.update_tags() @@ -55,7 +54,7 @@ def test_spider_list_update_tags(project, spider): def test_spider_jobs(spider): jobs = spider.jobs - assert jobs.projectid == TEST_PROJECT_ID + assert jobs.project_id == TEST_PROJECT_ID assert jobs.spider is spider @@ -173,7 +172,7 @@ def test_spider_jobs_get(spider): with pytest.raises(ValueError): spider.jobs.get(TEST_PROJECT_ID + '/2/3') - fake_job_id = str(JobKey(spider.projectid, spider._id, 3)) + fake_job_id = str(JobKey(spider.project_id, spider._id, 3)) fake_job = spider.jobs.get(fake_job_id) assert isinstance(fake_job, Job) From 56568255f70d82e4a23beb3bd423c398a954d77f Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 17 Mar 2017 12:41:32 +0300 Subject: [PATCH 2/2] Minor README fix for legacy client --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 7974d77d..66709a51 100644 --- a/README.rst +++ b/README.rst @@ -750,7 +750,7 @@ To see last jobs summaries:: To get job summary per spider:: - >>> summary = 
project.spiders.lastjobsummary(spider_id='1') + >>> summary = project.spiders.lastjobsummary(spiderid='1') Job ---