Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Jobs instance is described well in ``Jobs`` section below.

For example, to schedule a spider run (it returns a job object)::

>>> project.jobs.schedule('spider1', spider_args={'arg1':'val1'})
>>> project.jobs.schedule('spider1', job_args={'arg1':'val1'})
<scrapinghub.client.Job at 0x106ee12e8>

Project instance also has the following fields:
Expand Down Expand Up @@ -151,7 +151,7 @@ Like project instance, spider instance has ``jobs`` field to work with the spide

To schedule a spider run::

>>> spider.jobs.schedule(spider_args={'arg1':'val1'})
>>> spider.jobs.schedule(job_args={'arg1':'val1'})
<scrapinghub.client.Job at 0x106ee12e8>

Note that you don't need to specify spider name explicitly.
Expand Down
27 changes: 13 additions & 14 deletions scrapinghub/client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,44 +51,43 @@ def __init__(self, auth=None, dash_endpoint=None, **kwargs):
url=dash_endpoint)
self._hsclient = HubstorageClient(auth=(login, password), **kwargs)

def get_project(self, projectid):
def get_project(self, project_id):
"""Get :class:`Project` instance with a given project id.

The method is a shortcut for client.projects.get().

:param projectid: integer or string numeric project id.
:param project_id: integer or string numeric project id.
:return: :class:`Project` object.
:rtype: scrapinghub.client.Project.
:rtype: scrapinghub.client.projects.Project

Usage::

>>> project = client.get_project(123)
>>> project
<scrapinghub.client.Project at 0x106cdd6a0>
<scrapinghub.client.projects.Project at 0x106cdd6a0>
"""
return self.projects.get(parse_project_id(projectid))
return self.projects.get(parse_project_id(project_id))

def get_job(self, jobkey):
"""Get Job with a given jobkey.
def get_job(self, job_key):
"""Get Job with a given job key.

:param jobkey: job key string in format 'project/spider/job',
:param job_key: job key string in format 'project_id/spider_id/job_id',
where all the components are integers.
:return: :class:`Job` object.
:rtype: scrapinghub.client.Job.
:rtype: scrapinghub.client.jobs.Job

Usage::

>>> job = client.get_job('123/1/1')
>>> job
<scrapinghub.client.Job at 0x10afe2eb1>
<scrapinghub.client.jobs.Job at 0x10afe2eb1>
"""
projectid = parse_job_key(jobkey).projectid
return self.projects.get(projectid).jobs.get(jobkey)
project_id = parse_job_key(job_key).project_id
return self.projects.get(project_id).jobs.get(job_key)

def close(self, timeout=None):
"""Close client instance.

:param timeout: (optional) float timeout secs to stop everything
gracefully.
:param timeout: (optional) float timeout secs to stop gracefully.
"""
self._hsclient.close(timeout=timeout)
9 changes: 7 additions & 2 deletions scrapinghub/client/activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,17 @@ def __init__(self, *args, **kwargs):
self._wrap_iter_methods(['iter'])

def add(self, values, **kwargs):
"""Add new event to the project activity.

:param values: a single event or a list of events, where event is
represented with a dictionary of ('event', 'job', 'user') keys.
"""
if not isinstance(values, list):
values = list(values)
for activity in values:
if not isinstance(activity, dict):
raise ValueError("Please pass events as dictionaries")
jobkey = activity.get('job')
if jobkey and parse_job_key(jobkey).projectid != self.key:
job_key = activity.get('job')
if job_key and parse_job_key(job_key).project_id != self.key:
raise ValueError('Please use same project id')
self._origin.post(values, **kwargs)
152 changes: 118 additions & 34 deletions scrapinghub/client/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@

from ..hubstorage.collectionsrt import Collection as _Collection

from .utils import _Proxy
from .utils import format_iter_filters
from .utils import proxy_methods
from .utils import wrap_kwargs
from .utils import (
_Proxy, format_iter_filters, proxy_methods, wrap_kwargs, update_kwargs,
)


class Collections(_Proxy):
Expand All @@ -25,29 +24,75 @@ class Collections(_Proxy):
>>> foo_store = collections.get_store('foo_store')
"""

def get(self, coltype, colname):
"""Base method to get a collection with a given type and name."""
self._origin._validate_collection(coltype, colname)
return Collection(self._client, self, coltype, colname)
def get(self, type_, name):
"""Base method to get a collection with a given type and name.

def get_store(self, colname):
return self.get('s', colname)
:param type_: a collection type string.
:param name: a collection name string.
:return: :class:`Collection` object.
:rtype: Collection
"""
self._origin._validate_collection(type_, name)
return Collection(self._client, self, type_, name)

def get_store(self, name):
"""Method to get a store collection by name.

:param name: a collection name string.
:return: :class:`Collection` object.
:rtype: Collection
"""
return self.get('s', name)

def get_cached_store(self, name):
"""Method to get a cached-store collection by name.

The collection type means that items expire after a month.

:param name: a collection name string.
:return: :class:`Collection` object.
:rtype: Collection
"""
return self.get('cs', name)

def get_versioned_store(self, name):
"""Method to get a versioned-store collection by name.

The collection type retains up to 3 copies of each item.

:param name: a collection name string.
:return: :class:`Collection` object.
:rtype: Collection
"""
return self.get('vs', name)

def get_cached_store(self, colname):
return self.get('cs', colname)
def get_versioned_cached_store(self, name):
"""Method to get a versioned-cached-store collection by name.

def get_versioned_store(self, colname):
return self.get('vs', colname)
Multiple copies are retained, and each one expires after a month.

def get_versioned_cached_store(self, colname):
return self.get('vcs', colname)
:param name: a collection name string.
:return: :class:`Collection` object.
:rtype: Collection
"""
return self.get('vcs', name)

def iter(self):
"""Iterate through collections of a project."""
"""Iterate through collections of a project.

:return: an iterator over collections list where each collection is
represented by a dictionary with ('name','type') fields.
:rtype: collections.Iterable[dict]
"""
return self._origin.apiget('list')

def list(self):
"""List collections of a project."""
"""List collections of a project.

:return: a list of collections where each collection is
represented by a dictionary with ('name','type') fields.
:rtype: list[dict]
"""
return list(self.iter())


Expand All @@ -56,7 +101,7 @@ class Collection(object):

Not a public constructor: use :class:`Collections` instance to get a
:class:`Collection` instance. See :meth:`Collections.get_store` and
similar methods. # noqa
similar methods.

Usage:

Expand Down Expand Up @@ -84,8 +129,7 @@ class Collection(object):

>>> for elem in foo_store.iter(count=1):
...     print(elem)
[{'_key': '002d050ee3ff6192dcbecc4e4b4457d7',
'value': '1447221694537'}]
[{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}]

- filter by multiple keys, only values for keys that exist will be returned::

Expand All @@ -97,9 +141,9 @@ class Collection(object):
>>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7')
"""

def __init__(self, client, collections, coltype, colname):
def __init__(self, client, collections, type_, name):
self._client = client
self._origin = _Collection(coltype, colname, collections._origin)
self._origin = _Collection(type_, name, collections._origin)
proxy_methods(self._origin, self, [
'create_writer', 'count',
('iter', 'iter_values'),
Expand All @@ -111,35 +155,58 @@ def __init__(self, client, collections, coltype, colname):
wrapped = wrap_kwargs(getattr(self, method), format_iter_filters)
setattr(self, method, wrapped)

def list(self, *args, **kwargs):
def list(self, key=None, prefix=None, prefixcount=None, startts=None,
endts=None, requests_params=None, **params):
"""Convenient shortcut to list iter results.

Please note that list() method can use a lot of memory and for a large
amount of elements it's recommended to iterate through it via iter()
method (all params and available filters are same for both methods).

:param key: a string key or a list of keys to filter with.
:param prefix: a string prefix to filter items.
:param prefixcount: maximum number of values to return per prefix.
:param startts: UNIX timestamp at which to begin results.
:param endts: UNIX timestamp at which to end results.
:param requests_params: (optional) a dict with optional requests params.
:param \*\*params: (optional) additional query params for the request.
:return: a list of items where each item is represented with a dict.
:rtype: list[dict]

# FIXME there should be similar docstrings for iter/iter_raw_json
# but as we proxy them as-is, it's not in place, should be improved
"""
return list(self.iter(*args, **kwargs))
update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount,
startts=startts, endts=endts,
requests_params=requests_params)
return list(self.iter(requests_params=None, **params))

def get(self, key, *args, **kwargs):
def get(self, key, **params):
"""Get item from collection by key.

:param key: string item key
:return: an item dictionary if exists
:param key: string item key.
:param \*\*params: (optional) additional query params for the request.
:return: an item dictionary if exists.
:rtype: dict
"""
if key is None:
raise ValueError("key cannot be None")
return self._origin.get(key, *args, **kwargs)
return self._origin.get(key, **params)

def set(self, *args, **kwargs):
def set(self, value):
"""Set item to collection by key.

:param value: a dict representing a collection item.

The method returns None (original method returns an empty generator).
"""
self._origin.set(*args, **kwargs)
self._origin.set(value)

def delete(self, keys):
"""Delete item(s) from collection by key(s).

:param keys: a single key or a list of keys.

The method returns None (original method returns an empty generator).
"""
if (not isinstance(keys, string_types) and
Expand All @@ -148,7 +215,24 @@ def delete(self, keys):
"object providing string keys")
self._origin.delete(keys)

def iter_raw_msgpack(self, requests_params=None, **apiparams):
def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None,
startts=None, endts=None, requests_params=None,
**params):
"""A method to iterate through raw msgpack-ed items.
Can be convenient if data is needed in same msgpack format.

:param key: a string key or a list of keys to filter with.
:param prefix: a string prefix to filter items.
:param prefixcount: maximum number of values to return per prefix.
:param startts: UNIX timestamp at which to begin results.
:param endts: UNIX timestamp at which to end results.
:param requests_params: (optional) a dict with optional requests params.
:param \*\*params: (optional) additional query params for the request.
:return: an iterator over items list packed with msgpack.
:rtype: collections.Iterable[bytes]
"""
update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount,
startts=startts, endts=endts,
requests_params=requests_params)
return self._origin._collections.iter_msgpack(
self._origin.coltype, self._origin.colname,
requests_params=requests_params, **apiparams)
self._origin.coltype, self._origin.colname, **params)
Loading