Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,478 changes: 0 additions & 1,478 deletions scrapinghub/client.py

This file was deleted.

94 changes: 94 additions & 0 deletions scrapinghub/client/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from scrapinghub import Connection as _Connection
from scrapinghub import HubstorageClient as _HubstorageClient

from .projects import Projects
from .exceptions import wrap_http_errors

from .utils import parse_auth
from .utils import parse_project_id, parse_job_key


__all__ = ['ScrapinghubClient']


class Connection(_Connection):
    """Legacy ``Connection`` subclass used internally by the client.

    The only difference from the parent class is that ``_request`` is
    decorated with ``wrap_http_errors`` (presumably to translate HTTP
    errors into the client's own exceptions; see ``.exceptions``).
    """

    @wrap_http_errors
    def _request(self, *args, **kwargs):
        """Forward the call unchanged to the parent ``_request``."""
        parent = super(Connection, self)
        return parent._request(*args, **kwargs)


class HubstorageClient(_HubstorageClient):
    """Hubstorage client subclass used internally by the client.

    The only difference from the parent class is that ``request`` is
    decorated with ``wrap_http_errors`` (presumably to translate HTTP
    errors into the client's own exceptions; see ``.exceptions``).
    """

    @wrap_http_errors
    def request(self, *args, **kwargs):
        """Forward the call unchanged to the parent ``request``."""
        parent = super(HubstorageClient, self)
        return parent.request(*args, **kwargs)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add `__all__ = ['ScrapinghubClient']` to avoid possible name clashes with the HubstorageClient and Connection classes on import.



class ScrapinghubClient(object):
    r"""Main class to work with Scrapinghub API.

    :param auth: Scrapinghub APIKEY or other SH auth credentials.
    :param dash_endpoint: (optional) Scrapinghub Dash panel url.
    :param \*\*kwargs: (optional) Additional arguments for
        :class:`scrapinghub.hubstorage.HubstorageClient` constructor.

    :ivar projects: projects collection, :class:`Projects` instance.

    Usage::

        >>> from scrapinghub import ScrapinghubClient
        >>> client = ScrapinghubClient('APIKEY')
        >>> client
        <scrapinghub.client.ScrapinghubClient at 0x1047af2e8>
    """
    # NOTE: the docstring above is a raw string on purpose: it contains
    # ``\*``, which is an invalid escape sequence in a regular string literal.

    def __init__(self, auth=None, dash_endpoint=None, **kwargs):
        self.projects = Projects(self)
        # parse_auth yields a (login, password) pair shared by both backends.
        login, password = parse_auth(auth)
        self._connection = Connection(apikey=login,
                                      password=password,
                                      url=dash_endpoint)
        self._hsclient = HubstorageClient(auth=(login, password), **kwargs)

    def get_project(self, projectid):
        """Get :class:`Project` instance with a given project id.

        The method is a shortcut for client.projects.get().

        :param projectid: integer or string numeric project id.
        :return: :class:`Project` object.
        :rtype: scrapinghub.client.Project.

        Usage::

            >>> project = client.get_project(123)
            >>> project
            <scrapinghub.client.Project at 0x106cdd6a0>
        """
        return self.projects.get(parse_project_id(projectid))

    def get_job(self, jobkey):
        """Get Job with a given jobkey.

        :param jobkey: job key string in format 'project/spider/job',
            where all the components are integers.
        :return: :class:`Job` object.
        :rtype: scrapinghub.client.Job.

        Usage::

            >>> job = client.get_job('123/1/1')
            >>> job
            <scrapinghub.client.Job at 0x10afe2eb1>
        """
        # The project id is taken from the job key itself, so the caller
        # does not need to fetch the project first.
        projectid = parse_job_key(jobkey).projectid
        return self.projects.get(projectid).jobs.get(jobkey)

    def close(self, timeout=None):
        """Close client instance.

        :param timeout: (optional) float timeout secs to stop everything
            gracefully.
        """
        self._hsclient.close(timeout=timeout)
60 changes: 60 additions & 0 deletions scrapinghub/client/activity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import absolute_import

from .utils import _Proxy
from .utils import parse_job_key


class Activity(_Proxy):
    """Representation of collection of job activity events.

    Not a public constructor: use :class:`Project` instance to get a
    :class:`Activity` instance. See :attr:`Project.activity` attribute.

    Please note that list() method can use a lot of memory and for a large
    amount of activities it's recommended to iterate through it via iter()
    method (all params and available filters are same for both methods).

    Usage:

    - get all activity from a project::

        >>> project.activity.iter()
        <generator object jldecode at 0x1049ee990>

    - get only last 2 events from a project::

        >>> project.activity.list(count=2)
        [{'event': 'job:completed', 'job': '123/2/3', 'user': 'jobrunner'},
         {'event': 'job:started', 'job': '123/2/3', 'user': 'john'}]

    - post a new event::

        >>> event = {'event': 'job:completed',
                     'job': '123/2/4',
                     'user': 'jobrunner'}
        >>> project.activity.add(event)

    - post multiple events at once::

        >>> events = [
            {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'},
            {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'},
        ]
        >>> project.activity.add(events)

    """
    def __init__(self, *args, **kwargs):
        super(Activity, self).__init__(*args, **kwargs)
        # NOTE(review): presumably exposes the origin's ``list`` method under
        # the name ``iter`` -- confirm against ``_Proxy._proxy_methods``.
        self._proxy_methods([('iter', 'list')])
        self._wrap_iter_methods(['iter'])

    def add(self, values, **kwargs):
        """Post one or more activity events to the project.

        :param values: a single event dictionary, or an iterable of event
            dictionaries.
        :param \\*\\*kwargs: passed through unchanged to the origin's
            ``post`` method.
        :raises ValueError: if an event is not a dictionary, or if an
            event's ``'job'`` key belongs to a project other than this one.
        """
        if isinstance(values, dict):
            # A lone event must be wrapped: list(dict) would iterate over
            # its keys and every key would then be rejected below.
            values = [values]
        elif not isinstance(values, list):
            # Materialize generators/tuples so they can be validated here
            # and still be posted afterwards.
            values = list(values)
        for activity in values:
            if not isinstance(activity, dict):
                raise ValueError("Please pass events as dictionaries")
            jobkey = activity.get('job')
            if jobkey and parse_job_key(jobkey).projectid != self.key:
                raise ValueError('Please use same project id')
        self._origin.post(values, **kwargs)
154 changes: 154 additions & 0 deletions scrapinghub/client/collections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
from __future__ import absolute_import
import collections

from six import string_types

from ..hubstorage.collectionsrt import Collection as _Collection

from .utils import _Proxy
from .utils import format_iter_filters
from .utils import proxy_methods
from .utils import wrap_kwargs


class Collections(_Proxy):
    """Entry point to the collections of a project.

    Not a public constructor: use :class:`Project` instance to get a
    :class:`Collections` instance. See :attr:`Project.collections` attribute.

    Usage::

        >>> collections = project.collections
        >>> collections.list()
        [{'name': 'Pages', 'type': 's'}]
        >>> foo_store = collections.get_store('foo_store')
    """

    def get(self, coltype, colname):
        """Return a :class:`Collection` for the given type and name.

        The type/name pair is first checked by the origin's
        ``_validate_collection``.
        """
        origin = self._origin
        origin._validate_collection(coltype, colname)
        collection = Collection(self._client, self, coltype, colname)
        return collection

    def get_store(self, colname):
        """Shortcut for ``get('s', colname)``."""
        return self.get('s', colname)

    def get_cached_store(self, colname):
        """Shortcut for ``get('cs', colname)``."""
        return self.get('cs', colname)

    def get_versioned_store(self, colname):
        """Shortcut for ``get('vs', colname)``."""
        return self.get('vs', colname)

    def get_versioned_cached_store(self, colname):
        """Shortcut for ``get('vcs', colname)``."""
        return self.get('vcs', colname)

    def iter(self):
        """Return the result of the origin's ``apiget('list')`` call."""
        origin = self._origin
        return origin.apiget('list')

    def list(self):
        """Return everything produced by :meth:`iter` as a list."""
        found = self.iter()
        return list(found)


class Collection(object):
    """Representation of a project collection object.

    Not a public constructor: use :class:`Collections` instance to get a
    :class:`Collection` instance. See :meth:`Collections.get_store` and
    similar methods.

    Usage:

    - add a new item to collection::

        >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7',
                           'value': '1447221694537'})

    - count items in collection::

        >>> foo_store.count()
        1

    - get an item from collection::

        >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7')
        {'value': '1447221694537'}

    - get all items from collection::

        >>> foo_store.iter()
        <generator object jldecode at 0x1049eef10>

    - iterate over _key & value pairs::

        >>> for elem in foo_store.iter(count=1):
        ...     print(elem)
        {'_key': '002d050ee3ff6192dcbecc4e4b4457d7',
         'value': '1447221694537'}

    - filter by multiple keys, only values for keys that exist will be returned::

        >>> foo_store.list(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])
        [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}]

    - delete an item by key::

        >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7')
    """

    def __init__(self, client, collections, coltype, colname):
        self._client = client
        self._origin = _Collection(coltype, colname, collections._origin)
        # NOTE(review): tuples look like (name on this object, name on the
        # origin) -- confirm against ``utils.proxy_methods``.
        proxy_methods(self._origin, self, [
            'create_writer', 'count',
            ('iter', 'iter_values'),
            ('iter_raw_json', 'iter_json'),
        ])
        # simplified version of _Proxy._wrap_iter_methods logic
        # to provide better support for filter param in iter methods
        for method in ['iter', 'iter_raw_json']:
            wrapped = wrap_kwargs(getattr(self, method), format_iter_filters)
            setattr(self, method, wrapped)

    def list(self, *args, **kwargs):
        """Convenient shortcut to list iter results.

        Please note that list() method can use a lot of memory and for a large
        amount of elements it's recommended to iterate through it via iter()
        method (all params and available filters are same for both methods).
        """
        return list(self.iter(*args, **kwargs))

    def get(self, key, *args, **kwargs):
        """Get item from collection by key.

        :param key: string item key
        :return: an item dictionary if exists
        :raises ValueError: if ``key`` is None.
        """
        if key is None:
            raise ValueError("key cannot be None")
        return self._origin.get(key, *args, **kwargs)

    def set(self, *args, **kwargs):
        """Set item to collection by key.

        The method returns None (original method returns an empty generator).
        """
        self._origin.set(*args, **kwargs)

    def delete(self, keys):
        """Delete item(s) from collection by key(s).

        The method returns None (original method returns an empty generator).

        :param keys: a string key or an iterable object providing string keys.
        :raises ValueError: if ``keys`` is neither a string nor an iterable.
        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives
        # in ``collections.abc`` on Python 3 and in ``collections`` on Python 2.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable
        # Strings are themselves registered as Iterable, so this single check
        # accepts both a lone string key and any iterable of keys.
        if not isinstance(keys, Iterable):
            raise ValueError("You should provide string key or iterable "
                             "object providing string keys")
        self._origin.delete(keys)

    def iter_raw_msgpack(self, requests_params=None, **apiparams):
        """Call ``iter_msgpack`` on the origin's parent collections object.

        The call is made for this collection's type and name, passing
        ``requests_params`` and ``apiparams`` through unchanged.
        """
        return self._origin._collections.iter_msgpack(
            self._origin.coltype, self._origin.colname,
            requests_params=requests_params, **apiparams)
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from functools import wraps

from requests import HTTPError

from .legacy import APIError
from .hubstorage import ValueTooLarge as _ValueTooLarge
from ..legacy import APIError
from ..hubstorage import ValueTooLarge as _ValueTooLarge


def _get_http_error_msg(exc):
Expand Down
Loading