From d725c3e77022476363e88c4d462ee68a2b93dd4e Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 7 Feb 2017 12:48:44 +0300 Subject: [PATCH 01/35] Make project/spider/job keys consistent --- scrapinghub/client.py | 16 ++++++++-------- tests/client/test_client.py | 2 +- tests/client/test_job.py | 2 +- tests/client/test_project.py | 2 +- tests/client/test_spider.py | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 4f63f2f8..eb410a70 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -200,7 +200,7 @@ class Project(object): """ def __init__(self, client, projectid): - self.id = projectid + self.key = projectid self._client = client # sub-resources @@ -284,7 +284,7 @@ class Spider(object): >>> project.spiders.get('spider2') - >>> spider.id + >>> spider.key 2 >>> spider.name spider2 @@ -292,7 +292,7 @@ class Spider(object): def __init__(self, client, projectid, spiderid, spidername): self.projectid = projectid - self.id = spiderid + self.key = spiderid self.name = spidername self.jobs = Jobs(client, projectid, self) self._client = client @@ -435,13 +435,13 @@ def get(self, jobkey): Usage:: >>> job = project.jobs.get('123/1/2') - >>> job.id + >>> job.key '123/1/2' """ jobkey = parse_job_key(jobkey) if jobkey.projectid != self.projectid: raise ValueError('Please use same project id') - if self.spider and jobkey.spiderid != self.spider.id: + if self.spider and jobkey.spiderid != self.spider.key: raise ValueError('Please use same spider id') return Job(self._client, str(jobkey)) @@ -503,8 +503,8 @@ def iter_last(self, **params): def _extract_spider_id(self, params): spiderid = params.pop('spiderid', None) if not spiderid and self.spider: - return self.spider.id - elif spiderid and self.spider and spiderid != self.spider.id: + return self.spider.key + elif spiderid and self.spider and spiderid != self.spider.key: raise ValueError('Please use same spider id') return spiderid @@ -563,7 +563,7 @@ class Job(object): Usage:: >>> job = project.job('123/1/2') - >>> job.id + >>> job.key '123/1/2' >>> job.metadata['state'] 'finished' diff --git a/tests/client/test_client.py b/tests/client/test_client.py index 0183c4c2..711a0acf 100644 --- a/tests/client/test_client.py +++ b/tests/client/test_client.py @@ -44,7 +44,7 @@ def test_client_projects_get_project(client): # testing with string project id p2 = projects.get(TEST_PROJECT_ID) assert isinstance(p2, Project) - assert p1.id == p2.id + assert p1.key == p2.key def test_client_projects_list_projects(client): diff --git a/tests/client/test_job.py b/tests/client/test_job.py index 8a5f79e8..f1a917f9 100644 --- a/tests/client/test_job.py +++ b/tests/client/test_job.py @@ -10,7 +10,7 @@ def test_job_base(client, spider): job = spider.jobs.schedule() assert isinstance(job, Job) assert job.projectid == int(TEST_PROJECT_ID) - assert job.key.startswith(TEST_PROJECT_ID + '/' + str(spider.id)) + assert job.key.startswith(TEST_PROJECT_ID + '/' + str(spider.key)) assert isinstance(job.items, Items) assert isinstance(job.logs, Logs) diff --git a/tests/client/test_project.py b/tests/client/test_project.py index 11009116..177d2ac5 100644 --- a/tests/client/test_project.py +++ b/tests/client/test_project.py @@ -14,7 +14,7 @@ def test_project_subresources(project): - assert project.id == int(TEST_PROJECT_ID) + assert project.key == int(TEST_PROJECT_ID) assert isinstance(project.collections, Collections) assert isinstance(project.jobs, Jobs) assert isinstance(project.spiders, 
Spiders) diff --git a/tests/client/test_spider.py b/tests/client/test_spider.py index c7c9efef..cf6789bf 100644 --- a/tests/client/test_spider.py +++ b/tests/client/test_spider.py @@ -16,7 +16,7 @@ def test_spiders_get(project): spider = project.spiders.get(TEST_SPIDER_NAME) assert isinstance(spider, Spider) - assert isinstance(spider.id, int) + assert isinstance(spider.key, int) assert spider.name == TEST_SPIDER_NAME assert spider.projectid == int(TEST_PROJECT_ID) assert isinstance(spider.jobs, Jobs) @@ -32,7 +32,7 @@ def test_spiders_list(project): def test_spider_base(project, spider): - assert spider.id == 1 + assert spider.key == 1 assert spider.name == TEST_SPIDER_NAME assert spider.projectid == int(TEST_PROJECT_ID) assert isinstance(project.jobs, Jobs) @@ -152,7 +152,7 @@ def test_spider_jobs_get(spider): with pytest.raises(ValueError): spider.jobs.get(TEST_PROJECT_ID + '/2/3') - fake_job_id = str(JobKey(TEST_PROJECT_ID, spider.id, 3)) + fake_job_id = str(JobKey(TEST_PROJECT_ID, spider.key, 3)) fake_job = spider.jobs.get(fake_job_id) assert isinstance(fake_job, Job) From 22d042876b18c1036f426b55a0dea0359bec4b80 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 7 Feb 2017 13:00:20 +0300 Subject: [PATCH 02/35] collection.set() should return None --- scrapinghub/client.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index eb410a70..e84fc221 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -1018,3 +1018,10 @@ def get(self, key, *args, **kwargs): if key is None: raise ValueError("key cannot be None") return self._origin.get(key, *args, **kwargs) + + def set(self, *args, **kwargs): + """Set item to collection by key. + + The method returns None (original method returns an empty generator). + """ + self._origin.set(*args, **kwargs) From 23a7d5285b731dea332e5b5183d74bf5886b093d Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 7 Feb 2017 14:42:44 +0300 Subject: [PATCH 03/35] Better support for filter parameter --- scrapinghub/client.py | 57 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index e84fc221..1f2529d4 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -742,18 +742,33 @@ def __init__(self, cls, client, key): ('iter_raw_json', 'iter_json')] self._proxy_methods(methods) - # apply_iter_filters is responsible to modify filter params for all - # iter* calls: should be used only if defined for a child class - if hasattr(self, '_modify_iter_filters'): - apply_fn = getattr(self, '_modify_iter_filters') - for method in [method[0] for method in methods]: - wrapped = wrap_kwargs(getattr(self, method), apply_fn) - setattr(self, method, wrapped) + # modify filter params for all iter* methods + for method in [method[0] for method in methods]: + wrapped = wrap_kwargs(getattr(self, method), + self._modify_iter_filters) + setattr(self, method, wrapped) def _proxy_methods(self, methods): """A little helper for cleaner interface.""" proxy_methods(self._origin, self, methods) + def _modify_iter_filters(self, params): + """Modify iter() filters on-the-fly. + + Base implementation is responsible for improving filters support + to pass multiple filters at once as a list with tuples. 
+ """ + filters = params.get('filter') + if filters and isinstance(filters, list): + filter_data = [] + for elem in params.pop('filter'): + if not isinstance(elem, (list, tuple)): + raise ValueError("Filter condition must be tuple or list") + filter_data.append(json.dumps(elem)) + if filter_data: + params['filter'] = filter_data + return params + class Logs(_Proxy): """Representation of collection of job logs. @@ -774,7 +789,17 @@ class Logs(_Proxy): [{ 'level': 20, 'message': '[scrapy.core.engine] Closing spider (finished)', - 'time': 1482233733976}, + 'time': 1482233733976, + }] + + - retrive logs with a given log level and filter by a word + + >>> filters = [("message", "contains", ["logger"])] + >>> list(job.logs.iter(level='WARNING', filter=filters)) + [{ + 'level': 30, + 'message': 'Some warning message', + 'time': 1486375511188, }] """ @@ -792,6 +817,7 @@ def _modify_iter_filters(self, params): :param params: an original dictionary with params :return: a modified dictionary with params """ + params = super(Logs, self)._modify_iter_filters(params) offset = params.pop('offset', None) if offset: params['start'] = '{}/{}'.format(self.key, offset) @@ -800,7 +826,9 @@ def _modify_iter_filters(self, params): minlevel = getattr(LogLevel, level, None) if minlevel is None: raise ValueError("Unknown log level: {}".format(level)) - params['filter'] = json.dumps(['level', '>=', [minlevel]]) + level_filter = json.dumps(['level', '>=', [minlevel]]) + # there can already be some filters handled by super class method + params['filter'] = params.get('filter', []) + [level_filter] return params @@ -822,7 +850,15 @@ class Items(_Proxy): >>> list(job.items.iter(startts=1447221694537)) {'name': ['Some custom item'], - 'url': 'http://some-url/item.html'}] + 'url': 'http://some-url/item.html', + 'size': 100000} + + - retrieve 1 item with multiple filters: + >>> filters = [("size", ">", [30000]), ("size", "<", [40000])] + >>> list(job.items.iter(count=1, filter=filters)) + {'name': ['Some other item'], + 'url': 'http://some-url/other-item.html', + 'size': 50000} """ def _modify_iter_filters(self, params): @@ -831,6 +867,7 @@ def _modify_iter_filters(self, params): Returns: dict: updated set of params """ + params = super(Items, self)._modify_iter_filters(params) offset = params.pop('offset', None) if offset: params['start'] = '{}/{}'.format(self.key, offset) From 459c928961cfa57e7e79931d697cc298ed68c80e Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 7 Feb 2017 18:45:23 +0300 Subject: [PATCH 04/35] Add base validation for adding activities --- scrapinghub/client.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 1f2529d4..dbeb8507 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -956,10 +956,14 @@ def __init__(self, *args, **kwargs): self._proxy_methods([('iter', 'list')]) def add(self, *args, **kwargs): - self._origin.add(*args, **kwargs) + entry = dict(*args, **kwargs) + return self.post(entry) - def post(self, *args, **kwargs): - self._origin.post(*args, **kwargs) + def post(self, _value, **kwargs): + jobkey = _value.get('job') or kwargs.get('job') + if jobkey and parse_job_key(jobkey).projectid != self.key: + raise ValueError('Please use same project id') + self._origin.post(_value, **kwargs) class Collections(_Proxy): From 79dd8893c72877b6b4c9277259fa898865144023 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 7 Feb 2017 19:14:34 +0300 Subject: [PATCH 05/35] Add list & drop iter* for 
collections --- scrapinghub/client.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index dbeb8507..e0870467 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -736,7 +736,8 @@ def __init__(self, cls, client, key): setattr(self, 'write', wrap_value_too_large(origin_method)) # DType iter_values() has more priority than IType list() - if issubclass(cls, DownloadableResource): + # plus Collections interface doesn't need the iter methods + if issubclass(cls, DownloadableResource) and cls is not Collections: methods = [('iter', 'iter_values'), ('iter_raw_msgpack', 'iter_msgpack'), ('iter_raw_json', 'iter_json')] @@ -975,6 +976,8 @@ class Collections(_Proxy): Usage:: >>> collections = project.collections + >>> collections.list() + [{'name': 'Pages', 'type': 's'}] >>> foo_store = collections.get_store('foo_store') """ @@ -995,6 +998,9 @@ def get_versioned_store(self, colname): def get_versioned_cached_store(self, colname): return self.get('vcs', colname) + def list(self): + return list(self._origin.apiget('list')) + class Collection(object): """Representation of a project collection object. From ed6482e1dfc99bbffeb950e523c121c6466bba73 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 7 Feb 2017 19:19:55 +0300 Subject: [PATCH 06/35] Add AuthorizationError and handle HTTP 401 --- scrapinghub/exceptions.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scrapinghub/exceptions.py b/scrapinghub/exceptions.py index 969700a2..5b08c045 100644 --- a/scrapinghub/exceptions.py +++ b/scrapinghub/exceptions.py @@ -35,6 +35,10 @@ class InvalidUsage(ScrapinghubAPIError): pass +class AuthorizationError(ScrapinghubAPIError): + pass + + class NotFound(ScrapinghubAPIError): pass @@ -56,6 +60,11 @@ def wrapped(*args, **kwargs): status_code = exc.response.status_code if status_code == 400: raise InvalidUsage(http_error=exc) + elif status_code == 401: + raise AuthorizationError( + message='Please check your credentials.', + http_error=exc, + ) elif status_code == 404: raise NotFound(http_error=exc) elif status_code == 413: From d40a2f87e739122326a7838ef4d3cd71821e7d50 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 7 Feb 2017 19:38:52 +0300 Subject: [PATCH 07/35] Unify authentication errors --- scrapinghub/exceptions.py | 9 ++++----- scrapinghub/legacy.py | 5 +++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/scrapinghub/exceptions.py b/scrapinghub/exceptions.py index 5b08c045..c0de28d2 100644 --- a/scrapinghub/exceptions.py +++ b/scrapinghub/exceptions.py @@ -35,7 +35,7 @@ class InvalidUsage(ScrapinghubAPIError): pass -class AuthorizationError(ScrapinghubAPIError): +class Unauthorized(ScrapinghubAPIError): pass @@ -61,10 +61,7 @@ def wrapped(*args, **kwargs): if status_code == 400: raise InvalidUsage(http_error=exc) elif status_code == 401: - raise AuthorizationError( - message='Please check your credentials.', - http_error=exc, - ) + raise Unauthorized(http_error=exc) elif status_code == 404: raise NotFound(http_error=exc) elif status_code == 413: @@ -80,6 +77,8 @@ def wrapped(*args, **kwargs): raise ValueError(msg) elif exc._type == APIError.ERR_INVALID_USAGE: raise InvalidUsage(msg) + elif exc._type == APIError.ERR_AUTH_ERROR: + raise Unauthorized(http_error=exc) raise ScrapinghubAPIError(msg) return wrapped diff --git a/scrapinghub/legacy.py b/scrapinghub/legacy.py index 87c8df44..59bf01f7 100644 --- a/scrapinghub/legacy.py +++ b/scrapinghub/legacy.py @@ -146,6 +146,10 @@ def 
_decode_response(self, response, format, raw): try: if data['status'] == 'ok': return data + elif (data['status'] == 'error' and + data['message'] == 'Authentication failed'): + raise APIError(data['message'], + _type=APIError.ERR_AUTH_ERROR) elif data['status'] in ('error', 'badrequest'): raise APIError(data['message'], _type=APIError.ERR_INVALID_USAGE) @@ -402,6 +406,7 @@ class APIError(Exception): ERR_NOT_FOUND = 1 ERR_VALUE_ERROR = 2 ERR_INVALID_USAGE = 3 + ERR_AUTH_ERROR = 4 def __init__(self, message, _type=None): super(APIError, self).__init__(message) From b5d49422a21a1eb538c8c58413e2773965e22ce4 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 8 Feb 2017 14:11:19 +0300 Subject: [PATCH 08/35] Refactor logic for wrapping iter* params To provide better support for filter param for all classes having HS-based iter* methods we have to simplify it and make the logic reusable. The changes also contain minor renaming for cleaner code. --- scrapinghub/client.py | 46 ++++++++++++++++++++----------------------- scrapinghub/utils.py | 18 +++++++++++++++++ 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index e0870467..43dae038 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -29,6 +29,7 @@ from .utils import parse_project_id, parse_job_key from .utils import proxy_methods from .utils import wrap_kwargs +from .utils import format_iter_filters class Connection(_Connection): @@ -742,33 +743,22 @@ def __init__(self, cls, client, key): ('iter_raw_msgpack', 'iter_msgpack'), ('iter_raw_json', 'iter_json')] self._proxy_methods(methods) - - # modify filter params for all iter* methods - for method in [method[0] for method in methods]: - wrapped = wrap_kwargs(getattr(self, method), - self._modify_iter_filters) - setattr(self, method, wrapped) + self._wrap_iter_methods([method[0] for method in methods]) def _proxy_methods(self, methods): """A little helper for cleaner interface.""" proxy_methods(self._origin, self, methods) - def _modify_iter_filters(self, params): - """Modify iter() filters on-the-fly. + def _wrap_iter_methods(self, methods): + """Modify kwargs for all passed self.iter* methods.""" + for method in methods: + wrapped = wrap_kwargs(getattr(self, method), + self._modify_iter_params) + setattr(self, method, wrapped) - Base implementation is responsible for improving filters support - to pass multiple filters at once as a list with tuples. - """ - filters = params.get('filter') - if filters and isinstance(filters, list): - filter_data = [] - for elem in params.pop('filter'): - if not isinstance(elem, (list, tuple)): - raise ValueError("Filter condition must be tuple or list") - filter_data.append(json.dumps(elem)) - if filter_data: - params['filter'] = filter_data - return params + def _modify_iter_params(self, params): + """Modify iter() params on-the-fly.""" + return format_iter_filters(params) class Logs(_Proxy): @@ -809,7 +799,7 @@ def __init__(self, *args, **kwargs): self._proxy_methods(['log', 'debug', 'info', 'warning', 'warn', 'error', 'batch_write_start']) - def _modify_iter_filters(self, params): + def _modify_iter_params(self, params): """Modify iter() filters on-the-fly. 
- convert offset to start parameter @@ -818,7 +808,7 @@ def _modify_iter_filters(self, params): :param params: an original dictionary with params :return: a modified dictionary with params """ - params = super(Logs, self)._modify_iter_filters(params) + params = super(Logs, self)._modify_iter_params(params) offset = params.pop('offset', None) if offset: params['start'] = '{}/{}'.format(self.key, offset) @@ -862,13 +852,13 @@ class Items(_Proxy): 'size': 50000} """ - def _modify_iter_filters(self, params): + def _modify_iter_params(self, params): """Modify iter filter to convert offset to start parameter. Returns: dict: updated set of params """ - params = super(Items, self)._modify_iter_filters(params) + params = super(Items, self)._modify_iter_params(params) offset = params.pop('offset', None) if offset: params['start'] = '{}/{}'.format(self.key, offset) @@ -955,6 +945,7 @@ class Activity(_Proxy): def __init__(self, *args, **kwargs): super(Activity, self).__init__(*args, **kwargs) self._proxy_methods([('iter', 'list')]) + self._wrap_iter_methods(['iter']) def add(self, *args, **kwargs): entry = dict(*args, **kwargs) @@ -1055,6 +1046,11 @@ def __init__(self, client, collections, coltype, colname): ('iter', 'iter_values'), ('iter_raw_json', 'iter_json'), ]) + # simplified version of _Proxy._wrap_iter_methods logic + # to provide better support for filter param in iter methods + for method in ['iter', 'iter_raw_json']: + wrapped = wrap_kwargs(getattr(self, method), format_iter_filters) + setattr(self, method, wrapped) def get(self, key, *args, **kwargs): """Get item from collection by key. diff --git a/scrapinghub/utils.py b/scrapinghub/utils.py index ac5f0b8e..932b0d04 100644 --- a/scrapinghub/utils.py +++ b/scrapinghub/utils.py @@ -1,3 +1,4 @@ +import json import logging from six import string_types @@ -83,3 +84,20 @@ def proxy_methods(origin, successor, methods): successor_name, origin_name = method, method if not hasattr(successor, successor_name): setattr(successor, successor_name, getattr(origin, origin_name)) + + +def format_iter_filters(params): + """Format iter() filter param on-the-fly. + + Support passing multiple filters at once as a list with tuples. 
+ """ + filters = params.get('filter') + if filters and isinstance(filters, list): + filter_data = [] + for elem in params.pop('filter'): + if not isinstance(elem, (list, tuple)): + raise ValueError("Filter condition must be tuple or list") + filter_data.append(json.dumps(elem)) + if filter_data: + params['filter'] = filter_data + return params From 5c5a9ffca4bc77ae3ff5afaed1332aca0d47fefe Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 11:03:00 +0300 Subject: [PATCH 09/35] Accept strings in iter filter param --- scrapinghub/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scrapinghub/utils.py b/scrapinghub/utils.py index 932b0d04..c87fb676 100644 --- a/scrapinghub/utils.py +++ b/scrapinghub/utils.py @@ -95,9 +95,13 @@ def format_iter_filters(params): if filters and isinstance(filters, list): filter_data = [] for elem in params.pop('filter'): - if not isinstance(elem, (list, tuple)): - raise ValueError("Filter condition must be tuple or list") - filter_data.append(json.dumps(elem)) + if isinstance(elem, string_types): + filter_data.append(elem) + elif isinstance(elem, (list, tuple)): + filter_data.append(json.dumps(elem)) + else: + raise ValueError( + "Filter condition must be string, tuple or list") if filter_data: params['filter'] = filter_data return params From 793a4e1b980c59d3b2529c5752b4981dc4747ba5 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 11:08:57 +0300 Subject: [PATCH 10/35] Some minor fixes after CR Fix spider id in docstrings to keep consistency Don't proxy methods overwritten in Collection class --- scrapinghub/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 43dae038..b689ae2d 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -286,7 +286,7 @@ class Spider(object): >>> project.spiders.get('spider2') >>> spider.key - 2 + 1 >>> spider.name spider2 """ @@ -1042,7 +1042,7 @@ def __init__(self, client, collections, coltype, colname): self._client = client self._origin = _Collection(coltype, colname, collections._origin) proxy_methods(self._origin, self, [ - 'create_writer', 'get', 'set', 'delete', 'count', + 'create_writer', 'delete', 'count', ('iter', 'iter_values'), ('iter_raw_json', 'iter_json'), ]) From 68f0428755f873dd7393d8448cdfcbd4557b217b Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 11:28:35 +0300 Subject: [PATCH 11/35] Unify test data (projects,spiders) in docstrings --- scrapinghub/client.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index b689ae2d..8b252c58 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -96,7 +96,7 @@ def get_job(self, jobkey): Usage:: - >>> job = client.get_job('1/2/3') + >>> job = client.get_job('123/1/1') >>> job """ @@ -283,12 +283,12 @@ class Spider(object): Usage:: - >>> project.spiders.get('spider2') + >>> spider = project.spiders.get('spider1') >>> spider.key 1 >>> spider.name - spider2 + spider1 """ def __init__(self, client, projectid, spiderid, spidername): @@ -313,7 +313,7 @@ class Jobs(object): >>> project.jobs - >>> spider = project.spiders.get('spider2') + >>> spider = project.spiders.get('spider1') >>> spider.jobs """ @@ -333,6 +333,7 @@ def count(self, **params): Usage:: + >>> spider = project.spiders.get('spider1') >>> spider.jobs.count() 5 >>> project.jobs.count(spider='spider2', state='finished') @@ -384,7 +385,7 @@ def 
iter(self, **params): - get certain number of last finished jobs per some spider:: >>> jobs_summary = project.jobs.iter( - ... spider='foo', state='finished', count=3) + ... spider='spider2', state='finished', count=3) """ if self.spider: params['spider'] = self.spider.name @@ -400,7 +401,7 @@ def schedule(self, spidername=None, **params): Usage:: - >>> project.schedule('myspider', arg1='val1') + >>> project.schedule('spider1', arg1='val1') '123/1/1' """ if not spidername and not self.spider: @@ -526,13 +527,14 @@ def update_tags(self, add=None, remove=None, spidername=None): - mark all spider jobs with tag ``consumed``:: + >>> spider = project.spiders.get('spider1') >>> spider.jobs.update_tags(add=['consumed']) 5 - remove existing tag ``existing`` for all spider jobs:: >>> project.jobs.update_tags( - ... remove=['existing'], spidername='spider') + ... remove=['existing'], spidername='spider2') 2 """ spidername = spidername or (self.spider.name if self.spider else None) From 3b1865432c694cb862732d413301cd95f018b651 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 12:02:54 +0300 Subject: [PATCH 12/35] Add list method for all entities (with a warning) --- scrapinghub/client.py | 91 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 17 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 8b252c58..256a9ea4 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -391,6 +391,16 @@ def iter(self, **params): params['spider'] = self.spider.name return self._project.jobq.list(**params) + def list(self, **params): + """Convenient shortcut to list iter results. + + Please note that list() method can use a lot of memory and for a large + amount of jobs it's recommended to iterate through it via iter() + method (all params and available filters are same for both methods). + + """ + return list(self.iter(**params)) + def schedule(self, spidername=None, **params): """Schedule a new job and returns its jobkey. @@ -762,6 +772,9 @@ def _modify_iter_params(self, params): """Modify iter() params on-the-fly.""" return format_iter_filters(params) + def list(self, *args, **kwargs): + return list(self.iter(*args, **kwargs)) + class Logs(_Proxy): """Representation of collection of job logs. @@ -769,6 +782,10 @@ class Logs(_Proxy): Not a public constructor: use :class:`Job` instance to get a :class:`Logs` instance. See :attr:`Job.logs` attribute. + Please note that list() method can use a lot of memory and for a large + amount of logs it's recommended to iterate through it via iter() method + (all params and available filters are same for both methods). + Usage: - retrieve all logs from a job:: @@ -776,9 +793,14 @@ class Logs(_Proxy): >>> job.logs.iter() + - iterate through first 100 log entries and print them:: + + >>> for log in job.logs.iter(count=100): + >>> ... print(log) + - retrieve a single log entry from a job:: - >>> list(job.logs.iter(count=1)) + >>> job.logs.list(count=1) [{ 'level': 20, 'message': '[scrapy.core.engine] Closing spider (finished)', @@ -788,7 +810,7 @@ class Logs(_Proxy): - retrive logs with a given log level and filter by a word >>> filters = [("message", "contains", ["logger"])] - >>> list(job.logs.iter(level='WARNING', filter=filters)) + >>> job.logs.list(level='WARNING', filter=filters) [{ 'level': 30, 'message': 'Some warning message', @@ -831,6 +853,10 @@ class Items(_Proxy): Not a public constructor: use :class:`Job` instance to get a :class:`Items` instance. See :attr:`Job.items` attribute. 
+ Please note that list() method can use a lot of memory and for a large + amount of items it's recommended to iterate through it via iter() method + (all params and available filters are same for both methods). + Usage: - retrieve all scraped items from a job:: @@ -838,20 +864,29 @@ class Items(_Proxy): >>> job.items.iter() + - iterate through first 100 items and print them:: + + >>> for log in job.logs.iter(count=100): + >>> ... print(log) + - retrieve items with timestamp greater or equal to given timestamp (item here is an arbitrary dictionary depending on your code):: - >>> list(job.items.iter(startts=1447221694537)) - {'name': ['Some custom item'], - 'url': 'http://some-url/item.html', - 'size': 100000} + >>> job.items.list(startts=1447221694537) + [{ + 'name': ['Some custom item'], + 'url': 'http://some-url/item.html', + 'size': 100000, + }] - retrieve 1 item with multiple filters: >>> filters = [("size", ">", [30000]), ("size", "<", [40000])] - >>> list(job.items.iter(count=1, filter=filters)) - {'name': ['Some other item'], - 'url': 'http://some-url/other-item.html', - 'size': 50000} + >>> job.items.list(count=1, filter=filters) + [{ + 'name': ['Some other item'], + 'url': 'http://some-url/other-item.html', + 'size': 50000, + }] """ def _modify_iter_params(self, params): @@ -873,6 +908,10 @@ class Requests(_Proxy): Not a public constructor: use :class:`Job` instance to get a :class:`Requests` instance. See :attr:`Job.requests` attribute. + Please note that list() method can use a lot of memory and for a large + amount of requests it's recommended to iterate through it via iter() + method (all params and available filters are same for both methods). + Usage: - retrieve all requests from a job:: @@ -888,7 +927,7 @@ class Requests(_Proxy): - retrieve single request from a job:: - >>> list(job.requests.iter(count=1)) + >>> job.requests.list(count=1) [{ 'duration': 354, 'fp': '6d748741a927b10454c83ac285b002cd239964ea', @@ -910,6 +949,10 @@ class Samples(_Proxy): Not a public constructor: use :class:`Job` instance to get a :class:`Samples` instance. See :attr:`Job.samples` attribute. + Please note that list() method can use a lot of memory and for a large + amount of samples it's recommended to iterate through it via iter() + method (all params and available filters are same for both methods). + Usage: - retrieve all samples from a job:: @@ -919,7 +962,7 @@ class Samples(_Proxy): - retrieve samples with timestamp greater or equal to given timestamp:: - >>> list(job.samples.iter(startts=1484570043851)) + >>> job.samples.list(startts=1484570043851) [[1484570043851, 554, 576, 1777, 821, 0], [1484570046673, 561, 583, 1782, 821, 0]] """ @@ -931,6 +974,10 @@ class Activity(_Proxy): Not a public constructor: use :class:`Project` instance to get a :class:`Activity` instance. See :attr:`Project.activity` attribute. + Please note that list() method can use a lot of memory and for a large + amount of activities it's recommended to iterate through it via iter() + method (all params and available filters are same for both methods). 
+ Usage: - get all activity from a project:: @@ -940,7 +987,7 @@ class Activity(_Proxy): - get only last 2 events from a project:: - >>> list(p.activity.iter(count=2)) + >>> p.activity.list(count=2) [{'event': 'job:completed', 'job': '123/2/3', 'user': 'jobrunner'}, {'event': 'job:cancelled', 'job': '123/2/3', 'user': 'john'}] """ @@ -1026,13 +1073,14 @@ class Collection(object): - iterate iterate over _key & value pair:: - >>> list(foo_store.iter()) - [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', - 'value': '1447221694537'}] + >>> for elem in foo_store.iter(count=1)): + >>> ... print(elem) + [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', + 'value': '1447221694537'}] - filter by multiple keys, only values for keys that exist will be returned:: - >>> list(foo_store.iter(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])) + >>> foo_store.list(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah']) [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}] - delete an item by key:: @@ -1054,6 +1102,15 @@ def __init__(self, client, collections, coltype, colname): wrapped = wrap_kwargs(getattr(self, method), format_iter_filters) setattr(self, method, wrapped) + def list(self, *args, **kwargs): + """Convenient shortcut to list iter results. + + Please note that list() method can use a lot of memory and for a large + amount of elements it's recommended to iterate through it via iter() + method (all params and available filters are same for both methods). + """ + return list(self.iter(*args, **kwargs)) + def get(self, key, *args, **kwargs): """Get item from collection by key. From 1d1d5eedffd40304c6a585de4fa5ed9fd230fec9 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 12:11:49 +0300 Subject: [PATCH 13/35] Use string keys everywhere --- scrapinghub/client.py | 4 ++-- scrapinghub/utils.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 256a9ea4..bf710ca2 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -286,9 +286,9 @@ class Spider(object): >>> spider = project.spiders.get('spider1') >>> spider.key - 1 + '1' >>> spider.name - spider1 + 'spider1' """ def __init__(self, client, projectid, spiderid, spidername): diff --git a/scrapinghub/utils.py b/scrapinghub/utils.py index c87fb676..ad3d2e58 100644 --- a/scrapinghub/utils.py +++ b/scrapinghub/utils.py @@ -26,10 +26,10 @@ def __str__(self): def parse_project_id(projectid): try: - projectid = int(projectid) + int(projectid) except ValueError: - raise ValueError("Project ID should be convertible to integer") - return projectid + raise ValueError("Project id should be convertible to integer") + return str(projectid) def parse_job_key(jobkey): @@ -42,10 +42,10 @@ def parse_job_key(jobkey): if len(parts) != 3: raise ValueError("Job key should consist of projectid/spiderid/jobid") try: - parts = map(int, parts) + map(int, parts) except ValueError: raise ValueError("Job key parts should be integers") - return JobKey(*parts) + return JobKey(*map(str, parts)) def get_tags_for_update(**kwargs): From 707a6adb0f319e8aa575c60f832030d65ba8f3fe Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 12:34:16 +0300 Subject: [PATCH 14/35] Spider key should also be a string --- scrapinghub/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index bf710ca2..a106f810 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -293,7 +293,7 @@ class 
Spider(object): def __init__(self, client, projectid, spiderid, spidername): self.projectid = projectid - self.key = spiderid + self.key = str(spiderid) self.name = spidername self.jobs = Jobs(client, projectid, self) self._client = client From 1ee768d52c5c9e2b02b8d11f1af3ac247d12aa79 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 13:10:54 +0300 Subject: [PATCH 15/35] Add validation for collection.delete input --- scrapinghub/client.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index a106f810..4586372b 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -1,5 +1,7 @@ import json +from six import string_types + from scrapinghub import APIError from scrapinghub import Connection as _Connection from scrapinghub import HubstorageClient as _HubstorageClient @@ -1092,7 +1094,7 @@ def __init__(self, client, collections, coltype, colname): self._client = client self._origin = _Collection(coltype, colname, collections._origin) proxy_methods(self._origin, self, [ - 'create_writer', 'delete', 'count', + 'create_writer', 'count', ('iter', 'iter_values'), ('iter_raw_json', 'iter_json'), ]) @@ -1127,3 +1129,15 @@ def set(self, *args, **kwargs): The method returns None (original method returns an empty generator). """ self._origin.set(*args, **kwargs) + + def delete(self, _keys): + """Delete item(s) from collection by key(s). + + The method returns None (original method returns an empty generator). + """ + if (not isinstance(_keys, string_types) and + not(isinstance(_keys, (list, tuple)) and + all(isinstance(key, string_types) for key in _keys))): + raise ValueError( + "You should provide a string key or a list of string keys") + self._origin.delete(_keys) From 100476c7ff059dfbe5eec67e9ff3638560bb8a31 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 13:14:38 +0300 Subject: [PATCH 16/35] Check input for update_tags logic --- scrapinghub/client.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 4586372b..f9459b3f 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -549,6 +549,8 @@ def update_tags(self, add=None, remove=None, spidername=None): ... 
remove=['existing'], spidername='spider2') 2 """ + if not (add or remove): + raise ValueError('Please provide tags to add or remove') spidername = spidername or (self.spider.name if self.spider else None) if not spidername: raise ValueError('Please provide spidername') @@ -630,6 +632,8 @@ def update_tags(self, add=None, remove=None): >>> job.update_tags(add=['consumed']) """ + if not (add or remove): + raise ValueError('Please provide tags to add or remove') params = get_tags_for_update(add_tag=add, remove_tag=remove) if not params: return From f8f846dd0116781d6821b63e3713697338b73f31 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 16:03:30 +0300 Subject: [PATCH 17/35] Add spider.key for consistency, minor fixes --- scrapinghub/client.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index f9459b3f..b4d56a90 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -200,10 +200,12 @@ class Project(object): >>> project = client.get_project(123) >>> project + >>> project.key + '123' """ def __init__(self, client, projectid): - self.key = projectid + self.key = str(projectid) self._client = client # sub-resources @@ -287,15 +289,18 @@ class Spider(object): >>> spider = project.spiders.get('spider1') - >>> spider.key + >>> spider.id '1' + >>> spider.key + '123/1' >>> spider.name 'spider1' """ def __init__(self, client, projectid, spiderid, spidername): self.projectid = projectid - self.key = str(spiderid) + self.key = '{}/{}'.format(str(projectid), str(spiderid)) + self.id = str(spiderid) self.name = spidername self.jobs = Jobs(client, projectid, self) self._client = client @@ -455,7 +460,7 @@ def get(self, jobkey): jobkey = parse_job_key(jobkey) if jobkey.projectid != self.projectid: raise ValueError('Please use same project id') - if self.spider and jobkey.spiderid != self.spider.key: + if self.spider and jobkey.spiderid != self.spider.id: raise ValueError('Please use same spider id') return Job(self._client, str(jobkey)) @@ -517,10 +522,10 @@ def iter_last(self, **params): def _extract_spider_id(self, params): spiderid = params.pop('spiderid', None) if not spiderid and self.spider: - return self.spider.key - elif spiderid and self.spider and spiderid != self.spider.key: + return self.spider.id + elif spiderid and self.spider and str(spiderid) != self.spider.id: raise ValueError('Please use same spider id') - return spiderid + return str(spiderid) def update_tags(self, add=None, remove=None, spidername=None): """Update tags for all existing spider jobs. From f9330fb99e172a5de256e3a7af232ccc096fa196 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 16:12:56 +0300 Subject: [PATCH 18/35] Fix minor bug when using project.jobs --- scrapinghub/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index b4d56a90..b2582a07 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -525,7 +525,7 @@ def _extract_spider_id(self, params): return self.spider.id elif spiderid and self.spider and str(spiderid) != self.spider.id: raise ValueError('Please use same spider id') - return str(spiderid) + return str(spiderid) if spiderid else None def update_tags(self, add=None, remove=None, spidername=None): """Update tags for all existing spider jobs. 
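
Taken together, the key-related patches above settle on plain string keys for projects, spiders and jobs. A short doctest-style sketch of the resulting conventions (ids are illustrative, assuming project 123 whose first spider is named 'spider1'; the outputs mirror the docstrings in the diffs above)::

    >>> project = client.get_project(123)
    >>> project.key
    '123'
    >>> spider = project.spiders.get('spider1')
    >>> spider.key
    '123/1'
    >>> job = spider.jobs.get('123/1/2')
    >>> job.key
    '123/1/2'
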
From c457d407585c56714e5061fc16842891fc0abf39 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 9 Feb 2017 16:52:20 +0300 Subject: [PATCH 19/35] Move check for update_tags input on server-side --- scrapinghub/client.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index b2582a07..288f23ac 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -554,8 +554,6 @@ def update_tags(self, add=None, remove=None, spidername=None): ... remove=['existing'], spidername='spider2') 2 """ - if not (add or remove): - raise ValueError('Please provide tags to add or remove') spidername = spidername or (self.spider.name if self.spider else None) if not spidername: raise ValueError('Please provide spidername') @@ -637,8 +635,6 @@ def update_tags(self, add=None, remove=None): >>> job.update_tags(add=['consumed']) """ - if not (add or remove): - raise ValueError('Please provide tags to add or remove') params = get_tags_for_update(add_tag=add, remove_tag=remove) if not params: return From 360a12fa3ab8db2d78745b0a0c93685afa379408 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 16 Feb 2017 14:06:05 +0300 Subject: [PATCH 20/35] Add spider.update_tags logic --- scrapinghub/client.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 288f23ac..e791992f 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -1,6 +1,7 @@ import json from six import string_types +from requests.compat import urljoin from scrapinghub import APIError from scrapinghub import Connection as _Connection @@ -305,6 +306,16 @@ def __init__(self, client, projectid, spiderid, spidername): self.jobs = Jobs(client, projectid, self) self._client = client + @wrap_http_errors + def update_tags(self, add=None, remove=None): + params = get_tags_for_update(add=add, remove=remove) + if not params: + return + path = 'v2/projects/{}/spiders/{}/tags'.format(self.projectid, self.id) + url = urljoin(self._client._connection.url, path) + response = self._client._connection._session.patch(url, json=params) + response.raise_for_status() + class Jobs(object): """Class representing a collection of jobs for a project/spider. 
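
The new Spider.update_tags() above sends a PATCH request to the ``v2/projects/{}/spiders/{}/tags`` endpoint through the client's connection session. A rough usage sketch, assuming a spider obtained via project.spiders.get() (the list_tags() read-back only appears in the next patch, and the tag values are illustrative)::

    >>> spider.update_tags(add=['new1', 'new2'])
    >>> spider.list_tags()
    ['new1', 'new2']
    >>> spider.update_tags(add=['new2', 'new3'], remove=['new1'])
    >>> spider.list_tags()
    ['new2', 'new3']
    >>> spider.update_tags(remove=['new2', 'new3'])
    >>> spider.list_tags()
    []
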
From ffcbcbb1af5a551aa5f7ded79d7a3f1bcb4259b5 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 17 Feb 2017 13:35:29 +0300 Subject: [PATCH 21/35] Multiple minor changes after running tests Add Spider.list_tags method for consistency Fix catching DuplicateJobError after reworking error handling Do not check for empty updates in update_tags client logic Weakening keys check for Collection.delete: any iterables are allowed Fixed spider fixture to clean tags on start when running in real Add missing tests for new functionality (list, filters, activity) --- scrapinghub/client.py | 29 ++++---- tests/client/conftest.py | 11 ++- tests/client/test_collections.py | 13 ++++ tests/client/test_job.py | 7 +- tests/client/test_project.py | 28 +++++++- tests/client/test_proxy.py | 115 +++++++++++++++++++++++++++++-- tests/client/test_spider.py | 67 +++++++++++------- 7 files changed, 219 insertions(+), 51 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index e791992f..d5e981c3 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -1,9 +1,9 @@ import json +import collections from six import string_types from requests.compat import urljoin -from scrapinghub import APIError from scrapinghub import Connection as _Connection from scrapinghub import HubstorageClient as _HubstorageClient @@ -24,9 +24,9 @@ from .hubstorage.job import Samples as _Samples from .hubstorage.job import Requests as _Requests -from .exceptions import ( - NotFound, DuplicateJobError, wrap_http_errors, wrap_value_too_large -) +from .exceptions import NotFound, InvalidUsage, DuplicateJobError +from .exceptions import wrap_http_errors, wrap_value_too_large + from .utils import LogLevel from .utils import get_tags_for_update from .utils import parse_project_id, parse_job_key @@ -309,13 +309,19 @@ def __init__(self, client, projectid, spiderid, spidername): @wrap_http_errors def update_tags(self, add=None, remove=None): params = get_tags_for_update(add=add, remove=remove) - if not params: - return path = 'v2/projects/{}/spiders/{}/tags'.format(self.projectid, self.id) url = urljoin(self._client._connection.url, path) response = self._client._connection._session.patch(url, json=params) response.raise_for_status() + @wrap_http_errors + def list_tags(self): + path = 'v2/projects/{}/spiders/{}'.format(self.projectid, self.id) + url = urljoin(self._client._connection.url, path) + response = self._client._connection._session.get(url) + response.raise_for_status() + return response.json().get('tags', []) + class Jobs(object): """Class representing a collection of jobs for a project/spider. @@ -444,7 +450,7 @@ def schedule(self, spidername=None, **params): try: response = self._client._connection._post( 'schedule', 'json', params) - except APIError as exc: + except InvalidUsage as exc: if 'already scheduled' in str(exc): raise DuplicateJobError(exc) raise @@ -647,8 +653,6 @@ def update_tags(self, add=None, remove=None): >>> job.update_tags(add=['consumed']) """ params = get_tags_for_update(add_tag=add, remove_tag=remove) - if not params: - return params.update({'project': self.projectid, 'job': self.key}) result = self._client._connection._post('jobs_update', 'json', params) return result['count'] @@ -1152,8 +1156,7 @@ def delete(self, _keys): The method returns None (original method returns an empty generator). 
""" if (not isinstance(_keys, string_types) and - not(isinstance(_keys, (list, tuple)) and - all(isinstance(key, string_types) for key in _keys))): - raise ValueError( - "You should provide a string key or a list of string keys") + not isinstance(_keys, collections.Iterable)): + raise ValueError("You should provide string key or iterable " + "object providing string keys") self._origin.delete(_keys) diff --git a/tests/client/conftest.py b/tests/client/conftest.py index d451a19d..20b5fd59 100644 --- a/tests/client/conftest.py +++ b/tests/client/conftest.py @@ -69,7 +69,7 @@ def is_using_real_services(request): @pytest.fixture(scope='session') def client(): - return ScrapinghubClient(auth=TEST_ADMIN_AUTH, + return ScrapinghubClient(apikey=TEST_ADMIN_AUTH, endpoint=TEST_HS_ENDPOINT, dash_endpoint=TEST_DASH_ENDPOINT) @@ -81,10 +81,15 @@ def project(client): @my_vcr.use_cassette() @pytest.fixture(scope='session') -def spider(project): +def spider(project, request): # on normal conditions you can't create a new spider this way: # it can only be created on project deploy as usual - return project.spiders.get(TEST_SPIDER_NAME, create=True) + spider = project.spiders.get(TEST_SPIDER_NAME, create=True) + if is_using_real_services(request): + existing_tags = spider.list_tags() + if existing_tags: + spider.update_tags(remove=existing_tags) + return spider @pytest.fixture(scope='session') diff --git a/tests/client/test_collections.py b/tests/client/test_collections.py index 3bb52f11..d33083dc 100644 --- a/tests/client/test_collections.py +++ b/tests/client/test_collections.py @@ -12,6 +12,19 @@ def _mkitem(): field3=3, field4={'v4k': 'v4v'}) +def test_collections_list(project): + # create/check test collections + project.collections.get_store(TEST_COLLECTION_NAME), + project.collections.get_cached_store(TEST_COLLECTION_NAME), + project.collections.get_versioned_store(TEST_COLLECTION_NAME), + project.collections.get_versioned_cached_store(TEST_COLLECTION_NAME), + collections = project.collections.list() + assert isinstance(collections, list) + assert len(collections) >= 4 + for coltype in ('s', 'vs', 'cs', 'vcs'): + assert {'name': TEST_COLLECTION_NAME, 'type': coltype} in collections + + def test_simple_count(project, collection): test_item = dict(_mkitem()) test_item['_key'] = 'a' diff --git a/tests/client/test_job.py b/tests/client/test_job.py index f1a917f9..c4b7f8ff 100644 --- a/tests/client/test_job.py +++ b/tests/client/test_job.py @@ -9,8 +9,8 @@ def test_job_base(client, spider): job = spider.jobs.schedule() assert isinstance(job, Job) - assert job.projectid == int(TEST_PROJECT_ID) - assert job.key.startswith(TEST_PROJECT_ID + '/' + str(spider.key)) + assert job.projectid == TEST_PROJECT_ID + assert job.key.startswith(spider.key) assert isinstance(job.items, Items) assert isinstance(job.logs, Logs) @@ -72,8 +72,7 @@ def test_job_start_extras(spider): 'false': False, 'nil': None, } - started = next(job.start(**extras)) - assert job.key == started['key'] + assert job.start(**extras) == 'pending' for k, v in extras.items(): if type(v) is float: assert abs(job.metadata.get(k) - v) < 0.0001 diff --git a/tests/client/test_project.py b/tests/client/test_project.py index 177d2ac5..fc39739d 100644 --- a/tests/client/test_project.py +++ b/tests/client/test_project.py @@ -14,7 +14,7 @@ def test_project_subresources(project): - assert project.key == int(TEST_PROJECT_ID) + assert project.key == TEST_PROJECT_ID assert isinstance(project.collections, Collections) assert isinstance(project.jobs, Jobs) assert 
isinstance(project.spiders, Spiders) @@ -25,7 +25,7 @@ def test_project_subresources(project): def test_project_jobs(project): jobs = project.jobs - assert jobs.projectid == int(TEST_PROJECT_ID) + assert jobs.projectid == TEST_PROJECT_ID assert jobs.spider is None @@ -82,6 +82,30 @@ def test_project_jobs_iter(project): next(jobs1) +def test_project_jobs_list(project): + project.jobs.schedule(TEST_SPIDER_NAME, meta={'state': 'running'}) + + # no finished jobs + jobs0 = project.jobs.list() + assert isinstance(jobs0, list) + assert len(jobs0) == 0 + + # filter by state must work like for iter + jobs1 = project.jobs.list(state='running') + assert len(jobs1) == 1 + job = jobs1[0] + assert isinstance(job, dict) + ts = job.get('ts') + assert isinstance(ts, int) and ts > 0 + running_time = job.get('running_time') + assert isinstance(running_time, int) and running_time > 0 + elapsed = job.get('elapsed') + assert isinstance(elapsed, int) and elapsed > 0 + assert job.get('key').startswith(TEST_PROJECT_ID) + assert job.get('spider') == TEST_SPIDER_NAME + assert job.get('state') == 'running' + + def test_project_jobs_schedule(project): # scheduling on project level requires spidername with pytest.raises(ValueError): diff --git a/tests/client/test_proxy.py b/tests/client/test_proxy.py index 75b7cfae..c835507a 100644 --- a/tests/client/test_proxy.py +++ b/tests/client/test_proxy.py @@ -8,6 +8,7 @@ from scrapinghub.client import LogLevel from scrapinghub.hubstorage.serialization import mpdecode +from .conftest import TEST_PROJECT_ID # use some fixed timestamp to represent current time TEST_TS = 1476803148638 @@ -69,6 +70,45 @@ def test_logs_iter(spider): next(logs3).get('message') +def test_logs_list(spider): + job = spider.jobs.schedule() + _add_test_logs(job) + + logs1 = job.logs.list() + assert isinstance(logs1, list) + assert len(logs1) == 9 + assert logs1[0].get('message') == 'simple-msg1' + assert logs1[1].get('message') == 'simple-msg2' + assert logs1[2].get('message') == 'simple-msg3' + + # testing offset + logs2 = job.logs.list(offset=3) + assert len(logs2) == 6 + assert logs2[0].get('message') == 'debug-msg' + assert logs2[1].get('message') == 'info-msg' + + # testing level + logs3 = job.logs.list(level='ERROR') + assert len(logs3) == 1 + assert logs3[0].get('message') == 'error-msg' + + +def test_logs_list_filter(spider): + job = spider.jobs.schedule() + _add_test_logs(job) + + logs1 = job.logs.list(filter='["message", "contains", ["simple"]]') + assert isinstance(logs1, list) + assert len(logs1) == 3 + assert logs1[0].get('message') == 'simple-msg1' + + logs2 = job.logs.list(filter=[['message', 'contains', ['simple']]]) + assert len(logs2) == 3 + + logs3 = job.logs.list(filter=[('message', 'contains', ['simple'])]) + assert len(logs3) == 3 + + def test_logs_iter_raw_json(spider): job = spider.jobs.schedule() _add_test_logs(job) @@ -152,13 +192,18 @@ def test_requests_iter_raw_json(spider): next(rr) -def test_samples_iter(spider): - job = spider.jobs.schedule(meta={'state': 'running'}) - assert list(job.samples.iter()) == [] +def _add_test_samples(job): job.samples.write([TEST_TS, 1, 2, 3]) job.samples.write([TEST_TS + 1, 5, 9, 4]) job.samples.flush() job.samples.close() + + +def test_samples_iter(spider): + job = spider.jobs.schedule(meta={'state': 'running'}) + assert list(job.samples.iter()) == [] + _add_test_samples(job) + o = job.samples.iter() assert next(o) == [TEST_TS, 1, 2, 3] assert next(o) == [TEST_TS + 1, 5, 9, 4] @@ -166,13 +211,27 @@ def test_samples_iter(spider): next(o) -def 
test_items_iter(spider): +def test_samples_list(spider): job = spider.jobs.schedule(meta={'state': 'running'}) + _add_test_samples(job) + o = job.samples.list() + assert isinstance(o, list) + assert len(o) == 2 + assert o[0] == [TEST_TS, 1, 2, 3] + assert o[1] == [TEST_TS + 1, 5, 9, 4] + + +def _add_test_items(job): for i in range(3): job.items.write({'id': i, 'data': 'data' + str(i)}) job.items.flush() job.items.close() + +def test_items_iter(spider): + job = spider.jobs.schedule(meta={'state': 'running'}) + _add_test_items(job) + o = job.items.iter() assert next(o) == {'id': 0, 'data': 'data0'} assert next(o) == {'id': 1, 'data': 'data1'} @@ -196,3 +255,51 @@ def test_items_iter(spider): o = mpdecode(msgpacked_o) assert item['id'] == 2 assert item['data'] == 'data2' + + +def test_items_list(spider): + job = spider.jobs.schedule(meta={'state': 'running'}) + _add_test_items(job) + + o = job.items.list() + assert isinstance(o, list) + assert len(o) == 3 + assert o[0] == {'id': 0, 'data': 'data0'} + assert o[1] == {'id': 1, 'data': 'data1'} + assert o[2] == {'id': 2, 'data': 'data2'} + + +def _add_test_activity(project): + activity = project.activity + jobkey = TEST_PROJECT_ID + '/2/3' + activity.add(event='job:completed', job=jobkey, user='jobrunner') + activity.add(event='job:cancelled', job=jobkey, user='john') + + +def test_activity_wrong_project(project): + with pytest.raises(ValueError): + project.activity.add(event='job:completed', + job='123/1/1', user='user') + + +def test_activity_iter(project): + _add_test_activity(project) + activity = project.activity.iter() + assert isinstance(activity, types.GeneratorType) + activity_item = next(activity) + assert activity_item == {'event': 'job:cancelled', + 'job': TEST_PROJECT_ID + '/2/3', + 'user': 'john'} + + +def test_activity_list(project): + _add_test_activity(project) + activity = project.activity.list(count=2) + assert isinstance(activity, list) + assert len(activity) == 2 + assert activity[0] == {'event': 'job:cancelled', + 'job': TEST_PROJECT_ID + '/2/3', + 'user': 'john'} + assert activity[1] == {'event': 'job:completed', + 'job': TEST_PROJECT_ID + '/2/3', + 'user': 'jobrunner'} diff --git a/tests/client/test_spider.py b/tests/client/test_spider.py index cf6789bf..c4b4a1d4 100644 --- a/tests/client/test_spider.py +++ b/tests/client/test_spider.py @@ -2,9 +2,10 @@ from collections import defaultdict import pytest +from six import string_types from six.moves import range -from scrapinghub.exceptions import NotFound, DuplicateJobError +from scrapinghub.exceptions import NotFound, DuplicateJobError, InvalidUsage from scrapinghub.client import Jobs, Job from scrapinghub.client import Spider from scrapinghub.utils import JobKey @@ -16,9 +17,6 @@ def test_spiders_get(project): spider = project.spiders.get(TEST_SPIDER_NAME) assert isinstance(spider, Spider) - assert isinstance(spider.key, int) - assert spider.name == TEST_SPIDER_NAME - assert spider.projectid == int(TEST_PROJECT_ID) assert isinstance(spider.jobs, Jobs) with pytest.raises(NotFound): @@ -32,35 +30,30 @@ def test_spiders_list(project): def test_spider_base(project, spider): - assert spider.key == 1 + assert isinstance(spider.id, string_types) + assert isinstance(spider.key, string_types) + assert spider.key == spider.projectid + '/' + spider.id assert spider.name == TEST_SPIDER_NAME - assert spider.projectid == int(TEST_PROJECT_ID) + assert spider.projectid == TEST_PROJECT_ID assert isinstance(project.jobs, Jobs) -def test_spider_update_tags(project, spider): - # empty 
updates - assert spider.update_tags() is None - assert spider.update_tags( - add=['new1', 'new2'], remove=['old1', 'old2']) == 0 +def test_spider_list_update_tags(project, spider): + # FIXME empty update should fail + with pytest.raises(InvalidUsage): + spider.update_tags() - jobs = [project.jobs.schedule(TEST_SPIDER_NAME, subid='tags-' + str(i)) - for i in range(2)] - # FIXME the endpoint normalises tags so it's impossible to send tags - # having upper-cased symbols, let's add more tests when it's fixed - assert spider.update_tags(add=['tag-a', 'tag-b', 'tag-c']) == 2 - for job in jobs: - assert job.metadata.liveget('tags') == ['tag-a', 'tag-b', 'tag-c'] - assert spider.update_tags(remove=['tag-c', 'tag-d']) == 2 - for job in jobs: - assert job.metadata.liveget('tags') == ['tag-a', 'tag-b'] - # FIXME adding and removing tags at the same time doesn't work neither: - # remove-tag field is ignored if there's non-void add-tag field + spider.update_tags(add=['new1', 'new2']) + assert spider.list_tags() == ['new1', 'new2'] + spider.update_tags(add=['new2', 'new3'], remove=['new1']) + assert spider.list_tags() == ['new2', 'new3'] + spider.update_tags(remove=['new2', 'new3']) + assert spider.list_tags() == [] def test_spider_jobs(spider): jobs = spider.jobs - assert jobs.projectid == int(TEST_PROJECT_ID) + assert jobs.projectid == TEST_PROJECT_ID assert jobs.spider is spider @@ -113,6 +106,30 @@ def test_spider_jobs_iter(spider): next(jobs1) +def test_spider_jobs_list(spider): + spider.jobs.schedule(meta={'state': 'running'}) + + # no finished jobs + jobs0 = spider.jobs.list() + assert isinstance(jobs0, list) + assert len(jobs0) == 0 + + # filter by state must work like for iter + jobs1 = spider.jobs.list(state='running') + assert len(jobs1) == 1 + job = jobs1[0] + assert isinstance(job, dict) + ts = job.get('ts') + assert isinstance(ts, int) and ts > 0 + running_time = job.get('running_time') + assert isinstance(running_time, int) and running_time > 0 + elapsed = job.get('elapsed') + assert isinstance(elapsed, int) and elapsed > 0 + assert job.get('key').startswith(TEST_PROJECT_ID) + assert job.get('spider') == TEST_SPIDER_NAME + assert job.get('state') == 'running' + + def test_spider_jobs_schedule(spider): job0 = spider.jobs.schedule() assert isinstance(job0, Job) @@ -152,7 +169,7 @@ def test_spider_jobs_get(spider): with pytest.raises(ValueError): spider.jobs.get(TEST_PROJECT_ID + '/2/3') - fake_job_id = str(JobKey(TEST_PROJECT_ID, spider.key, 3)) + fake_job_id = str(JobKey(spider.projectid, spider.id, 3)) fake_job = spider.jobs.get(fake_job_id) assert isinstance(fake_job, Job) From fdab51f51205bae792d6ef61ca34abfdf31355de Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 17 Feb 2017 15:23:22 +0300 Subject: [PATCH 22/35] Add some tests for iter-filter logic --- tests/client/test_utils.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 tests/client/test_utils.py diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py new file mode 100644 index 00000000..763cb41a --- /dev/null +++ b/tests/client/test_utils.py @@ -0,0 +1,37 @@ +import pytest +from scrapinghub.utils import format_iter_filters + + +def test_format_iter_filters(): + # work with empty params + assert format_iter_filters({}) == {} + + # doesn't affect other params + params = {'a': 123, 'b': 456} + assert format_iter_filters(params) == params + + # pass filter as-is if not list + params = {'filter': 'some-string'} + assert format_iter_filters(params) == params + + # 
work fine with empty filter + params = {'filter': []} + assert format_iter_filters(params) == params + + # pass string filters as-is + params = {'filter': ['str1', 'str2']} + assert format_iter_filters(params) == params + + # converts list-formatted filters + params = {'filter': [['field', '>=', ['val']], 'filter2']} + assert (format_iter_filters(params) == + {'filter': ['["field", ">=", ["val"]]', 'filter2']}) + + # works the same with tuple entries + params = {'filter': [('field', '==', ['val'])]} + assert (format_iter_filters(params) == + {'filter': ['["field", "==", ["val"]]']}) + + # exception if entry is not list/tuple or string + with pytest.raises(ValueError): + format_iter_filters({'filter': ['test', 123]}) From 5aa7593a5d2e64c891368ae4d83b1a145365dfd5 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 17 Feb 2017 18:25:00 +0300 Subject: [PATCH 23/35] Spider id should be a private attribute --- README.rst | 10 +++++----- scrapinghub/client.py | 17 +++++++---------- tests/client/test_spider.py | 6 +++--- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index 303ea2e6..a399cc34 100644 --- a/README.rst +++ b/README.rst @@ -74,8 +74,8 @@ And select a particular project to work with:: >>> project = client.get_project(123) >>> project - >>> project.id - 123 + >>> project.key + '123' The above is a shortcut for ``client.projects.get(123)``. @@ -116,8 +116,8 @@ To select a particular spider to work with:: >>> spider = project.spiders.get('spider2') >>> spider - >>> spider.id - 2 + >>> spider.key + '123/2' >>> spider.name spider2 @@ -144,7 +144,7 @@ get To select a specific job for a project:: >>> job = project.jobs.get('123/1/2') - >>> job.id + >>> job.key '123/1/2' Also there's a shortcut to get same job with client instance:: diff --git a/scrapinghub/client.py b/scrapinghub/client.py index d5e981c3..2f390703 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -282,16 +282,12 @@ class Spider(object): a :class:`Spider` instance. See :meth:`Spiders.get` method. :ivar projectid: integer project id. - :ivar id: integer spider id. :ivar name: a spider name string. :ivar jobs: a collection of jobs, :class:`Jobs` object. 
Usage:: >>> spider = project.spiders.get('spider1') - - >>> spider.id - '1' >>> spider.key '123/1' >>> spider.name @@ -301,7 +297,7 @@ class Spider(object): def __init__(self, client, projectid, spiderid, spidername): self.projectid = projectid self.key = '{}/{}'.format(str(projectid), str(spiderid)) - self.id = str(spiderid) + self._id = str(spiderid) self.name = spidername self.jobs = Jobs(client, projectid, self) self._client = client @@ -309,14 +305,15 @@ def __init__(self, client, projectid, spiderid, spidername): @wrap_http_errors def update_tags(self, add=None, remove=None): params = get_tags_for_update(add=add, remove=remove) - path = 'v2/projects/{}/spiders/{}/tags'.format(self.projectid, self.id) + path = 'v2/projects/{}/spiders/{}/tags'.format(self.projectid, + self._id) url = urljoin(self._client._connection.url, path) response = self._client._connection._session.patch(url, json=params) response.raise_for_status() @wrap_http_errors def list_tags(self): - path = 'v2/projects/{}/spiders/{}'.format(self.projectid, self.id) + path = 'v2/projects/{}/spiders/{}'.format(self.projectid, self._id) url = urljoin(self._client._connection.url, path) response = self._client._connection._session.get(url) response.raise_for_status() @@ -477,7 +474,7 @@ def get(self, jobkey): jobkey = parse_job_key(jobkey) if jobkey.projectid != self.projectid: raise ValueError('Please use same project id') - if self.spider and jobkey.spiderid != self.spider.id: + if self.spider and jobkey.spiderid != self.spider._id: raise ValueError('Please use same spider id') return Job(self._client, str(jobkey)) @@ -539,8 +536,8 @@ def iter_last(self, **params): def _extract_spider_id(self, params): spiderid = params.pop('spiderid', None) if not spiderid and self.spider: - return self.spider.id - elif spiderid and self.spider and str(spiderid) != self.spider.id: + return self.spider._id + elif spiderid and self.spider and str(spiderid) != self.spider._id: raise ValueError('Please use same spider id') return str(spiderid) if spiderid else None diff --git a/tests/client/test_spider.py b/tests/client/test_spider.py index c4b4a1d4..916b1cb2 100644 --- a/tests/client/test_spider.py +++ b/tests/client/test_spider.py @@ -30,9 +30,9 @@ def test_spiders_list(project): def test_spider_base(project, spider): - assert isinstance(spider.id, string_types) + assert isinstance(spider._id, string_types) assert isinstance(spider.key, string_types) - assert spider.key == spider.projectid + '/' + spider.id + assert spider.key == spider.projectid + '/' + spider._id assert spider.name == TEST_SPIDER_NAME assert spider.projectid == TEST_PROJECT_ID assert isinstance(project.jobs, Jobs) @@ -169,7 +169,7 @@ def test_spider_jobs_get(spider): with pytest.raises(ValueError): spider.jobs.get(TEST_PROJECT_ID + '/2/3') - fake_job_id = str(JobKey(spider.projectid, spider.id, 3)) + fake_job_id = str(JobKey(spider.projectid, spider._id, 3)) fake_job = spider.jobs.get(fake_job_id) assert isinstance(fake_job, Job) From 0dad052e979338912e7a963494fe556df9a3051b Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 17 Feb 2017 18:29:08 +0300 Subject: [PATCH 24/35] Rename col.delete() _keys argument for simplicity --- scrapinghub/client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 2f390703..ff856dc0 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -1147,13 +1147,13 @@ def set(self, *args, **kwargs): """ self._origin.set(*args, **kwargs) - def delete(self, 
_keys): + def delete(self, keys): """Delete item(s) from collection by key(s). The method returns None (original method returns an empty generator). """ - if (not isinstance(_keys, string_types) and - not isinstance(_keys, collections.Iterable)): + if (not isinstance(keys, string_types) and + not isinstance(keys, collections.Iterable)): raise ValueError("You should provide string key or iterable " "object providing string keys") - self._origin.delete(_keys) + self._origin.delete(keys) From b48bfc06bbb96e66c572b461e2235b18caa3af76 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 17 Feb 2017 18:37:25 +0300 Subject: [PATCH 25/35] Add missing iter methods for more consistent API --- scrapinghub/client.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index ff856dc0..87e2302a 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -157,6 +157,13 @@ def list(self): """ return self._client._connection.project_ids() + def iter(self): + """Iterate through list of projects available to current user. + + Provided for the sake of API consistency. + """ + return iter(self.list()) + def summary(self, **params): """Get short summaries for all available user projects. @@ -274,6 +281,13 @@ def list(self): project = self._client._connection[self.projectid] return project.spiders() + def iter(self): + """Iterate through a list of spiders for a project. + + Provided for the sake of API consistency. + """ + return iter(self.list()) + class Spider(object): """Class representing a Spider object. @@ -1057,8 +1071,13 @@ def get_versioned_store(self, colname): def get_versioned_cached_store(self, colname): return self.get('vcs', colname) + def iter(self): + """Iterate through collections of a project.""" + return self._origin.apiget('list') + def list(self): - return list(self._origin.apiget('list')) + """List collections of a project.""" + return list(self.iter()) class Collection(object): From d5ef2c3ec84d3b8c48109efeb8e1c83b802e4ece Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 21 Feb 2017 19:02:57 +0300 Subject: [PATCH 26/35] Allow authorisation via jwt tokens --- scrapinghub/client.py | 14 ++++++++++---- scrapinghub/legacy.py | 10 +++++----- scrapinghub/utils.py | 27 +++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 87e2302a..9a0bfd0f 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -29,6 +29,7 @@ from .utils import LogLevel from .utils import get_tags_for_update +from .utils import parse_auth from .utils import parse_project_id, parse_job_key from .utils import proxy_methods from .utils import wrap_kwargs @@ -52,7 +53,7 @@ def request(self, *args, **kwargs): class ScrapinghubClient(object): """Main class to work with Scrapinghub API. - :param apikey: Scrapinghub APIKEY string. + :param auth: Scrapinghub APIKEY or other SH auth credentials. :param dash_endpoint: (optional) Scrapinghub Dash panel url. :param \*\*kwargs: (optional) Additional arguments for :class:`scrapinghub.hubstorage.HubstorageClient` constructor. 
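
With ``auth`` accepting more than a plain apikey, the same client constructor
can be fed any credential form that ``parse_auth`` understands. A rough sketch
(the ``'APIKEY'`` value is a made-up placeholder; the hex string is the
job-token example from the ``parse_auth`` docstring)::

    >>> from scrapinghub.client import ScrapinghubClient
    >>> # plain apikey string, as before
    >>> client = ScrapinghubClient('APIKEY')
    >>> # explicit (login, password) tuple
    >>> client = ScrapinghubClient(('user', 'pass'))
    >>> # hex-encoded 'jobkey:jwt-token' credentials
    >>> client = ScrapinghubClient('312f322f333a736f6d652e6a77742e746f6b656e')
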
@@ -67,10 +68,15 @@ class ScrapinghubClient(object): """ - def __init__(self, apikey=None, dash_endpoint=None, **kwargs): + def __init__(self, auth=None, dash_endpoint=None, **kwargs): self.projects = Projects(self) - self._connection = Connection(apikey=apikey, url=dash_endpoint) - self._hsclient = HubstorageClient(auth=apikey, **kwargs) + auth = parse_auth(auth) + connection_kwargs = {'apikey': auth, 'url': dash_endpoint} + if len(auth) == 2: + connection_kwargs['apikey'] = auth[0] + connection_kwargs['password'] = auth[1] + self._connection = Connection(**connection_kwargs) + self._hsclient = HubstorageClient(auth=auth, **kwargs) def get_project(self, projectid): """Get :class:`Project` instance with a given project id. diff --git a/scrapinghub/legacy.py b/scrapinghub/legacy.py index 59bf01f7..899e7b54 100644 --- a/scrapinghub/legacy.py +++ b/scrapinghub/legacy.py @@ -60,10 +60,10 @@ def __init__(self, apikey=None, password='', _old_passwd='', url=None): assert not apikey.startswith('http://'), \ "Instantiating scrapinghub.Connection with url as first argument is not supported" - assert not password, \ - "Authentication with user:pass is not supported, use your apikey instead" - + if password: + warnings.warn("A lot of endpoints support authentication only via apikey.") self.apikey = apikey + self.password = password or '' self.url = url or self.DEFAULT_ENDPOINT self._session = self._create_session() @@ -74,13 +74,13 @@ def __repr__(self): def auth(self): warnings.warn("'auth' connection attribute is deprecated, " "use 'apikey' attribute instead", stacklevel=2) - return (self.apikey, '') + return (self.apikey, self.password) def _create_session(self): from requests import session from scrapinghub import __version__ s = session() - s.auth = (self.apikey, '') + s.auth = (self.apikey, self.password) s.headers.update({ 'User-Agent': 'python-scrapinghub/{0}'.format(__version__), }) diff --git a/scrapinghub/utils.py b/scrapinghub/utils.py index ad3d2e58..fe2bebf1 100644 --- a/scrapinghub/utils.py +++ b/scrapinghub/utils.py @@ -1,6 +1,8 @@ import json import logging +import binascii +from codecs import decode from six import string_types @@ -105,3 +107,28 @@ def format_iter_filters(params): if filter_data: params['filter'] = filter_data return params + + +def parse_auth(auth): + """Parse authentification token. 
+ + >>> parse_auth(None) + >>> parse_auth(('user', 'pass')) + ('user', 'pass') + >>> parse_auth('user:pass') + ('user', 'pass') + >>> parse_auth('apikey') + ('apikey', '') + >>> parse_auth('312f322f333a736f6d652e6a77742e746f6b656e') + ('1/2/3', 'some.jwt.token') + """ + if auth is None or isinstance(auth, tuple): + return auth + try: + auth = decode(auth, 'hex_codec') + if not isinstance(auth, string_types): + auth = auth.decode('ascii') + except binascii.Error: + pass + user, _, password = auth.partition(':') + return user, password From cafc15a3d165975dcbad1a07437f9351bffe4d40 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 22 Feb 2017 12:22:04 +0300 Subject: [PATCH 27/35] Extend auth logic to cover all possible cases --- scrapinghub/client.py | 12 ++++----- scrapinghub/utils.py | 51 +++++++++++++++++++++++++++++-------- tests/client/conftest.py | 2 +- tests/client/test_utils.py | 52 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 99 insertions(+), 18 deletions(-) diff --git a/scrapinghub/client.py b/scrapinghub/client.py index 9a0bfd0f..3864a98d 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -70,13 +70,11 @@ class ScrapinghubClient(object): def __init__(self, auth=None, dash_endpoint=None, **kwargs): self.projects = Projects(self) - auth = parse_auth(auth) - connection_kwargs = {'apikey': auth, 'url': dash_endpoint} - if len(auth) == 2: - connection_kwargs['apikey'] = auth[0] - connection_kwargs['password'] = auth[1] - self._connection = Connection(**connection_kwargs) - self._hsclient = HubstorageClient(auth=auth, **kwargs) + login, password = parse_auth(auth) + self._connection = Connection(apikey=login, + password=password, + url=dash_endpoint) + self._hsclient = HubstorageClient(auth=(login, password), **kwargs) def get_project(self, projectid): """Get :class:`Project` instance with a given project id. diff --git a/scrapinghub/utils.py b/scrapinghub/utils.py index fe2bebf1..f2f940be 100644 --- a/scrapinghub/utils.py +++ b/scrapinghub/utils.py @@ -1,9 +1,10 @@ +import os import json import logging import binascii from codecs import decode -from six import string_types +from six import string_types, binary_type class LogLevel(object): @@ -112,23 +113,53 @@ def format_iter_filters(params): def parse_auth(auth): """Parse authentification token. 
+ >>> parse_auth(None) # noqa + RuntimeError: No API key provided and SH_APIKEY environment variable not set + >>> os.environ['SH_APIKEY'] = 'apikey' >>> parse_auth(None) + ('apikey', '') >>> parse_auth(('user', 'pass')) ('user', 'pass') >>> parse_auth('user:pass') ('user', 'pass') - >>> parse_auth('apikey') - ('apikey', '') + >>> parse_auth('c3a3c298c2b8c3a6c291c284c3a9') + ('c3a3c298c2b8c3a6c291c284c3a9', '') >>> parse_auth('312f322f333a736f6d652e6a77742e746f6b656e') ('1/2/3', 'some.jwt.token') """ - if auth is None or isinstance(auth, tuple): + if auth is None: + apikey = os.environ.get('SH_APIKEY') + if apikey is None: + raise RuntimeError("No API key provided and SH_APIKEY " + "environment variable not set") + return (apikey, '') + + if isinstance(auth, tuple): + if len(auth) != 2: + raise ValueError("Wrong amount of authentication credentials") + if not isinstance(auth[0], string_types): + raise ValueError("Login must me of a string type") + if not (auth[1] is None or isinstance(auth[1], string_types)): + raise ValueError("Password must be None or of a string type") return auth + + if not isinstance(auth, string_types): + raise ValueError("Wrong authentication credentials") + + decoded_auth = None try: - auth = decode(auth, 'hex_codec') - if not isinstance(auth, string_types): - auth = auth.decode('ascii') - except binascii.Error: + decoded_auth = decode(auth, 'hex_codec') + except (binascii.Error, TypeError): + login, _, password = auth.partition(':') + return (login, password) + + try: + if not isinstance(decoded_auth, string_types): + decoded_auth = decoded_auth.decode('ascii') + login, _, password = decoded_auth.partition(':') + if password and parse_job_key(login): + return (login, password) + except (UnicodeDecodeError, ValueError): pass - user, _, password = auth.partition(':') - return user, password + + return (auth, '') diff --git a/tests/client/conftest.py b/tests/client/conftest.py index 20b5fd59..75f2ab95 100644 --- a/tests/client/conftest.py +++ b/tests/client/conftest.py @@ -69,7 +69,7 @@ def is_using_real_services(request): @pytest.fixture(scope='session') def client(): - return ScrapinghubClient(apikey=TEST_ADMIN_AUTH, + return ScrapinghubClient(auth=TEST_ADMIN_AUTH, endpoint=TEST_HS_ENDPOINT, dash_endpoint=TEST_DASH_ENDPOINT) diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 763cb41a..c7ca5fba 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -1,4 +1,10 @@ +import os import pytest +from codecs import encode + +import mock + +from scrapinghub.utils import parse_auth from scrapinghub.utils import format_iter_filters @@ -35,3 +41,49 @@ def test_format_iter_filters(): # exception if entry is not list/tuple or string with pytest.raises(ValueError): format_iter_filters({'filter': ['test', 123]}) + + +def test_parse_auth_none(): + with pytest.raises(RuntimeError): + parse_auth(None) + + +@mock.patch.dict(os.environ, {'SH_APIKEY': 'testkey'}) +def test_parse_auth_none_with_env(): + assert parse_auth(None) == ('testkey', '') + + +def test_parse_auth_tuple(): + assert parse_auth(('test', 'test')) == ('test', 'test') + assert parse_auth(('apikey', None)) == ('apikey', None) + + with pytest.raises(ValueError): + parse_auth(('user', 'pass', 'bad-param')) + + with pytest.raises(ValueError): + parse_auth((None, None)) + + with pytest.raises(ValueError): + parse_auth((1234, '')) + + +def test_parse_auth_not_string(): + with pytest.raises(ValueError): + parse_auth(12345) + + +def test_parse_auth_simple(): + assert 
parse_auth('user:pass') == ('user', 'pass') + + +def test_parse_auth_apikey(): + test_key = u'\xe3\x98\xb8\xe6\x91\x84\xe9' + apikey = encode(test_key.encode('utf8'), 'hex_codec').decode('ascii') + assert parse_auth(apikey) == (apikey, '') + + +def test_parse_auth_jwt_token(): + test_job, test_token = '1/2/3', 'some.jwt.token' + raw_token = (test_job + ':' + test_token).encode('utf8') + encoded_token = encode(raw_token, 'hex_codec').decode('ascii') + assert parse_auth(encoded_token) == (test_job, test_token) From ac7272a050a4e211e331703208f05745b23fab7a Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 22 Feb 2017 12:32:11 +0300 Subject: [PATCH 28/35] Check for bad apikey: should be hex-encoded --- scrapinghub/utils.py | 2 ++ tests/client/test_utils.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/scrapinghub/utils.py b/scrapinghub/utils.py index f2f940be..537df364 100644 --- a/scrapinghub/utils.py +++ b/scrapinghub/utils.py @@ -151,6 +151,8 @@ def parse_auth(auth): decoded_auth = decode(auth, 'hex_codec') except (binascii.Error, TypeError): login, _, password = auth.partition(':') + if not password: + raise ValueError("Bad apikey, please check your credentials") return (login, password) try: diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index c7ca5fba..21f61ef7 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -76,6 +76,11 @@ def test_parse_auth_simple(): assert parse_auth('user:pass') == ('user', 'pass') +def test_parse_auth_bad_apikey(): + with pytest.raises(ValueError): + parse_auth('apikey') + + def test_parse_auth_apikey(): test_key = u'\xe3\x98\xb8\xe6\x91\x84\xe9' apikey = encode(test_key.encode('utf8'), 'hex_codec').decode('ascii') From 6d05df0a5b0c4cc689156f8dbda395349fe94d36 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 22 Feb 2017 16:14:59 +0300 Subject: [PATCH 29/35] Rework auth logic after code-review --- scrapinghub/utils.py | 29 +++++++++++++---------------- tests/client/test_utils.py | 10 ++-------- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/scrapinghub/utils.py b/scrapinghub/utils.py index 537df364..9c60f3f1 100644 --- a/scrapinghub/utils.py +++ b/scrapinghub/utils.py @@ -113,8 +113,6 @@ def format_iter_filters(params): def parse_auth(auth): """Parse authentification token. 
- >>> parse_auth(None) # noqa - RuntimeError: No API key provided and SH_APIKEY environment variable not set >>> os.environ['SH_APIKEY'] = 'apikey' >>> parse_auth(None) ('apikey', '') @@ -135,26 +133,27 @@ def parse_auth(auth): return (apikey, '') if isinstance(auth, tuple): - if len(auth) != 2: - raise ValueError("Wrong amount of authentication credentials") - if not isinstance(auth[0], string_types): - raise ValueError("Login must me of a string type") - if not (auth[1] is None or isinstance(auth[1], string_types)): - raise ValueError("Password must be None or of a string type") + all_strings = all(isinstance(k, string_types) for k in auth) + if len(auth) != 2 or not all_strings: + raise ValueError("Wrong authentication credentials") return auth if not isinstance(auth, string_types): raise ValueError("Wrong authentication credentials") - decoded_auth = None + jwt_auth = _search_for_jwt_credentials(auth) + if jwt_auth: + return jwt_auth + + login, _, password = auth.partition(':') + return (login, password) + + +def _search_for_jwt_credentials(auth): try: decoded_auth = decode(auth, 'hex_codec') except (binascii.Error, TypeError): - login, _, password = auth.partition(':') - if not password: - raise ValueError("Bad apikey, please check your credentials") - return (login, password) - + return try: if not isinstance(decoded_auth, string_types): decoded_auth = decoded_auth.decode('ascii') @@ -163,5 +162,3 @@ def parse_auth(auth): return (login, password) except (UnicodeDecodeError, ValueError): pass - - return (auth, '') diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 21f61ef7..475e317a 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -55,7 +55,7 @@ def test_parse_auth_none_with_env(): def test_parse_auth_tuple(): assert parse_auth(('test', 'test')) == ('test', 'test') - assert parse_auth(('apikey', None)) == ('apikey', None) + assert parse_auth(('apikey', '')) == ('apikey', '') with pytest.raises(ValueError): parse_auth(('user', 'pass', 'bad-param')) @@ -76,14 +76,8 @@ def test_parse_auth_simple(): assert parse_auth('user:pass') == ('user', 'pass') -def test_parse_auth_bad_apikey(): - with pytest.raises(ValueError): - parse_auth('apikey') - - def test_parse_auth_apikey(): - test_key = u'\xe3\x98\xb8\xe6\x91\x84\xe9' - apikey = encode(test_key.encode('utf8'), 'hex_codec').decode('ascii') + apikey = 'c3a3c298c2b8c3a6c291c284c3a9' assert parse_auth(apikey) == (apikey, '') From f6117203c45457d211d8083e58d18a247992afda Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 22 Feb 2017 16:16:25 +0300 Subject: [PATCH 30/35] Add pytest.ini to enable doctests --- pytest.ini | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..7b04c7b1 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --doctest-modules --doctest-glob='scrapinghub/*.py' From 1f98bb99d67d4f44b48a88422b79d10964143736 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 22 Feb 2017 17:39:31 +0300 Subject: [PATCH 31/35] Provide extended API for Frontier --- pytest.ini | 2 +- scrapinghub/client.py | 29 +++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pytest.ini b/pytest.ini index 7b04c7b1..956ee487 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -addopts = --doctest-modules --doctest-glob='scrapinghub/*.py' +addopts = --doctest-glob='scrapinghub/*.py' diff --git a/scrapinghub/client.py b/scrapinghub/client.py 
index 3864a98d..63e0d52c 100644 --- a/scrapinghub/client.py +++ b/scrapinghub/client.py @@ -11,7 +11,6 @@ from .hubstorage.resourcetype import ItemsResourceType # scrapinghub.hubstorage classes to use as-is -from .hubstorage.frontier import Frontier from .hubstorage.job import JobMeta from .hubstorage.project import Settings @@ -19,6 +18,7 @@ from .hubstorage.activity import Activity as _Activity from .hubstorage.collectionsrt import Collections as _Collections from .hubstorage.collectionsrt import Collection as _Collection +from .hubstorage.frontier import Frontier as _Frontier from .hubstorage.job import Items as _Items from .hubstorage.job import Logs as _Logs from .hubstorage.job import Samples as _Samples @@ -227,7 +227,7 @@ def __init__(self, client, projectid): # proxied sub-resources self.activity = Activity(_Activity, client, projectid) self.collections = Collections(_Collections, client, projectid) - self.frontier = Frontier(client._hsclient, projectid) + self.frontier = Frontier(_Frontier, client, projectid) self.settings = Settings(client._hsclient, projectid) @@ -1084,6 +1084,31 @@ def list(self): return list(self.iter()) +class Frontier(_Proxy): + """Frontiers collection for a project.""" + + def __init__(self, *args, **kwargs): + super(Frontier, self).__init__(*args, **kwargs) + self._proxy_methods(['close', 'flush', 'add', 'read', 'delete', + 'delete_slot']) + + @property + def newcount(self): + return self._origin.newcount + + def iter(self): + return iter(self.list()) + + def list(self): + return next(self._origin.apiget('list')) + + def iter_slots(self, name): + return iter(self.list_slots(name)) + + def list_slots(self, name): + return next(self._origin.apiget((name, 'list'))) + + class Collection(object): """Representation of a project collection object. 
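
The extended ``Frontier`` proxy above keeps the proxied hubstorage calls
(``add``, ``read``, ``delete``, ``delete_slot``, ``flush``, plus the
``newcount`` counter) and adds ``list``/``iter`` helpers for frontier names and
slots. A hedged usage sketch, where ``'test'`` and ``'example.com'`` are
placeholder frontier and slot names and the outputs are illustrative::

    >>> frontier = project.frontier
    >>> frontier.add('test', 'example.com', [{'fp': '/some/page.html'}])
    >>> frontier.flush()
    >>> frontier.list()
    ['test']
    >>> frontier.list_slots('test')
    ['example.com']
    >>> for batch in frontier.read('test', 'example.com'):
    ...     # each batch carries an id and the queued request fingerprints
    ...     frontier.delete('test', 'example.com', [batch['id']])
    >>> frontier.delete_slot('test', 'example.com')
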
From 57a688e12834ed401625f0b085de23db5b144343 Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Wed, 22 Feb 2017 17:55:01 +0300
Subject: [PATCH 32/35] Drop outdated test for modified auth logic

---
 tests/legacy/test_connection.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/legacy/test_connection.py b/tests/legacy/test_connection.py
index 2f88e14e..0c45ebee 100644
--- a/tests/legacy/test_connection.py
+++ b/tests/legacy/test_connection.py
@@ -32,11 +32,6 @@ def test_connection_init_assert_apikey_not_url():
         Connection(password='testpass', apikey='http://some-url')
 
 
-def test_connection_init_with_password():
-    with pytest.raises(AssertionError):
-        Connection(apikey='testkey', password='testpass')
-
-
 def test_connection_init_with_default_url():
     conn = Connection(apikey='testkey')
     assert conn.url == Connection.DEFAULT_ENDPOINT

From 50222c40ec9d8a17292cd98a12550a0d42a6fe89 Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Wed, 22 Feb 2017 18:09:22 +0300
Subject: [PATCH 33/35] Add spider_args to schedule method

---
 scrapinghub/client.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scrapinghub/client.py b/scrapinghub/client.py
index 63e0d52c..a1233fb4 100644
--- a/scrapinghub/client.py
+++ b/scrapinghub/client.py
@@ -457,6 +457,13 @@ def schedule(self, spidername=None, **params):
             raise ValueError('Please provide spidername')
         params['project'] = self.projectid
         params['spider'] = spidername or self.spider.name
+        spider_args = params.pop('spider_args', None)
+        if spider_args:
+            if not isinstance(spider_args, dict):
+                raise ValueError("spider_args should be a dictionary")
+            cleaned_args = {k: v for k, v in spider_args.items()
+                            if k not in params}
+            params.update(cleaned_args)
         if 'job_settings' in params:
             params['job_settings'] = json.dumps(params['job_settings'])
         if 'meta' in params:

From e75fd4d53d9d95d4b15b136b1864dee3b78149cb Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Wed, 22 Feb 2017 18:36:54 +0300
Subject: [PATCH 34/35] Add spider_args param to the docs

---
 README.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index a399cc34..0e7e3506 100644
--- a/README.rst
+++ b/README.rst
@@ -88,7 +88,7 @@ Jobs instance is described well in ``Jobs`` section below.
 
 For example, to schedule a spider run (it returns a job object)::
 
-    >>> project.jobs.schedule('spider1', arg1='val1')
+    >>> project.jobs.schedule('spider1', spider_args={'arg1': 'val1'})
     >
 
 Project instance also has the following fields:
@@ -128,7 +128,7 @@ Like project instance, spider instance has ``jobs`` field to work with the spide
 
 To schedule a spider run::
 
-    >>> spider.jobs.schedule(arg1='val1')
+    >>> spider.jobs.schedule(spider_args={'arg1': 'val1'})
     >
 
 Note that you don't need to specify spider name explicitly. 
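
``spider_args`` is popped out of the keyword arguments before the request is
built, and only keys that don't clash with scheduling options are merged in, so
spider arguments can be passed next to the usual scheduling parameters. A
sketch with placeholder values::

    >>> job = spider.jobs.schedule(
    ...     spider_args={'arg1': 'val1', 'arg2': 'val2'},
    ...     units=3, priority=3,
    ...     add_tag=['tagA', 'tagB'],
    ...     meta={'meta1': 'val1'})
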
@@ -160,6 +160,7 @@ Use ``schedule`` method to schedule a new job for project/spider:: Scheduling logic supports different options, like +- spider_args to provide spider arguments for the job - units to specify amount of units to schedule the job - job_settings to pass additional settings for the job - priority to set higher/lower priority of the job From 50a10c0e16cbd12794d14ddda5aa46f3dd3fdd6b Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 22 Feb 2017 19:40:04 +0300 Subject: [PATCH 35/35] Use spider_args in tests --- tests/client/test_job.py | 7 ++++--- tests/client/test_project.py | 10 +++++----- tests/client/test_spider.py | 13 ++++++++----- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/tests/client/test_job.py b/tests/client/test_job.py index c4b7f8ff..660dd6c7 100644 --- a/tests/client/test_job.py +++ b/tests/client/test_job.py @@ -29,9 +29,10 @@ def test_job_update_metadata(spider): def test_job_update_tags(spider): - job1 = spider.jobs.schedule(subid='tags-1', add_tag=['tag1']) - job2 = spider.jobs.schedule(subid='tags-2', add_tag=['tag2']) - + job1 = spider.jobs.schedule(spider_args={'subid': 'tags-1'}, + add_tag=['tag1']) + job2 = spider.jobs.schedule(spider_args={'subid': 'tags-2'}, + add_tag=['tag2']) # FIXME the endpoint normalises tags so it's impossible to send tags # having upper-cased symbols, let's add more tests when it's fixed assert job1.update_tags(add=['tag11', 'tag12']) == 1 diff --git a/tests/client/test_project.py b/tests/client/test_project.py index fc39739d..0566c142 100644 --- a/tests/client/test_project.py +++ b/tests/client/test_project.py @@ -38,13 +38,13 @@ def test_project_jobs_count(project): for i in range(2): project.jobs.schedule(TEST_SPIDER_NAME, - subid='running-%s' % i, + spider_args={'subid': 'running-%s' % i}, meta={'state': 'running'}) assert project.jobs.count(state='running') == 2 for i in range(3): project.jobs.schedule(TEST_SPIDER_NAME, - subid='finished%s' % i, + spider_args={'subid': 'finished%s' % i}, meta={'state': 'finished'}) assert project.jobs.count(state='finished') == 3 @@ -123,7 +123,7 @@ def test_project_jobs_schedule(project): project.jobs.schedule(TEST_SPIDER_NAME) job1 = project.jobs.schedule(TEST_SPIDER_NAME, - arg1='val1', arg2='val2', + spider_args={'arg1':'val1', 'arg2': 'val2'}, priority=3, units=3, add_tag=['tagA', 'tagB'], meta={'state': 'running', 'meta1': 'val1'}) @@ -157,7 +157,7 @@ def test_project_jobs_summary(project): for state in sorted(counts): for i in range(counts[state]): job = project.jobs.schedule(TEST_SPIDER_NAME, - subid=state + str(i), + spider_args={'subid': state + str(i)}, meta={'state': state}) jobs[state].append(job.key) summary1 = project.jobs.summary() @@ -202,7 +202,7 @@ def test_project_jobs_iter_last(project): # next iter_last should return last spider's job job2 = project.jobs.schedule(TEST_SPIDER_NAME, - subid=1, + spider_args={'subid': 1}, meta={'state': 'finished'}) lastsumm2 = list(project.jobs.iter_last()) assert len(lastsumm2) == 1 diff --git a/tests/client/test_spider.py b/tests/client/test_spider.py index 916b1cb2..c48af592 100644 --- a/tests/client/test_spider.py +++ b/tests/client/test_spider.py @@ -66,11 +66,13 @@ def test_spider_jobs_count(spider): assert jobs.count(state='pending') == 1 for i in range(2): - jobs.schedule(subid='running-%s' % i, meta={'state': 'running'}) + jobs.schedule(spider_args={'subid': 'running-%s' % i}, + meta={'state': 'running'}) assert jobs.count(state='running') == 2 for i in range(3): - jobs.schedule(subid='finished%s' % 
i, meta={'state': 'finished'}) + jobs.schedule(spider_args={'subid': 'finished%s' % i}, + meta={'state': 'finished'}) assert jobs.count(state='finished') == 3 assert jobs.count(state=['pending', 'running', 'finished']) == 6 @@ -141,7 +143,7 @@ def test_spider_jobs_schedule(spider): with pytest.raises(DuplicateJobError): spider.jobs.schedule() - job1 = spider.jobs.schedule(arg1='val1', arg2='val2', + job1 = spider.jobs.schedule(spider_args={'arg1': 'val1', 'arg2': 'val2'}, priority=3, units=3, meta={'state': 'running', 'meta1': 'val1'}, add_tag=['tagA', 'tagB']) @@ -184,7 +186,7 @@ def test_spider_jobs_summary(spider): jobs = defaultdict(list) for state in sorted(counts): for i in range(counts[state]): - job = spider.jobs.schedule(subid=state + str(i), + job = spider.jobs.schedule(spider_args={'subid': state + str(i)}, meta={'state': state}) jobs[state].append(job.key) summary1 = spider.jobs.summary() @@ -228,7 +230,8 @@ def test_spider_jobs_iter_last(spider): assert lastsumm1[0].get('ts') > 0 # next iter_last should return last spider's job again - job2 = spider.jobs.schedule(subid=1, meta={'state': 'finished'}) + job2 = spider.jobs.schedule(spider_args={'subid': 1}, + meta={'state': 'finished'}) lastsumm2 = list(spider.jobs.iter_last()) assert len(lastsumm2) == 1 assert lastsumm2[0].get('key') == job2.key
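
The pattern in these reworked tests is always the same: schedule a job with
``spider_args`` plus a forced ``meta`` state, then check it through the
list/iter helpers. A condensed, self-contained variant of that flow, assuming
the same ``spider`` fixture the suite uses::

    def test_schedule_with_spider_args(spider):
        # 'subid' is an ordinary spider argument, only used to tell jobs apart
        job = spider.jobs.schedule(spider_args={'subid': 'example'},
                                   meta={'state': 'finished'})
        # iter_last() should report the job that was just scheduled
        last = list(spider.jobs.iter_last())
        assert len(last) == 1
        assert last[0].get('key') == job.key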