From a8103a208386159b8eed8c4b6e1b83c7b586d742 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 28 Dec 2016 11:58:05 +0300 Subject: [PATCH 1/8] Initial version of new client doc --- README_client.rst | 353 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 README_client.rst diff --git a/README_client.rst b/README_client.rst new file mode 100644 index 00000000..5739cf8d --- /dev/null +++ b/README_client.rst @@ -0,0 +1,353 @@ +=========================================== +[Beta] Client interface for Scrapinghub API +=========================================== + + +The ``scrapinghub.ScrapinghubClient`` is a new Python client for communicating +with the `Scrapinghub API`_. It takes best from ``scrapinghub.Connection`` and +``scrapinghub.HubstorageClient`` and combines it under single interface. + + +.. contents:: :depth: 1 + + +Usage +===== + +Client +------ + +First, you connect to Scrapinghub:: + + >>> from scrapinghub import ScrapinghubClient + >>> client = ScrapinghubClient('APIKEY') + >>> client + + +Client instance has ``projects`` field for access to client projects collection. + +Projects +-------- + +You can list the projects available to your account:: + + >>> client.projects.list() + [123, 456] + +Or check the projects summary:: + + >>> client.projects.summary() + [{'finished': 674, + 'has_capacity': True, + 'pending': 0, + 'project': 123, + 'running': 1}, + {'finished': 33079, + 'has_capacity': True, + 'pending': 0, + 'project': 456, + 'running': 2}] + +And select a particular project to work with:: + + >>> project = client.get_project(123) + >>> project + + >>> project.id + 123 + +(The above is a shortcut for ``client.projects.get(123)``.) + +Project +------- + +Project instance has ``jobs`` field to work with the project jobs. + +To schedule a spider run (it returns a job object):: + + >>> project.jobs.schedule('spider1', arg1='val1') + > + +(Check ``Jobs`` section below for other features.) + +Project instance also has the following fields: + +- activity +- collections +- frontier +- reports +- settings +- spiders + +Collections +----------- + +Let's store hash and timestamp pair for foo spider. 
Usual workflow with `Collections`_ would be:: + + >>> collections = project.collections + >>> foo_store = collections.new_store('foo_store') + >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) + >>> foo_store.count() + 1 + >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') + '1447221694537' + >>> for result in foo_store.iter_values(): + # do something with _key & value pair + >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') + >>> foo_store.count() + 0 + +Frontier +-------- + +Typical workflow with `Frontier`_:: + + >>> frontier = project.frontier + +Add a request to the frontier:: + + >>> frontier.add('test', 'example.com', [{'fp': '/some/path.html'}]) + >>> frontier.flush() + >>> frontier.newcount + 1 + +Add requests with additional parameters:: + + >>> frontier.add('test', 'example.com', [{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) + >>> frontier.flush() + >>> frontier.newcount + 2 + +To delete the slot ``example.com`` from the frontier:: + + >>> frontier.delete_slot('test', 'example.com') + +To retrieve requests for a given slot:: + + >>> reqs = frontier.read('test', 'example.com') + +To delete a batch of requests:: + + >>> frontier.delete('test', 'example.com', '00013967d8af7b0001') + +To retrieve fingerprints for a given slot:: + + >>> fps = [req['requests'] for req in frontier.read('test', 'example.com')] + +Spiders +------- + +To get the list of spiders in the project:: + + >>> project.spiders.list() + [ + {'id': 'spider1', 'tags': [], 'type': 'manual', 'version': '123'}, + {'id': 'spider2', 'tags': [], 'type': 'manual', 'version': '123'} + ] + +To select a particular spider to work with:: + + >>> spider = project.spiders.get('spider2') + >>> spider + + >>> spider.id + 2 + >>> spider.name + spider2 + +Spider +------ + +Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. + +To schedule a spider run (you don't need to specify spider name explicitly):: + + >>> spider.jobs.schedule(arg1='val1') + > + +Jobs +---- + +To select a specific job for a project:: + + >>> job = project.jobs.get('123/1/2') + >>> job.id + '123/1/2' + +Also there's a shortcut to get same job with client instance:: + + >>> job = client.get_job('123/1/2') + +Use ``schedule`` method to schedule a new job for project/spider:: + + >>> job = spider.jobs.schedule() + +It's possible to count jobs for a given project/spider:: + + >> spider.jobs.count() + 5 + +Count logic supports different filters, as described for `count endpoint`_. + +To get a list of jobs for a spider:: + + >>> jobs = spider.jobs.iter() + +Iter logic also supports different filters, as described for `list endpoint`_. + +For example, to get all finished jobs:: + + >>> jobs = spider.jobs.iter(state='finished') + +``jobs`` is an iterator and, when iterated, return an iterable of dict objects, +so you typically use it like this:: + + >>> for job in jobs: + ... 
# do something with job data + +Or, if you just want to get the job ids:: + + >>> [x['key'] for x in jobs] + ['123/1/1', '123/1/2', '123/1/3'] + +Job dictionary object itself looks like:: + + >>> job + { + 'key': '123/1/2', + 'spider': 'myspider', + 'version': 'some-version' + 'state': 'finished', + 'close_reason': 'success', + 'errors': 0, + 'logs': 8, + 'pending_time': 1482852737072, + 'running_time': 1482852737848, + 'finished_time': 1482852774356, + 'ts': 1482852755902, + 'elapsed': 207609, + } + +Dict entries returned by ``iter`` method contain some additional meta, but can be +easily converted to ``Job`` instances with:: + + >>> [Job(x['key']) for x in jobs] + [ + , + , + , + ] + +To check jobs summary:: + + >>> spider.jobs.summary() + + [{'count': 0, 'name': 'pending', 'summary': []}, + {'count': 0, 'name': 'running', 'summary': []}, + {'count': 5, + 'name': 'finished', + 'summary': [.., + +It's also possible to get last job summary (for each spider):: + + >>> list(sp.jobs.lastjobsummary()) + [{'close_reason': 'success', + 'elapsed': 3062444, + 'errors': 1, + 'finished_time': 1482911633089, + 'key': '123/1/3', + 'logs': 8, + 'pending_time': 1482911596566, + 'running_time': 1482911598909, + 'spider': 'spider1', + 'state': 'finished', + 'ts': 1482911615830, + 'version': 'some-version'}] + +(Note that there can be a lot of spiders, so the method above returns an iterator.) + +Job +--- + + + +To delete a job:: + + >>> job.delete() + + + +To get job metadata:: + + >>> job.metadata['spider'] + 'myspider' + >>> job.metadata['started_time'] + '2010-09-28T15:09:57.629000' + >>> job.metadata['tags'] + [] + >>> j.metadata['scrapystats']['memusage/max'] + 53628928 + +Items +----- + +To retrieve all scraped items from a job:: + + >>> for item in job.items.iter(): + ... # do something with item (it's just a dict) + +Logs +---- + +To retrieve all log entries from a job:: + + >>> for logitem in job.logs.iter(): + ... # logitem is a dict with level, message, time + >>> logitem + { + 'level': 20, + 'message': '[scrapy.core.engine] Closing spider (finished)', + 'time': 1482233733976}, + } + +Requests +-------- + +To retrieve all requests from a job:: + + >>> for reqitem in job.requests.iter(): + ... # reqitem is a dict + >>> reqitem + [{ + 'duration': 354, + 'fp': '6d748741a927b10454c83ac285b002cd239964ea', + 'method': 'GET', + 'rs': 1270, + 'status': 200, + 'time': 1482233733870, + 'url': 'https://example.com' + }] + + +Tags +---- + +Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). + +To mark a job with tag ``consumed``:: + + >>> job.update_tags(add=['consumed']) + +To mark all spider jobs with tag ``consumed``:: + + >>> spider.update_tags(add=['consumed']) + +To remove existing tag ``existing`` for all spider jobs:: + + >>> spider.update_tags(remove=['existing']) + + +.. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count +.. _list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list +.. _Collections: http://doc.scrapinghub.com/api/collections.html +.. 
_Frontier: http://doc.scrapinghub.com/api/frontier.html From 4d859c5f451d1742ec2e5e1f1dee3043444dcbc1 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 28 Dec 2016 12:24:48 +0300 Subject: [PATCH 2/8] Extending the doc --- README_client.rst | 175 +++++++++++++++++++++++----------------------- 1 file changed, 88 insertions(+), 87 deletions(-) diff --git a/README_client.rst b/README_client.rst index 5739cf8d..44d6efe2 100644 --- a/README_client.rst +++ b/README_client.rst @@ -8,15 +8,12 @@ with the `Scrapinghub API`_. It takes best from ``scrapinghub.Connection`` and ``scrapinghub.HubstorageClient`` and combines it under single interface. -.. contents:: :depth: 1 +.. contents:: :depth: 3 Usage ===== -Client ------- - First, you connect to Scrapinghub:: >>> from scrapinghub import ScrapinghubClient @@ -26,8 +23,8 @@ First, you connect to Scrapinghub:: Client instance has ``projects`` field for access to client projects collection. -Projects --------- +Projects (client level) +----------------------- You can list the projects available to your account:: @@ -58,8 +55,8 @@ And select a particular project to work with:: (The above is a shortcut for ``client.projects.get(123)``.) -Project -------- +Project (projects level) +------------------------ Project instance has ``jobs`` field to work with the project jobs. @@ -72,70 +69,16 @@ To schedule a spider run (it returns a job object):: Project instance also has the following fields: -- activity -- collections -- frontier -- reports -- settings -- spiders - -Collections ------------ +- activity - access to project activity records +- collections - work with project collections (see ``Collections`` section) +- frontier - using project frontier (see ``Frontier`` section) +- reports - work with project reports +- settings - interface to project settings +- spiders - access to spiders collection (see ``Spiders`` section) -Let's store hash and timestamp pair for foo spider. 
Usual workflow with `Collections`_ would be:: - >>> collections = project.collections - >>> foo_store = collections.new_store('foo_store') - >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) - >>> foo_store.count() - 1 - >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') - '1447221694537' - >>> for result in foo_store.iter_values(): - # do something with _key & value pair - >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') - >>> foo_store.count() - 0 - -Frontier --------- - -Typical workflow with `Frontier`_:: - - >>> frontier = project.frontier - -Add a request to the frontier:: - - >>> frontier.add('test', 'example.com', [{'fp': '/some/path.html'}]) - >>> frontier.flush() - >>> frontier.newcount - 1 - -Add requests with additional parameters:: - - >>> frontier.add('test', 'example.com', [{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) - >>> frontier.flush() - >>> frontier.newcount - 2 - -To delete the slot ``example.com`` from the frontier:: - - >>> frontier.delete_slot('test', 'example.com') - -To retrieve requests for a given slot:: - - >>> reqs = frontier.read('test', 'example.com') - -To delete a batch of requests:: - - >>> frontier.delete('test', 'example.com', '00013967d8af7b0001') - -To retrieve fingerprints for a given slot:: - - >>> fps = [req['requests'] for req in frontier.read('test', 'example.com')] - -Spiders -------- +Spiders (project level) +----------------------- To get the list of spiders in the project:: @@ -155,8 +98,8 @@ To select a particular spider to work with:: >>> spider.name spider2 -Spider ------- +Spider (spiders level) +---------------------- Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. @@ -165,8 +108,8 @@ To schedule a spider run (you don't need to specify spider name explicitly):: >>> spider.jobs.schedule(arg1='val1') > -Jobs ----- +Jobs (project/spider level) +--------------------------- To select a specific job for a project:: @@ -266,10 +209,8 @@ It's also possible to get last job summary (for each spider):: (Note that there can be a lot of spiders, so the method above returns an iterator.) -Job ---- - - +Job (jobs level) +---------------- To delete a job:: @@ -288,16 +229,16 @@ To get job metadata:: >>> j.metadata['scrapystats']['memusage/max'] 53628928 -Items ------ +Items (job level) +----------------- To retrieve all scraped items from a job:: >>> for item in job.items.iter(): ... # do something with item (it's just a dict) -Logs ----- +Logs (job level) +---------------- To retrieve all log entries from a job:: @@ -310,8 +251,8 @@ To retrieve all log entries from a job:: 'time': 1482233733976}, } -Requests --------- +Requests (job level) +-------------------- To retrieve all requests from a job:: @@ -329,8 +270,69 @@ To retrieve all requests from a job:: }] -Tags ----- +Additional features +=================== + +Collections (project level) +--------------------------- + +As an example, let's store hash and timestamp pair for foo spider. 
+ +Usual workflow with `Collections`_ would be:: + + >>> collections = project.collections + >>> foo_store = collections.new_store('foo_store') + >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) + >>> foo_store.count() + 1 + >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') + '1447221694537' + >>> for result in foo_store.iter_values(): + # do something with _key & value pair + >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') + >>> foo_store.count() + 0 + +Frontier (project level) +------------------------ + +Typical workflow with `Frontier`_:: + + >>> frontier = project.frontier + +Add a request to the frontier:: + + >>> frontier.add('test', 'example.com', [{'fp': '/some/path.html'}]) + >>> frontier.flush() + >>> frontier.newcount + 1 + +Add requests with additional parameters:: + + >>> frontier.add('test', 'example.com', [{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) + >>> frontier.flush() + >>> frontier.newcount + 2 + +To delete the slot ``example.com`` from the frontier:: + + >>> frontier.delete_slot('test', 'example.com') + +To retrieve requests for a given slot:: + + >>> reqs = frontier.read('test', 'example.com') + +To delete a batch of requests:: + + >>> frontier.delete('test', 'example.com', '00013967d8af7b0001') + +To retrieve fingerprints for a given slot:: + + >>> fps = [req['requests'] for req in frontier.read('test', 'example.com')] + + +Tags (spider/job level) +----------------------- Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). @@ -346,7 +348,6 @@ To remove existing tag ``existing`` for all spider jobs:: >>> spider.update_tags(remove=['existing']) - .. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count .. _list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list .. _Collections: http://doc.scrapinghub.com/api/collections.html From b9d792f0d558429f20c91de41c62468bcf4b2dc7 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 28 Dec 2016 12:49:57 +0300 Subject: [PATCH 3/8] Trying to make the doc more readable --- README_client.rst | 86 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/README_client.rst b/README_client.rst index 44d6efe2..c9332766 100644 --- a/README_client.rst +++ b/README_client.rst @@ -209,36 +209,61 @@ It's also possible to get last job summary (for each spider):: (Note that there can be a lot of spiders, so the method above returns an iterator.) -Job (jobs level) ----------------- +Job +--- + +Job instance provides access to job data: + +- items +- logs +- requests +- samples +- metadata + +Request to cancel a job:: + + >>> job.cancel() To delete a job:: >>> job.delete() +Metadata +~~~~~~~~ -To get job metadata:: +Job details can be found in jobs metadata and it's scrapystats:: - >>> job.metadata['spider'] - 'myspider' - >>> job.metadata['started_time'] - '2010-09-28T15:09:57.629000' - >>> job.metadata['tags'] - [] - >>> j.metadata['scrapystats']['memusage/max'] - 53628928 + >>> job.metadata['version'] + '5123a86-master' + >>> job.metadata['scrapystats'] + ... + 'downloader/response_count': 104, + 'downloader/response_status_count/200': 104, + 'finish_reason': 'finished', + 'finish_time': 1447160494937, + 'item_scraped_count': 50, + 'log_count/DEBUG': 157, + 'log_count/INFO': 1365, + 'log_count/WARNING': 3, + 'memusage/max': 182988800, + 'memusage/startup': 62439424, + ... 
-Items (job level) ------------------ +Anything can be stored in metadata, here is example how to add tags:: + + >>> job.update_metadata({'tags': 'obsolete'}) + +Items +~~~~~ To retrieve all scraped items from a job:: >>> for item in job.items.iter(): ... # do something with item (it's just a dict) -Logs (job level) ----------------- +Logs +~~~~ To retrieve all log entries from a job:: @@ -251,8 +276,8 @@ To retrieve all log entries from a job:: 'time': 1482233733976}, } -Requests (job level) --------------------- +Requests +~~~~~~~~ To retrieve all requests from a job:: @@ -269,12 +294,21 @@ To retrieve all requests from a job:: 'url': 'https://example.com' }] +Samples +~~~~~~~ + +To retrieve all samples for a job:: + + >>> for sample in job.samples.iter(): + ... # sample is a list with a timestamp and data + >>> sample + [1482233732452, 0, 0, 0, 0, 0] Additional features =================== -Collections (project level) ---------------------------- +Collections +----------- As an example, let's store hash and timestamp pair for foo spider. @@ -293,8 +327,10 @@ Usual workflow with `Collections`_ would be:: >>> foo_store.count() 0 -Frontier (project level) ------------------------- +Collections are available on project level only. + +Frontier +-------- Typical workflow with `Frontier`_:: @@ -330,9 +366,10 @@ To retrieve fingerprints for a given slot:: >>> fps = [req['requests'] for req in frontier.read('test', 'example.com')] +Frontier is available on project level only. -Tags (spider/job level) ------------------------ +Tags +---- Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). @@ -348,6 +385,9 @@ To remove existing tag ``existing`` for all spider jobs:: >>> spider.update_tags(remove=['existing']) +Modifying tags is available on spider/job levels. + +.. _Scrapinghub API: http://doc.scrapinghub.com/api.html .. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count .. _list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list .. _Collections: http://doc.scrapinghub.com/api/collections.html From 3c07ba56121af7e7fa1e9bf4e0c985e6368e15b1 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 28 Dec 2016 12:58:44 +0300 Subject: [PATCH 4/8] Another attempt --- README_client.rst | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/README_client.rst b/README_client.rst index c9332766..7ccddbe4 100644 --- a/README_client.rst +++ b/README_client.rst @@ -11,8 +11,8 @@ with the `Scrapinghub API`_. It takes best from ``scrapinghub.Connection`` and .. contents:: :depth: 3 -Usage -===== +Basic usage +=========== First, you connect to Scrapinghub:: @@ -23,8 +23,8 @@ First, you connect to Scrapinghub:: Client instance has ``projects`` field for access to client projects collection. -Projects (client level) ------------------------ +Projects +-------- You can list the projects available to your account:: @@ -55,8 +55,8 @@ And select a particular project to work with:: (The above is a shortcut for ``client.projects.get(123)``.) -Project (projects level) ------------------------- +Project +~~~~~~~ Project instance has ``jobs`` field to work with the project jobs. 
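For instance, a minimal sketch of using that field (assuming the filters described for the `count endpoint`_ can be passed as keyword arguments, and using a placeholder spider name)::

    >>> project.jobs.count(spider='spider1', state='finished')
    2
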
@@ -77,8 +77,8 @@ Project instance also has the following fields: - spiders - access to spiders collection (see ``Spiders`` section) -Spiders (project level) ------------------------ +Spiders +------- To get the list of spiders in the project:: @@ -98,8 +98,8 @@ To select a particular spider to work with:: >>> spider.name spider2 -Spider (spiders level) ----------------------- +Spider +~~~~~~ Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. @@ -108,8 +108,10 @@ To schedule a spider run (you don't need to specify spider name explicitly):: >>> spider.jobs.schedule(arg1='val1') > -Jobs (project/spider level) ---------------------------- +Jobs +---- + +Jobs collection is available on project/spider level. To select a specific job for a project:: @@ -210,7 +212,7 @@ It's also possible to get last job summary (for each spider):: (Note that there can be a lot of spiders, so the method above returns an iterator.) Job ---- +~~~ Job instance provides access to job data: @@ -230,7 +232,7 @@ To delete a job:: Metadata -~~~~~~~~ +^^^^^^^^ Job details can be found in jobs metadata and it's scrapystats:: @@ -255,7 +257,7 @@ Anything can be stored in metadata, here is example how to add tags:: >>> job.update_metadata({'tags': 'obsolete'}) Items -~~~~~ +^^^^^ To retrieve all scraped items from a job:: @@ -263,7 +265,7 @@ To retrieve all scraped items from a job:: ... # do something with item (it's just a dict) Logs -~~~~ +^^^^ To retrieve all log entries from a job:: @@ -277,7 +279,7 @@ To retrieve all log entries from a job:: } Requests -~~~~~~~~ +^^^^^^^^ To retrieve all requests from a job:: @@ -295,7 +297,7 @@ To retrieve all requests from a job:: }] Samples -~~~~~~~ +^^^^^^^ To retrieve all samples for a job:: From 19e9bbb5dd78e3fa7e785654d603fd9e5618d4ee Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 28 Dec 2016 13:03:58 +0300 Subject: [PATCH 5/8] Lets keep it simple --- README_client.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README_client.rst b/README_client.rst index 7ccddbe4..7e29e339 100644 --- a/README_client.rst +++ b/README_client.rst @@ -56,7 +56,7 @@ And select a particular project to work with:: (The above is a shortcut for ``client.projects.get(123)``.) Project -~~~~~~~ +------- Project instance has ``jobs`` field to work with the project jobs. @@ -99,7 +99,7 @@ To select a particular spider to work with:: spider2 Spider -~~~~~~ +------ Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. @@ -212,7 +212,7 @@ It's also possible to get last job summary (for each spider):: (Note that there can be a lot of spiders, so the method above returns an iterator.) Job -~~~ +--- Job instance provides access to job data: From 3f3355badd8256a227699a0ab3bf21c3aaf36952 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 28 Dec 2016 13:57:26 +0300 Subject: [PATCH 6/8] Minor fixes for the doc --- README_client.rst | 109 +++++++++++++++++++++++++++++----------------- 1 file changed, 70 insertions(+), 39 deletions(-) diff --git a/README_client.rst b/README_client.rst index 7e29e339..f0aad963 100644 --- a/README_client.rst +++ b/README_client.rst @@ -21,7 +21,7 @@ First, you connect to Scrapinghub:: >>> client -Client instance has ``projects`` field for access to client projects collection. +Client instance has ``projects`` field for access to client projects. 
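For example, a small sketch (using only the calls described in the sections below) that builds a project object for every project id available to the account::

    >>> projects = [client.get_project(p) for p in client.projects.list()]
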
Projects -------- @@ -53,20 +53,20 @@ And select a particular project to work with:: >>> project.id 123 -(The above is a shortcut for ``client.projects.get(123)``.) +The above is a shortcut for ``client.projects.get(123)``. Project ------- Project instance has ``jobs`` field to work with the project jobs. -To schedule a spider run (it returns a job object):: +Jobs instance is described well in ``Jobs`` section below. + +For example, to schedule a spider run (it returns a job object):: >>> project.jobs.schedule('spider1', arg1='val1') > -(Check ``Jobs`` section below for other features.) - Project instance also has the following fields: - activity - access to project activity records @@ -80,7 +80,7 @@ Project instance also has the following fields: Spiders ------- -To get the list of spiders in the project:: +To get the list of spiders of the project:: >>> project.spiders.list() [ @@ -103,11 +103,13 @@ Spider Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. -To schedule a spider run (you don't need to specify spider name explicitly):: +To schedule a spider run:: >>> spider.jobs.schedule(arg1='val1') > +Note that you don't need to specify spider name explicitly. + Jobs ---- @@ -127,27 +129,27 @@ Use ``schedule`` method to schedule a new job for project/spider:: >>> job = spider.jobs.schedule() -It's possible to count jobs for a given project/spider:: +It's also possible to count jobs for a given project/spider:: >> spider.jobs.count() 5 Count logic supports different filters, as described for `count endpoint`_. -To get a list of jobs for a spider:: - >>> jobs = spider.jobs.iter() +List jobs +^^^^^^^^^ -Iter logic also supports different filters, as described for `list endpoint`_. +To iterate through the spider jobs (descending order):: -For example, to get all finished jobs:: + >>> jobs_metadata = spider.jobs.iter() + >>> [j['key'] for j in jobs_metadata] + ['1111111/1/3', '1111111/1/2', '1111111/1/1'] - >>> jobs = spider.jobs.iter(state='finished') +``jobs_metadata`` is an iterator and, when iterated, returns an iterable +of dict objects, so you typically use it like this:: -``jobs`` is an iterator and, when iterated, return an iterable of dict objects, -so you typically use it like this:: - - >>> for job in jobs: + >>> for job in jobs_metadata: ... # do something with job data Or, if you just want to get the job ids:: @@ -155,26 +157,51 @@ Or, if you just want to get the job ids:: >>> [x['key'] for x in jobs] ['123/1/1', '123/1/2', '123/1/3'] -Job dictionary object itself looks like:: +Job metadata fieldset from ``iter()`` is less detailed than ``job.metadata``, +but contains few new fields as well. Additional fields can be requested using +the ``jobmeta`` parameter. 
If it used, then it's up to the user to list all the +required fields, so only few default fields would be added except requested +ones:: - >>> job - { - 'key': '123/1/2', - 'spider': 'myspider', - 'version': 'some-version' - 'state': 'finished', - 'close_reason': 'success', - 'errors': 0, - 'logs': 8, - 'pending_time': 1482852737072, - 'running_time': 1482852737848, - 'finished_time': 1482852774356, - 'ts': 1482852755902, - 'elapsed': 207609, - } + >>> metadata = next(project.jobs.iter()) + >>> metadata.get('spider', 'missing') + 'foo' + >>> jobs_metadata = project.jobs.iter(jobmeta=['scheduled_by', ]) + >>> metadata = next(jobs_metadata) + >>> metadata.get('scheduled_by', 'missing') + 'John' + >>> metadata.get('spider', 'missing') + missing + +By default ``jobs.iter()`` returns maximum last 1000 results. +Pagination is available using the ``start`` parameter:: + + >>> jobs_metadata = spider.jobs.iter(start=1000) + +There are several filters like spider, state, has_tag, lacks_tag, +startts and endts (check `list endpoint`_ for more details). + +To get jobs filtered by tags:: -Dict entries returned by ``iter`` method contain some additional meta, but can be -easily converted to ``Job`` instances with:: + >>> jobs_metadata = project.jobs.iter(has_tag=['new', 'verified'], lacks_tag='obsolete') + +List of tags has ``OR`` power, so in the case above jobs with 'new' or +'verified' tag are expected. + +To get certain number of last finished jobs per some spider:: + + >>> jobs_metadata = project.jobs.iter(spider='foo', state='finished', count=3) + +There are 4 possible job states, which can be used as values +for filtering by state: + +- pending +- running +- finished +- deleted + +Dict entries returned by ``iter`` method contain some additional meta, +but can be easily converted to ``Job`` instances with:: >>> [Job(x['key']) for x in jobs] [ @@ -183,6 +210,9 @@ easily converted to ``Job`` instances with:: , ] +Show summaries +^^^^^^^^^^^^^^ + To check jobs summary:: >>> spider.jobs.summary() @@ -209,18 +239,18 @@ It's also possible to get last job summary (for each spider):: 'ts': 1482911615830, 'version': 'some-version'}] -(Note that there can be a lot of spiders, so the method above returns an iterator.) +Note that there can be a lot of spiders, so the method above returns an iterator. Job --- -Job instance provides access to job data: +Job instance provides access to a job data with the following fields: +- metadata - items - logs - requests - samples -- metadata Request to cancel a job:: @@ -230,7 +260,6 @@ To delete a job:: >>> job.delete() - Metadata ^^^^^^^^ @@ -306,6 +335,7 @@ To retrieve all samples for a job:: >>> sample [1482233732452, 0, 0, 0, 0, 0] + Additional features =================== @@ -389,6 +419,7 @@ To remove existing tag ``existing`` for all spider jobs:: Modifying tags is available on spider/job levels. + .. _Scrapinghub API: http://doc.scrapinghub.com/api.html .. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count .. 
_list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list From b51f1aa237f504ccd51aab9649afc74e6a4a3c44 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 28 Dec 2016 14:18:41 +0300 Subject: [PATCH 7/8] Extend jobs section --- README_client.rst | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/README_client.rst b/README_client.rst index f0aad963..d0eec24c 100644 --- a/README_client.rst +++ b/README_client.rst @@ -115,6 +115,9 @@ Jobs Jobs collection is available on project/spider level. +get +^^^ + To select a specific job for a project:: >>> job = project.jobs.get('123/1/2') @@ -125,10 +128,33 @@ Also there's a shortcut to get same job with client instance:: >>> job = client.get_job('123/1/2') +schedule +^^^^^^^^ + Use ``schedule`` method to schedule a new job for project/spider:: >>> job = spider.jobs.schedule() +Scheduling logic supports different options, like + +- units to specify amount of units to schedule the job +- job_settings to pass additional settings for the job +- priority to set higher/lower priority of the job +- add_tag to create a job with a set of initial tags +- meta to pass additional custom metadata + +For example, to schedule a new job for a given spider with custom params:: + + >>> job = spider.jobs.schedule(units=2, job_settings={'SETTING': 'VALUE'}, + priority=1, add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) + +Note that if you schedule a job on project level, spider name is required:: + + >>> job = project.jobs.schedule('spider1') + +count +^^^^^ + It's also possible to count jobs for a given project/spider:: >> spider.jobs.count() @@ -137,14 +163,14 @@ It's also possible to count jobs for a given project/spider:: Count logic supports different filters, as described for `count endpoint`_. -List jobs -^^^^^^^^^ +iter +^^^^ To iterate through the spider jobs (descending order):: >>> jobs_metadata = spider.jobs.iter() >>> [j['key'] for j in jobs_metadata] - ['1111111/1/3', '1111111/1/2', '1111111/1/1'] + ['123/1/3', '123/1/2', '123/1/1'] ``jobs_metadata`` is an iterator and, when iterated, returns an iterable of dict objects, so you typically use it like this:: @@ -154,8 +180,8 @@ of dict objects, so you typically use it like this:: Or, if you just want to get the job ids:: - >>> [x['key'] for x in jobs] - ['123/1/1', '123/1/2', '123/1/3'] + >>> [x['key'] for x in jobs_metadata] + ['123/1/3', '123/1/2', '123/1/1'] Job metadata fieldset from ``iter()`` is less detailed than ``job.metadata``, but contains few new fields as well. Additional fields can be requested using @@ -210,8 +236,8 @@ but can be easily converted to ``Job`` instances with:: , ] -Show summaries -^^^^^^^^^^^^^^ +summary +^^^^^^^ To check jobs summary:: From ba91a3a57042c0513abdad8aa5a68d5f544288bd Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 28 Dec 2016 14:20:40 +0300 Subject: [PATCH 8/8] Decrease content depth for simplicity --- README_client.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_client.rst b/README_client.rst index d0eec24c..a3e06b99 100644 --- a/README_client.rst +++ b/README_client.rst @@ -8,7 +8,7 @@ with the `Scrapinghub API`_. It takes best from ``scrapinghub.Connection`` and ``scrapinghub.HubstorageClient`` and combines it under single interface. -.. contents:: :depth: 3 +.. contents:: :depth: 2 Basic usage
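
Putting it all together, a minimal end-to-end sketch (the project id, spider name and ``count`` value below are placeholders) could look like::

    >>> from scrapinghub import ScrapinghubClient
    >>> client = ScrapinghubClient('APIKEY')
    >>> project = client.get_project(123)
    >>> spider = project.spiders.get('spider1')
    >>> for job_meta in spider.jobs.iter(state='finished', count=2):
    ...     job = client.get_job(job_meta['key'])
    ...     for item in job.items.iter():
    ...         pass  # each item is a plain dict of scraped fields

Every call above is covered in the sections of this README; only the concrete ids and values are made up for illustration.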