diff --git a/.gitignore b/.gitignore index d9c603d5..50003f37 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ dist /.cache .coverage +# documentation +docs/_build diff --git a/README.rst b/README.rst index 4e597115..57386e53 100644 --- a/README.rst +++ b/README.rst @@ -5,13 +5,9 @@ Client interface for Scrapinghub API .. image:: https://secure.travis-ci.org/scrapinghub/python-scrapinghub.png?branch=master :target: http://travis-ci.org/scrapinghub/python-scrapinghub - The ``scrapinghub`` is a Python library for communicating with the `Scrapinghub API`_. -.. contents:: :depth: 2 - - Requirements ============ @@ -31,947 +27,11 @@ response time and improved bandwidth usage:: pip install scrapinghub[msgpack] -New client -========== - -The ``scrapinghub.ScrapinghubClient`` is a new Python client for communicating -with the `Scrapinghub API`_. It takes best from ``scrapinghub.Connection`` and -``scrapinghub.HubstorageClient`` and combines it under single interface. - -First, you instantiate new client:: - - >>> from scrapinghub import ScrapinghubClient - >>> client = ScrapinghubClient('APIKEY') - >>> client - - -Client instance has ``projects`` field for access to client projects. - -Projects --------- - -You can list the projects available to your account:: - - >>> client.projects.list() - [123, 456] - -Or check the projects summary:: - - >>> client.projects.summary() - [{'finished': 674, - 'has_capacity': True, - 'pending': 0, - 'project': 123, - 'running': 1}, - {'finished': 33079, - 'has_capacity': True, - 'pending': 0, - 'project': 456, - 'running': 2}] - -And select a particular project to work with:: - - >>> project = client.get_project(123) - >>> project - - >>> project.key - '123' - -The above is a shortcut for ``client.projects.get(123)``. - -Project -------- - -Project instance has ``jobs`` field to work with the project jobs. - -Jobs instance is described well in ``Jobs`` section below. - -For example, to schedule a spider run (it returns a job object):: - - >>> project.jobs.run('spider1', job_args={'arg1':'val1'}) - > - -Project instance also has the following fields: - -- activity - access to project activity records -- collections - work with project collections (see ``Collections`` section) -- frontiers - using project frontier (see ``Frontiers`` section) -- settings - interface to project settings -- spiders - access to spiders collection (see ``Spiders`` section) - - -Settings --------- - -To get a list of the project settings:: - - >>> project.settings.list() - [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] - -To get a project setting value by name:: - - >>> project.settings.get('job_runtime_limit') - 24 - -To update a project setting value by name:: - - >>> project.settings.set('job_runtime_limit', 20) - -Or update a few project settings at once:: - - >>> project.settings.update({'default_job_units': 1, - ... 'job_runtime_limit': 20}) - - -Spiders -------- - -To get the list of spiders of the project:: - - >>> project.spiders.list() - [ - {'id': 'spider1', 'tags': [], 'type': 'manual', 'version': '123'}, - {'id': 'spider2', 'tags': [], 'type': 'manual', 'version': '123'} - ] - -To select a particular spider to work with:: - - >>> spider = project.spiders.get('spider2') - >>> spider - - >>> spider.key - '123/2' - >>> spider.name - spider2 - -Spider ------- - -Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. 
- -To schedule a spider run:: - - >>> spider.jobs.run(job_args={'arg1:'val1'}) - > - -Note that you don't need to specify spider name explicitly. - -Jobs ----- - -Jobs collection is available on project/spider level. - -get -^^^ - -To select a specific job for a project:: - - >>> job = project.jobs.get('123/1/2') - >>> job.key - '123/1/2' - -Also there's a shortcut to get same job with client instance:: - - >>> job = client.get_job('123/1/2') - -run -^^^ - -Use ``run`` method to run a new job for project/spider:: - - >>> job = spider.jobs.run() - -Scheduling logic supports different options, like - -- spider_args to provide spider arguments for the job -- units to specify amount of units to run the job -- job_settings to pass additional settings for the job -- priority to set higher/lower priority of the job -- add_tag to create a job with a set of initial tags -- meta to pass additional custom metadata - -For example, to run a new job for a given spider with custom params:: - - >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, - priority=1, add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) - -Note that if you run a job on project level, spider name is required:: - - >>> job = project.jobs.run('spider1') - -count -^^^^^ - -It's also possible to count jobs for a given project/spider:: - - >>> spider.jobs.count() - 5 - -Count logic supports different filters, as described for `count endpoint`_. - - -iter -^^^^ - -To iterate through the spider jobs (descending order):: - - >>> jobs_summary = spider.jobs.iter() - >>> [j['key'] for j in jobs_summary] - ['123/1/3', '123/1/2', '123/1/1'] - -``jobs_summary`` is an iterator and, when iterated, returns an iterable -of dict objects, so you typically use it like this:: - - >>> for job in jobs_summary: - ... # do something with job data - -Or, if you just want to get the job ids:: - - >>> [x['key'] for x in jobs_summary] - ['123/1/3', '123/1/2', '123/1/1'] - -Job summary fieldset from ``iter()`` is less detailed than ``job.metadata``, -but contains few new fields as well. Additional fields can be requested using -the ``jobmeta`` parameter. If it used, then it's up to the user to list all the -required fields, so only few default fields would be added except requested -ones:: - - >>> job_summary = next(project.jobs.iter()) - >>> job_summary.get('spider', 'missing') - 'foo' - >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by', ]) - >>> job_summary = next(jobs_summary) - >>> job_summary.get('scheduled_by', 'missing') - 'John' - >>> job_summary.get('spider', 'missing') - missing - -By default ``jobs.iter()`` returns maximum last 1000 results. -Pagination is available using the ``start`` parameter:: - - >>> jobs_summary = spider.jobs.iter(start=1000) - -There are several filters like spider, state, has_tag, lacks_tag, -startts and endts (check `list endpoint`_ for more details). - -To get jobs filtered by tags:: - - >>> jobs_summary = project.jobs.iter(has_tag=['new', 'verified'], lacks_tag='obsolete') - -List of tags has ``OR`` power, so in the case above jobs with 'new' or -'verified' tag are expected. 
- -To get certain number of last finished jobs per some spider:: - - >>> jobs_summary = project.jobs.iter(spider='foo', state='finished', count=3) - -There are 4 possible job states, which can be used as values -for filtering by state: - -- pending -- running -- finished -- deleted - -Dict entries returned by ``iter`` method contain some additional meta, -but can be easily converted to ``Job`` instances with:: - - >>> [Job(x['key']) for x in jobs] - [ - , - , - , - ] - -summary -^^^^^^^ - -To check jobs summary:: - - >>> spider.jobs.summary() - [{'count': 0, 'name': 'pending', 'summary': []}, - {'count': 0, 'name': 'running', 'summary': []}, - {'count': 5, - 'name': 'finished', - 'summary': [...]} - -It's also possible to get last jobs summary (for each spider):: - - >>> list(sp.jobs.iter_last()) - [{'close_reason': 'success', - 'elapsed': 3062444, - 'errors': 1, - 'finished_time': 1482911633089, - 'key': '123/1/3', - 'logs': 8, - 'pending_time': 1482911596566, - 'running_time': 1482911598909, - 'spider': 'spider1', - 'state': 'finished', - 'ts': 1482911615830, - 'version': 'some-version'}] - -Note that there can be a lot of spiders, so the method above returns an iterator. - -Job ---- - -Job instance provides access to a job data with the following fields: - -- metadata -- items -- logs -- requests -- samples - -Request to cancel a job:: - - >>> job.cancel() - -To delete a job:: - - >>> job.delete() - -Metadata -^^^^^^^^ - -Job details can be found in jobs metadata and it's scrapystats:: - - >>> job.metadata.get('version') - '5123a86-master' - >>> job.metadata.get('scrapystats') - ... - 'downloader/response_count': 104, - 'downloader/response_status_count/200': 104, - 'finish_reason': 'finished', - 'finish_time': 1447160494937, - 'item_scraped_count': 50, - 'log_count/DEBUG': 157, - 'log_count/INFO': 1365, - 'log_count/WARNING': 3, - 'memusage/max': 182988800, - 'memusage/startup': 62439424, - ... - -Anything can be stored in metadata, here is example how to add tags:: - - >>> job.metadata.set('tags', ['obsolete']) - -Items -^^^^^ - -To retrieve all scraped items from a job:: - - >>> for item in job.items.iter(): - ... # do something with item (it's just a dict) - -Logs -^^^^ - -To retrieve all log entries from a job:: - - >>> for logitem in job.logs.iter(): - ... # logitem is a dict with level, message, time - >>> logitem - { - 'level': 20, - 'message': '[scrapy.core.engine] Closing spider (finished)', - 'time': 1482233733976}, - } - -Requests -^^^^^^^^ - -To retrieve all requests from a job:: - - >>> for reqitem in job.requests.iter(): - ... # reqitem is a dict - >>> reqitem - [{ - 'duration': 354, - 'fp': '6d748741a927b10454c83ac285b002cd239964ea', - 'method': 'GET', - 'rs': 1270, - 'status': 200, - 'time': 1482233733870, - 'url': 'https://example.com' - }] - -Samples -^^^^^^^ - -To retrieve all samples for a job:: - - >>> for sample in job.samples.iter(): - ... 
# sample is a list with a timestamp and data - >>> sample - [1482233732452, 0, 0, 0, 0, 0] - - -Activity --------- - -To retrieve all activity events from a project:: - - >>> project.activity.iter() - - - >>> project.activity.list() - [{'event': 'job:completed', 'job': '123/2/3', 'user': 'jobrunner'}, - {'event': 'job:cancelled', 'job': '123/2/3', 'user': 'john'}] - -To post a new activity event:: - - >>> event = {'event': 'job:completed', 'job': '123/2/4', 'user': 'john'} - >>> project.activity.add(event) - -Or post multiple events at once:: - - >>> events = [ - {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'}, - {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, - ] - >>> project.activity.add(events) - - -Collections ------------ - -As an example, let's store hash and timestamp pair for foo spider. - -Usual workflow with `Collections`_ would be:: - - >>> collections = project.collections - >>> foo_store = collections.get_store('foo_store') - >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) - >>> foo_store.count() - 1 - >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') - {u'value': u'1447221694537'} - >>> # iterate over _key & value pair - ... list(foo_store.iter()) - [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] - >>> # filter by multiple keys - only values for keys that exist will be returned - ... list(foo_store.iter(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])) - [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] - >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') - >>> foo_store.count() - 0 - -Collections are available on project level only. - -Frontiers ---------- - -Typical workflow with `Frontier`_:: - - >>> frontiers = project.frontiers - -Get all frontiers from a project to iterate through it:: - - >>> frontiers.iter() - - -List all frontiers:: - - >>> frontiers.list() - ['test', 'test1', 'test2'] - -Get a frontier by name:: - - >>> frontier = frontiers.get('test') - >>> frontier - - -Get an iterator to iterate through a frontier slots:: - - >>> frontier.iter() - - -List all slots:: - - >>> frontier.list() - ['example.com', 'example.com2'] - -Get a frontier slot by name:: - - >>> slot = frontier.get('example.com') - >>> slot - - -Add a request to the slot:: - - >>> slot.queue.add([{'fp': '/some/path.html'}]) - >>> slot.flush() - >>> slot.newcount - 1 - -``newcount`` is defined per slot, but also available per frontier and globally:: - - >>> frontier.newcount - 1 - >>> frontiers.newcount - 3 - -Add a fingerprint only to the slot:: - - >>> slot.fingerprints.add(['fp1', 'fp2']) - >>> slot.flush() - -There are convenient shortcuts: ``f`` for ``fingerprints`` and ``q`` for ``queue``. 
- -Add requests with additional parameters:: - - >>> slot.q.add([{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) - >>> slot.flush() - -To retrieve all requests for a given slot:: - - >>> reqs = slot.q.iter() - -To retrieve all fingerprints for a given slot:: - - >>> fps = slot.f.iter() - -To list all the requests use ``list()`` method (similar for ``fingerprints``):: - - >>> fps = slot.q.list() - -To delete a batch of requests:: - - >>> slot.q.delete('00013967d8af7b0001') - -To delete the whole slot from the frontier:: - - >>> slot.delete() - -Flush data of the given frontier:: - - >>> frontier.flush() - -Flush data of all frontiers of a project:: - - >>> frontiers.flush() - -Close batch writers of all frontiers of a project:: - - >>> frontiers.close() - -Frontiers are available on project level only. - -Tags ----- - -Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). - -To mark a job with tag ``consumed``:: - - >>> job.update_tags(add=['consumed']) - -To mark all spider jobs with tag ``consumed``:: - - >>> spider.jobs.update_tags(add=['consumed']) - -To remove existing tag ``existing`` for all spider jobs:: - - >>> spider.jobs.update_tags(remove=['existing']) - -Modifying tags is available on spider/job levels. - - -Exceptions ----------- - -scrapinghub.exceptions.ScrapinghubAPIError -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Base exception class. - - -scrapinghub.exceptions.InvalidUsage -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Usually raised in case of 400 response from API. - - -scrapinghub.exceptions.NotFound -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Entity doesn't exist (e.g. spider or project). - - -scrapinghub.exceptions.ValueTooLarge -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Value cannot be writtent because it exceeds size limits. - -scrapinghub.exceptions.DuplicateJobError -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Job for given spider with given arguments is already scheduled or running. - - - - -Legacy client -============= - -First, you connect to Scrapinghub:: - - >>> from scrapinghub import Connection - >>> conn = Connection('APIKEY') - >>> conn - Connection('APIKEY') - -You can list the projects available to your account:: - - >>> conn.project_ids() - [123, 456] - -And select a particular project to work with:: - - >>> project = conn[123] - >>> project - Project(Connection('APIKEY'), 123) - >>> project.id - 123 - -To schedule a spider run (it returns the job id):: - - >>> project.schedule('myspider', arg1='val1') - u'123/1/1' - -To get the list of spiders in the project:: - - >>> project.spiders() - [ - {u'id': u'spider1', u'tags': [], u'type': u'manual', u'version': u'123'}, - {u'id': u'spider2', u'tags': [], u'type': u'manual', u'version': u'123'} - ] - -To get all finished jobs:: - - >>> jobs = project.jobs(state='finished') - -``jobs`` is a ``JobSet``. ``JobSet`` objects are iterable and, when iterated, -return an iterable of ``Job`` objects, so you typically use it like this:: - - >>> for job in jobs: - ... # do something with job - -Or, if you just want to get the job ids:: - - >>> [x.id for x in jobs] - [u'123/1/1', u'123/1/2', u'123/1/3'] - -To select a specific job:: - - >>> job = project.job(u'123/1/2') - >>> job.id - u'123/1/2' - -To retrieve all scraped items from a job:: - - >>> for item in job.items(): - ... # do something with item (it's just a dict) - -To retrieve all log entries from a job:: - - >>> for logitem in job.log(): - ... 
# logitem is a dict with logLevel, message, time - -To get job info:: - - >>> job.info['spider'] - 'myspider' - >>> job.info['started_time'] - '2010-09-28T15:09:57.629000' - >>> job.info['tags'] - [] - >>> job.info['fields_count]['description'] - 1253 - -To mark a job with tag ``consumed``:: - - >>> job.update(add_tag='consumed') - -To mark several jobs with tag ``consumed`` (``JobSet`` also supports the -``update()`` method):: - - >>> project.jobs(state='finished').update(add_tag='consumed') - -To delete a job:: - - >>> job.delete() - -To delete several jobs (``JobSet`` also supports the ``update()`` method):: - - >>> project.jobs(state='finished').delete() - - -Legacy Hubstorage client -======================== - -The library can also be used for interaction with spiders, jobs and scraped data through ``storage.scrapinghub.com`` endpoints. - -First, use your API key for authorization:: - - >>> from scrapinghub import HubstorageClient - >>> hc = HubstorageClient(auth='apikey') - >>> hc.server_timestamp() - 1446222762611 - -Project -------- - -To get project settings or jobs summary:: - - >>> project = hc.get_project('1111111') - >>> project.settings['botgroups'] - [u'botgroup1', ] - >>> project.jobsummary() - {u'finished': 6, - u'has_capacity': True, - u'pending': 0, - u'project': 1111111, - u'running': 0} - -Spider ------- - -To get spider id correlated with its name:: - - >>> project.ids.spider('foo') - 1 - -To see last jobs summaries:: - - >>> summaries = project.spiders.lastjobsummary(count=3) - -To get job summary per spider:: - - >>> summary = project.spiders.lastjobsummary(spiderid='1') - -Job ---- - -Job can be **retrieved** directly by id (project_id/spider_id/job_id):: - - >>> job = hc.get_job('1111111/1/1') - >>> job.key - '1111111/1/1' - >>> job.metadata['state'] - u'finished' - -**Creating** a new job requires a spider name:: - - >>> job = hc.push_job(projectid='1111111', spidername='foo') - >>> job.key - '1111111/1/1' - -Priority can be between 0 and 4 (from lowest to highest), the default is 2. - -To push job from project level with the highest priority:: - - >>> job = project.push_job(spidername='foo', priority=4) - >>> job.metadata['priority'] - 4 - -Pushing a job with spider arguments:: - - >>> project.push_job(spidername='foo', spider_args={'arg1': 'foo', 'arg2': 'bar'}) - -Running job can be **cancelled** by calling ``request_cancel()``:: - - >>> job.request_cancel() - >>> job.metadata['cancelled_by'] - u'John' - -To **delete** job:: - - >>> job.purged() - >>> job.metadata['state'] - u'deleted' - -Job details ------------ - -Job details can be found in jobs metadata and it's scrapystats:: - - >>> job = hc.get_job('1111111/1/1') - >>> job.metadata['version'] - u'5123a86-master' - >>> job.metadata['scrapystats'] - ... - u'downloader/response_count': 104, - u'downloader/response_status_count/200': 104, - u'finish_reason': u'finished', - u'finish_time': 1447160494937, - u'item_scraped_count': 50, - u'log_count/DEBUG': 157, - u'log_count/INFO': 1365, - u'log_count/WARNING': 3, - u'memusage/max': 182988800, - u'memusage/startup': 62439424, - ... 
- -Anything can be stored in metadata, here is example how to add tags:: - - >>> job.update_metadata({'tags': 'obsolete'}) - -Jobs ----- - -To iterate through all jobs metadata per project (descending order):: - - >>> jobs_metadata = project.jobq.list() - >>> [j['key'] for j in jobs_metadata] - ['1111111/1/3', '1111111/1/2', '1111111/1/1'] - -Jobq metadata fieldset is less detailed, than ``job.metadata``, but contains few new fields as well. -Additional fields can be requested using the ``jobmeta`` parameter. -If it used, then it's up to the user to list all the required fields, so only few default fields would be added except requested ones:: - - >>> metadata = next(project.jobq.list()) - >>> metadata.get('spider', 'missing') - u'foo' - >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by', ]) - >>> metadata = next(jobs_metadata) - >>> metadata.get('scheduled_by', 'missing') - u'John' - >>> metadata.get('spider', 'missing') - missing - -By default ``jobq.list()`` returns maximum last 1000 results. Pagination is available using the ``start`` parameter:: - - >>> jobs_metadata = project.jobq.list(start=1000) - -There are several filters like spider, state, has_tag, lacks_tag, startts and endts. -To get jobs filtered by tags:: - - >>> jobs_metadata = project.jobq.list(has_tag=['new', 'verified'], lacks_tag='obsolete') - -List of tags has ``OR`` power, so in the case above jobs with 'new' or 'verified' tag are expected. - -To get certain number of last finished jobs per some spider:: - - >>> jobs_metadata = project.jobq.list(spider='foo', state='finished' count=3) - -There are 4 possible job states, which can be used as values for filtering by state: - -- pending -- running -- finished -- deleted - - -Items ------ - -To iterate through items:: - - >>> items = job.items.iter_values() - >>> for item in items: - # do something, item is just a dict - -Logs ----- - -To iterate through 10 first logs for example:: - - >>> logs = job.logs.iter_values(count=10) - >>> for log in logs: - # do something, log is a dict with log level, message and time keys - -Collections ------------ - -Let's store hash and timestamp pair for foo spider. Usual workflow with `Collections`_ would be:: - - >>> collections = project.collections - >>> foo_store = collections.new_store('foo_store') - >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) - >>> foo_store.count() - 1 - >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') - {u'value': u'1447221694537'} - >>> # iterate over _key & value pair - ... list(foo_store.iter_values()) - [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] - >>> # filter by multiple keys - only values for keys that exist will be returned - ... 
list(foo_store.iter_values(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])) - [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] - >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') - >>> foo_store.count() - 0 - -Frontier --------- - -Typical workflow with `Frontier`_:: - - >>> frontier = project.frontier - -Add a request to the frontier:: - - >>> frontier.add('test', 'example.com', [{'fp': '/some/path.html'}]) - >>> frontier.flush() - >>> frontier.newcount - 1 - -Add requests with additional parameters:: - - >>> frontier.add('test', 'example.com', [{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) - >>> frontier.flush() - >>> frontier.newcount - 2 - -To delete the slot ``example.com`` from the frontier:: - - >>> frontier.delete_slot('test', 'example.com') - -To retrieve requests for a given slot:: - - >>> reqs = frontier.read('test', 'example.com') - -To delete a batch of requests:: - - >>> frontier.delete('test', 'example.com', '00013967d8af7b0001') - -To retrieve fingerprints for a given slot:: - - >>> fps = [req['requests'] for req in frontier.read('test', 'example.com')] - -Tests -===== - -The package is covered with integration tests based on `VCR.py library`_: there -are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP -requests to real services, it helps to simplify and speed up development. - -By default, tests use VCR.py ``once`` mode to: - -- replay previously recorded interactions. -- record new interactions if there is no cassette file. -- cause an error to be raised for new requests if there is a cassette file. - -It means that if you add new integration tests and run all tests as usual, -only new cassettes will be created, all existing cassettes will stay unmodified. - -To ignore existing cassettes and use real service, please provide a flag:: - - py.test --ignore-cassettes - -If you want to update/recreate all the cassettes from scratch, please use:: - - py.test --update-cassettes +Documentation +------------- -Note that internally the above command erases the whole folder with cassettes. +Documentation is `available online`_ via Read the Docs or in the ``docs`` directory. .. _Scrapinghub API: http://doc.scrapinghub.com/api.html -.. _Collections: http://doc.scrapinghub.com/api/collections.html -.. _Frontier: http://doc.scrapinghub.com/api/frontier.html -.. _VCR.py library: https://pypi.python.org/pypi/vcrpy -.. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count -.. _list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list +.. _available online: https://python-scrapinghub.readthedocs.io/ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..c44acee0 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,21 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = python-scrapinghub +SPHINXAPIDOCS = sphinx-apidoc +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/client/apidocs.rst b/docs/client/apidocs.rst new file mode 100644 index 00000000..e83f5c0f --- /dev/null +++ b/docs/client/apidocs.rst @@ -0,0 +1,97 @@ +API Reference +============= + +Client object +------------- + +.. automodule:: scrapinghub.client + :members: + :undoc-members: + :inherited-members: + +Activity +-------- + +.. automodule:: scrapinghub.client.activity + :members: + :undoc-members: + :inherited-members: + +Collections +----------- + +.. automodule:: scrapinghub.client.collections + :members: + :undoc-members: + :inherited-members: + +Exceptions +---------- + +.. automodule:: scrapinghub.client.exceptions + :members: + :undoc-members: + +Frontiers +--------- + +.. automodule:: scrapinghub.client.frontiers + :members: + :undoc-members: + :inherited-members: + +Items +----- + +.. automodule:: scrapinghub.client.items + :members: + :undoc-members: + :inherited-members: + +Jobs +---- + +.. automodule:: scrapinghub.client.jobs + :members: + :undoc-members: + :inherited-members: + +Logs +---- + +.. automodule:: scrapinghub.client.logs + :members: + :undoc-members: + :inherited-members: + +Projects +-------- + +.. automodule:: scrapinghub.client.projects + :members: + :undoc-members: + :inherited-members: + +Requests +-------- + +.. automodule:: scrapinghub.client.requests + :members: + :undoc-members: + :inherited-members: + +Samples +------- + +.. automodule:: scrapinghub.client.samples + :members: + :undoc-members: + :inherited-members: + +Spiders +------- + +.. automodule:: scrapinghub.client.spiders + :members: + :undoc-members: + :inherited-members: diff --git a/docs/client/overview.rst b/docs/client/overview.rst new file mode 100644 index 00000000..126e4fee --- /dev/null +++ b/docs/client/overview.rst @@ -0,0 +1,602 @@ +Overview +======== + +The :class:`~scrapinghub.client.ScrapinghubClient` is a new Python client for +communicating with the `Scrapinghub API`_. +It takes best from :class:`~scrapinghub.legacy.Connection` and +:class:`~scrapinghub.hubstorage.HubstorageClient`, and combines it under single +interface. + +First, you instantiate new client:: + + >>> from scrapinghub import ScrapinghubClient + >>> client = ScrapinghubClient('APIKEY') + >>> client + + +Client instance has :attr:`~scrapinghub.client.ScrapinghubClient.projects` field +for access to client projects. + +Projects +-------- + +You can list the :class:`~scrapinghub.client.projects.Projects` available to your +account:: + + >>> client.projects.list() + [123, 456] + +Or check the projects summary:: + + >>> client.projects.summary() + [{'finished': 674, + 'has_capacity': True, + 'pending': 0, + 'project': 123, + 'running': 1}, + {'finished': 33079, + 'has_capacity': True, + 'pending': 0, + 'project': 456, + 'running': 2}] + +And select a particular project to work with:: + + >>> project = client.get_project(123) + >>> project + + >>> project.key + '123' + +.. tip:: The above is a shortcut for ``client.projects.get(123)``. + + +Project +------- + +:class:`~scrapinghub.client.projects.Project` instance has +:attr:`~scrapinghub.client.projects.Project.jobs` field to work with +the project jobs. + +:class:`~scrapinghub.client.jobs.Jobs` instance is described well in +:ref:`Jobs ` section below. 
+ +For example, to schedule a spider run (it returns a +:class:`~scrapinghub.client.jobs.Job` object):: + + >>> project.jobs.run('spider1', job_args={'arg1': 'val1'}) + > + + +Spiders +------- + +Spiders collection is accessible via :class:`~scrapinghub.client.spiders.Spiders`. + +To get the list of spiders of the project:: + + >>> project.spiders.list() + [ + {'id': 'spider1', 'tags': [], 'type': 'manual', 'version': '123'}, + {'id': 'spider2', 'tags': [], 'type': 'manual', 'version': '123'} + ] + +To select a particular spider to work with:: + + >>> spider = project.spiders.get('spider2') + >>> spider + + >>> spider.key + '123/2' + >>> spider.name + spider2 + +.. _spider: + +Spider +------ + +Like project instance, :class:`~scrapinghub.client.spiders.Spider` instance has +``jobs`` field to work with the spider's jobs. + +To schedule a spider run:: + + >>> spider.jobs.run(job_args={'arg1': 'val1'}) + > + +Note that you don't need to specify spider name explicitly. + +.. _jobs: + +Jobs +---- + +:class:`~scrapinghub.client.jobs.Jobs` collection is available on project/spider +level. + +get +^^^ + +To select a specific job for a project:: + + >>> job = project.jobs.get('123/1/2') + >>> job.key + '123/1/2' + +Also there's a shortcut to get same job with client instance:: + + >>> job = client.get_job('123/1/2') + +run +^^^ + +Use ``run`` method to run a new job for project/spider:: + + >>> job = spider.jobs.run() + +Scheduling logic supports different options, like + +- **job_args** to provide arguments for the job +- **units** to specify amount of units to run the job +- **job_settings** to pass additional settings for the job +- **priority** to set higher/lower priority of the job +- **add_tag** to create a job with a set of initial tags +- **meta** to pass additional custom metadata + +For example, to run a new job for a given spider with custom params:: + + >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, priority=1, + ... add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) + +Note that if you run a job on project level, spider name is required:: + + >>> job = project.jobs.run('spider1') + +count +^^^^^ + +It's also possible to count jobs for a given project/spider:: + + >>> spider.jobs.count() + 5 + +Count logic supports different filters, as described for `count endpoint`_. + + +iter +^^^^ + +To iterate through the spider jobs (descending order):: + + >>> jobs_summary = spider.jobs.iter() + >>> [j['key'] for j in jobs_summary] + ['123/1/3', '123/1/2', '123/1/1'] + +``jobs_summary`` is an iterator and, when iterated, returns an iterable +of dict objects, so you typically use it like this:: + + >>> for job in jobs_summary: + ... # do something with job data + +Or, if you just want to get the job ids:: + + >>> [x['key'] for x in jobs_summary] + ['123/1/3', '123/1/2', '123/1/1'] + +Job summary fieldset from ``iter()`` is less detailed than ``job.metadata``, +but contains few new fields as well. Additional fields can be requested using +the ``jobmeta`` parameter. 
If it is used, it is up to the user to list all the
+required fields, since only a few default fields are added besides the
+requested ones::
+
+    >>> job_summary = next(project.jobs.iter())
+    >>> job_summary.get('spider', 'missing')
+    'foo'
+    >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by'])
+    >>> job_summary = next(jobs_summary)
+    >>> job_summary.get('scheduled_by', 'missing')
+    'John'
+    >>> job_summary.get('spider', 'missing')
+    missing
+
+By default ``jobs.iter()`` returns at most the last 1000 results.
+Pagination is available using the ``start`` parameter::
+
+    >>> jobs_summary = spider.jobs.iter(start=1000)
+
+There are several filters like spider, state, has_tag, lacks_tag,
+startts and endts (check the `list endpoint`_ for more details).
+
+To get jobs filtered by tags::
+
+    >>> jobs_summary = project.jobs.iter(has_tag=['new', 'verified'], lacks_tag='obsolete')
+
+The list of tags in **has_tag** is combined with ``OR``, so the case above
+returns jobs with either the ``new`` or the ``verified`` tag (while the list
+of tags in **lacks_tag** is combined with ``AND``).
+
+To get a certain number of the last finished jobs for a given spider::
+
+    >>> jobs_summary = project.jobs.iter(spider='foo', state='finished', count=3)
+
+There are 4 possible job states, which can be used as values
+for filtering by state:
+
+- pending
+- running
+- finished
+- deleted
+
+Dictionary entries returned by the ``iter`` method contain some additional
+metadata, but can be easily converted to :class:`~scrapinghub.client.jobs.Job`
+instances with::
+
+    >>> [Job(client, x['key']) for x in jobs]
+    [
+    ,
+    ,
+    ,
+    ]
+
+summary
+^^^^^^^
+
+To check jobs summary::
+
+    >>> spider.jobs.summary()
+    [{'count': 0, 'name': 'pending', 'summary': []},
+     {'count': 0, 'name': 'running', 'summary': []},
+     {'count': 5,
+      'name': 'finished',
+      'summary': [...]}]
+
+It's also possible to get the last job summary for each spider::
+
+    >>> list(spider.jobs.iter_last())
+    [{'close_reason': 'success',
+      'elapsed': 3062444,
+      'errors': 1,
+      'finished_time': 1482911633089,
+      'key': '123/1/3',
+      'logs': 8,
+      'pending_time': 1482911596566,
+      'running_time': 1482911598909,
+      'spider': 'spider1',
+      'state': 'finished',
+      'ts': 1482911615830,
+      'version': 'some-version'}]
+
+Note that there can be a lot of spiders, so the method above returns an iterator.
+
+
+update_tags
+^^^^^^^^^^^
+
+Tags are a convenient way to mark specific jobs (for better search, postprocessing, etc.).
+
+To mark all spider jobs with tag ``consumed``::
+
+    >>> spider.jobs.update_tags(add=['consumed'])
+
+To remove the existing tag ``existing`` from all spider jobs::
+
+    >>> spider.jobs.update_tags(remove=['existing'])
+
+Modifying tags is available on the :class:`~scrapinghub.client.spiders.Spider`
+and :class:`~scrapinghub.client.jobs.Job` levels.
+
+
+Job
+---
+
+A :class:`~scrapinghub.client.jobs.Job` instance provides access to job data
+with the following fields:
+
+- metadata
+- items
+- logs
+- requests
+- samples
+
+To request cancellation of a job::
+
+    >>> job.cancel()
+
+To delete a job::
+
+    >>> job.delete()
+
+To mark a job with tag ``consumed``::
+
+    >>> job.update_tags(add=['consumed'])
+
+.. _job-metadata:
+
+Metadata
+^^^^^^^^
+
+:class:`~scrapinghub.client.jobs.JobMeta` details can be found in the job
+metadata and its scrapystats::
+
+    >>> job.metadata.get('version')
+    '5123a86-master'
+    >>> job.metadata.get('scrapystats')
+    ...
+    'downloader/response_count': 104,
+    'downloader/response_status_count/200': 104,
+    'finish_reason': 'finished',
+    'finish_time': 1447160494937,
+    'item_scraped_count': 50,
+    'log_count/DEBUG': 157,
+    'log_count/INFO': 1365,
+    'log_count/WARNING': 3,
+    'memusage/max': 182988800,
+    'memusage/startup': 62439424,
+    ...
+
+Anything can be stored in metadata; here is an example of how to add tags::
+
+    >>> job.metadata.set('tags', ['obsolete'])
+
+.. _job-items:
+
+Items
+^^^^^
+
+To retrieve all scraped items from a job, use
+:class:`~scrapinghub.client.items.Items`::
+
+    >>> for item in job.items.iter():
+    ...     # do something with item (it's just a dict)
+
+.. _job-logs:
+
+Logs
+^^^^
+
+To retrieve all log entries from a job, use :class:`~scrapinghub.client.logs.Logs`::
+
+    >>> for logitem in job.logs.iter():
+    ...     # logitem is a dict with level, message, time
+    >>> logitem
+    {'level': 20,
+     'message': '[scrapy.core.engine] Closing spider (finished)',
+     'time': 1482233733976}
+
+.. _job-requests:
+
+Requests
+^^^^^^^^
+
+To retrieve all requests from a job, there's :class:`~scrapinghub.client.requests.Requests`::
+
+    >>> for reqitem in job.requests.iter():
+    ...     # reqitem is a dict
+    >>> reqitem
+    [{
+        'duration': 354,
+        'fp': '6d748741a927b10454c83ac285b002cd239964ea',
+        'method': 'GET',
+        'rs': 1270,
+        'status': 200,
+        'time': 1482233733870,
+        'url': 'https://example.com'
+    }]
+
+.. _job-samples:
+
+Samples
+^^^^^^^
+
+:class:`~scrapinghub.client.samples.Samples` is useful for retrieving all samples
+for a job::
+
+    >>> for sample in job.samples.iter():
+    ...     # sample is a list with a timestamp and data
+    >>> sample
+    [1482233732452, 0, 0, 0, 0, 0]
+
+
+Activity
+--------
+
+:class:`~scrapinghub.client.activity.Activity` provides a convenient interface
+to project activity events.
+
+To retrieve all activity events from a project::
+
+    >>> project.activity.iter()
+
+
+    >>> project.activity.list()
+    [{'event': 'job:completed', 'job': '123/2/3', 'user': 'jobrunner'},
+     {'event': 'job:cancelled', 'job': '123/2/3', 'user': 'john'}]
+
+To post a new activity event::
+
+    >>> event = {'event': 'job:completed', 'job': '123/2/4', 'user': 'john'}
+    >>> project.activity.add(event)
+
+Or post multiple events at once::
+
+    >>> events = [
+    ...     {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'},
+    ...     {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'},
+    ... ]
+    >>> project.activity.add(events)
+
+
+Collections
+-----------
+
+As an example, let's store a hash and timestamp pair for the foo spider.
+
+A usual workflow with :class:`~scrapinghub.client.collections.Collections` would be::
+
+    >>> collections = project.collections
+    >>> foo_store = collections.get_store('foo_store')
+    >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'})
+    >>> foo_store.count()
+    1
+    >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7')
+    {u'value': u'1447221694537'}
+    >>> # iterate over _key & value pair
+    ... list(foo_store.iter())
+    [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}]
+    >>> # filter by multiple keys - only values for keys that exist will be returned
+    ... list(foo_store.iter(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah']))
+    [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}]
+    >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7')
+    >>> foo_store.count()
+    0
+
+Collections are available on the project level only.
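+
+A minimal sketch of using a store to skip already-processed keys between runs,
+relying only on the calls shown above (the ``seen_items`` store name and the
+fingerprint value are hypothetical)::
+
+    >>> seen = project.collections.get_store('seen_items')
+    >>> fingerprint = '002d050ee3ff6192dcbecc4e4b4457d7'
+    >>> # iter(key=[...]) returns entries only for keys that already exist
+    ... if not list(seen.iter(key=[fingerprint])):
+    ...     seen.set({'_key': fingerprint, 'value': 'processed'})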
+
+
+Frontiers
+---------
+
+A typical workflow with :class:`~scrapinghub.client.frontiers.Frontiers`::
+
+    >>> frontiers = project.frontiers
+
+Get all frontiers from a project to iterate through them::
+
+    >>> frontiers.iter()
+
+
+List all frontiers::
+
+    >>> frontiers.list()
+    ['test', 'test1', 'test2']
+
+Get a :class:`~scrapinghub.client.frontiers.Frontier` instance by name::
+
+    >>> frontier = frontiers.get('test')
+    >>> frontier
+
+
+Get an iterator to iterate through the slots of a frontier::
+
+    >>> frontier.iter()
+
+
+List all slots::
+
+    >>> frontier.list()
+    ['example.com', 'example.com2']
+
+Get a :class:`~scrapinghub.client.frontiers.FrontierSlot` by name::
+
+    >>> slot = frontier.get('example.com')
+    >>> slot
+
+
+Add a request to the slot::
+
+    >>> slot.queue.add([{'fp': '/some/path.html'}])
+    >>> slot.flush()
+    >>> slot.newcount
+    1
+
+``newcount`` is defined per slot, but also available per frontier and globally::
+
+    >>> frontier.newcount
+    1
+    >>> frontiers.newcount
+    3
+
+Add a fingerprint only to the slot::
+
+    >>> slot.fingerprints.add(['fp1', 'fp2'])
+    >>> slot.flush()
+
+There are convenient shortcuts: ``f`` for ``fingerprints`` to access
+:class:`~scrapinghub.client.frontiers.FrontierSlotFingerprints` and ``q`` for
+``queue`` to access :class:`~scrapinghub.client.frontiers.FrontierSlotQueue`.
+
+Add requests with additional parameters::
+
+    >>> slot.q.add([{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}])
+    >>> slot.flush()
+
+To retrieve all requests for a given slot::
+
+    >>> reqs = slot.q.iter()
+
+To retrieve all fingerprints for a given slot::
+
+    >>> fps = slot.f.iter()
+
+To list all the requests, use the ``list()`` method (similarly for ``fingerprints``)::
+
+    >>> reqs = slot.q.list()
+
+To delete a batch of requests::
+
+    >>> slot.q.delete('00013967d8af7b0001')
+
+To delete the whole slot from the frontier::
+
+    >>> slot.delete()
+
+Flush data of the given frontier::
+
+    >>> frontier.flush()
+
+Flush data of all frontiers of a project::
+
+    >>> frontiers.flush()
+
+Close batch writers of all frontiers of a project::
+
+    >>> frontiers.close()
+
+Frontiers are available on the project level only.
+
+.. _job-tags:
+
+
+Settings
+--------
+
+You can work with project settings via :class:`~scrapinghub.client.projects.Settings`.
+
+To get a list of the project settings::
+
+    >>> project.settings.list()
+    [(u'default_job_units', 2), (u'job_runtime_limit', 24)]
+
+To get a project setting value by name::
+
+    >>> project.settings.get('job_runtime_limit')
+    24
+
+To update a project setting value by name::
+
+    >>> project.settings.set('job_runtime_limit', 20)
+
+Or update a few project settings at once::
+
+    >>> project.settings.update({'default_job_units': 1,
+    ...                          'job_runtime_limit': 20})
+
+
+Exceptions
+----------
+
+.. autoexception:: scrapinghub.ScrapinghubAPIError
+.. autoexception:: scrapinghub.BadRequest
+.. autoexception:: scrapinghub.Unauthorized
+.. autoexception:: scrapinghub.NotFound
+.. autoexception:: scrapinghub.ValueTooLarge
+.. autoexception:: scrapinghub.DuplicateJobError
+.. autoexception:: scrapinghub.ServerError
+
+
+.. _Scrapinghub API: http://doc.scrapinghub.com/api.html
+.. _Frontier: http://doc.scrapinghub.com/api/frontier.html
+.. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count
+..
_list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..c1a37395 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,222 @@ +# -*- coding: utf-8 -*- +# +# python-scrapinghub documentation build configuration file, created by +# sphinx-quickstart on Fri Mar 24 12:28:40 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +from datetime import datetime + +from docutils import nodes +from sphinx.util.docfields import TypedField +from sphinx import addnodes + + +sys.path.insert(0, os.path.abspath('..')) + + +from scrapinghub import __version__ # noqa + + +YEAR = datetime.now().year +VERSION = __version__.rsplit('.', 2)[0] + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.autodoc'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'scrapinghub' +copyright = u'2010-{}, Scrapinghub'.format(YEAR) +author = u'Scrapinghub' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = VERSION +# The full version, including alpha/beta/rc tags. +release = __version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. 
+# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] +html_static_path = [] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'python-scrapinghubdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'python-scrapinghub.tex', u'python-scrapinghub Documentation', + u'Pablo Hoffman, Daniel GraƱa', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'python-scrapinghub', u'python-scrapinghub Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'python-scrapinghub', u'python-scrapinghub Documentation', + author, 'python-scrapinghub', 'One line description of project.', + 'Miscellaneous'), +] + +# Following is taken from https://github.com/snide/sphinx_rtd_theme# +# using-this-theme-locally-then-building-on-read-the-docs + +# on_rtd is whether we are on readthedocs.org, +# this line of code grabbed from docs.readthedocs.org + +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +if not on_rtd: # only import and set the theme if we're building docs locally + import sphinx_rtd_theme + html_theme = 'sphinx_rtd_theme' + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# otherwise, readthedocs.org uses their theme by default, no need to specify it + +# disable cross-reference for ivar +# patch taken from http://stackoverflow.com/a/41184353/1932023 +def patched_make_field(self, types, domain, items): + # type: (List, unicode, Tuple) -> nodes.field + def handle_item(fieldarg, content): + par = nodes.paragraph() + par += addnodes.literal_strong('', fieldarg) # Patch: this line added + # par.extend(self.make_xrefs(self.rolename, domain, fieldarg, + # addnodes.literal_strong)) + if fieldarg in types: + par += nodes.Text(' (') + # NOTE: using .pop() here to prevent a single type node to be + # inserted twice into the doctree, which leads to + # inconsistencies later when references are resolved + fieldtype = types.pop(fieldarg) + if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text): + typename = u''.join(n.astext() for n in fieldtype) + par.extend(self.make_xrefs(self.typerolename, domain, typename, + addnodes.literal_emphasis)) + else: + par += fieldtype + par += nodes.Text(')') + par += nodes.Text(' -- ') + par += content + return par + + 
fieldname = nodes.field_name('', self.label) + if len(items) == 1 and self.can_collapse: + fieldarg, content = items[0] + bodynode = handle_item(fieldarg, content) + else: + bodynode = self.list_type() + for fieldarg, content in items: + bodynode += nodes.list_item('', handle_item(fieldarg, content)) + fieldbody = nodes.field_body('', bodynode) + return nodes.field('', fieldname, fieldbody) + + +TypedField.make_field = patched_make_field diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..ea9f6021 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,18 @@ +==================================== +Client interface for Scrapinghub API +==================================== + +.. image:: https://secure.travis-ci.org/scrapinghub/python-scrapinghub.png?branch=master + :target: http://travis-ci.org/scrapinghub/python-scrapinghub + +The ``scrapinghub`` is a Python library for communicating with the `Scrapinghub API`_. + +.. _Scrapinghub API: http://doc.scrapinghub.com/api.html + +.. toctree:: + :maxdepth: 1 + + quickstart + client/overview + client/apidocs + legacy/clients diff --git a/docs/legacy/clients.rst b/docs/legacy/clients.rst new file mode 100644 index 00000000..f5194d2f --- /dev/null +++ b/docs/legacy/clients.rst @@ -0,0 +1,8 @@ +Legacy clients +============== + +.. toctree:: + :maxdepth: 2 + + connection + hubstorage diff --git a/docs/legacy/connection.rst b/docs/legacy/connection.rst new file mode 100644 index 00000000..11e53351 --- /dev/null +++ b/docs/legacy/connection.rst @@ -0,0 +1,113 @@ +scrapinghub.Connection +====================== + +The module is the very first Python library for communicating with the Scrapinghub API. + +[WARNING] It is deprecated, please use `scrapinghub.ScrapinghubClient`_ instead. + +Overview +-------- + +First, you connect to Scrapinghub:: + + >>> from scrapinghub import Connection + >>> conn = Connection('APIKEY') + >>> conn + Connection('APIKEY') + +You can list the projects available to your account:: + + >>> conn.project_ids() + [123, 456] + +And select a particular project to work with:: + + >>> project = conn[123] + >>> project + Project(Connection('APIKEY'), 123) + >>> project.id + 123 + +To schedule a spider run (it returns the job id):: + + >>> project.schedule('myspider', arg1='val1') + u'123/1/1' + +To get the list of spiders in the project:: + + >>> project.spiders() + [ + {u'id': u'spider1', u'tags': [], u'type': u'manual', u'version': u'123'}, + {u'id': u'spider2', u'tags': [], u'type': u'manual', u'version': u'123'} + ] + +To get all finished jobs:: + + >>> jobs = project.jobs(state='finished') + +``jobs`` is a ``JobSet``. ``JobSet`` objects are iterable and, when iterated, +return an iterable of ``Job`` objects, so you typically use it like this:: + + >>> for job in jobs: + ... # do something with job + +Or, if you just want to get the job ids:: + + >>> [x.id for x in jobs] + [u'123/1/1', u'123/1/2', u'123/1/3'] + +To select a specific job:: + + >>> job = project.job(u'123/1/2') + >>> job.id + u'123/1/2' + +To retrieve all scraped items from a job:: + + >>> for item in job.items(): + ... # do something with item (it's just a dict) + +To retrieve all log entries from a job:: + + >>> for logitem in job.log(): + ... 
# logitem is a dict with logLevel, message, time + +To get job info:: + + >>> job.info['spider'] + 'myspider' + >>> job.info['started_time'] + '2010-09-28T15:09:57.629000' + >>> job.info['tags'] + [] + >>> job.info['fields_count]['description'] + 1253 + +To mark a job with tag ``consumed``:: + + >>> job.update(add_tag='consumed') + +To mark several jobs with tag ``consumed`` (``JobSet`` also supports the +``update()`` method):: + + >>> project.jobs(state='finished').update(add_tag='consumed') + +To delete a job:: + + >>> job.delete() + +To delete several jobs (``JobSet`` also supports the ``update()`` method):: + + >>> project.jobs(state='finished').delete() + + +Module contents +--------------- + +.. automodule:: scrapinghub.legacy + :members: + :undoc-members: + :show-inheritance: + + +.. _scrapinghub.ScrapinghubClient: ../client/overview.html diff --git a/docs/legacy/hubstorage.rst b/docs/legacy/hubstorage.rst new file mode 100644 index 00000000..d1e06ea4 --- /dev/null +++ b/docs/legacy/hubstorage.rst @@ -0,0 +1,249 @@ +scrapinghub.HubstorageClient +============================ + +The library can be used for interaction with spiders, jobs and scraped data through ``storage.scrapinghub.com`` endpoints. + +[WARNING] It is deprecated, please use `scrapinghub.ScrapinghubClient`_ instead. + + +Overview +-------- + +First, use your API key for authorization:: + + >>> from scrapinghub import HubstorageClient + >>> hc = HubstorageClient(auth='apikey') + >>> hc.server_timestamp() + 1446222762611 + +Project +^^^^^^^ + +To get project settings or jobs summary:: + + >>> project = hc.get_project('1111111') + >>> project.settings['botgroups'] + [u'botgroup1', ] + >>> project.jobsummary() + {u'finished': 6, + u'has_capacity': True, + u'pending': 0, + u'project': 1111111, + u'running': 0} + +Spider +^^^^^^ + +To get spider id correlated with its name:: + + >>> project.ids.spider('foo') + 1 + +To see last jobs summaries:: + + >>> summaries = project.spiders.lastjobsummary(count=3) + +To get job summary per spider:: + + >>> summary = project.spiders.lastjobsummary(spiderid='1') + +Job +^^^ + +Job can be **retrieved** directly by id (project_id/spider_id/job_id):: + + >>> job = hc.get_job('1111111/1/1') + >>> job.key + '1111111/1/1' + >>> job.metadata['state'] + u'finished' + +**Creating** a new job requires a spider name:: + + >>> job = hc.push_job(projectid='1111111', spidername='foo') + >>> job.key + '1111111/1/1' + +Priority can be between 0 and 4 (from lowest to highest), the default is 2. + +To push job from project level with the highest priority:: + + >>> job = project.push_job(spidername='foo', priority=4) + >>> job.metadata['priority'] + 4 + +Pushing a job with spider arguments:: + + >>> project.push_job(spidername='foo', spider_args={'arg1': 'foo', 'arg2': 'bar'}) + +Running job can be **cancelled** by calling ``request_cancel()``:: + + >>> job.request_cancel() + >>> job.metadata['cancelled_by'] + u'John' + +To **delete** job:: + + >>> job.purged() + >>> job.metadata['state'] + u'deleted' + +Job details +^^^^^^^^^^^ + +Job details can be found in jobs metadata and it's scrapystats:: + + >>> job = hc.get_job('1111111/1/1') + >>> job.metadata['version'] + u'5123a86-master' + >>> job.metadata['scrapystats'] + ... 
+ u'downloader/response_count': 104, + u'downloader/response_status_count/200': 104, + u'finish_reason': u'finished', + u'finish_time': 1447160494937, + u'item_scraped_count': 50, + u'log_count/DEBUG': 157, + u'log_count/INFO': 1365, + u'log_count/WARNING': 3, + u'memusage/max': 182988800, + u'memusage/startup': 62439424, + ... + +Anything can be stored in metadata, here is example how to add tags:: + + >>> job.update_metadata({'tags': 'obsolete'}) + +Jobs +^^^^ + +To iterate through all jobs metadata per project (descending order):: + + >>> jobs_metadata = project.jobq.list() + >>> [j['key'] for j in jobs_metadata] + ['1111111/1/3', '1111111/1/2', '1111111/1/1'] + +Jobq metadata fieldset is less detailed, than ``job.metadata``, but contains few new fields as well. +Additional fields can be requested using the ``jobmeta`` parameter. +If it used, then it's up to the user to list all the required fields, so only few default fields would be added except requested ones:: + + >>> metadata = next(project.jobq.list()) + >>> metadata.get('spider', 'missing') + u'foo' + >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by']) + >>> metadata = next(jobs_metadata) + >>> metadata.get('scheduled_by', 'missing') + u'John' + >>> metadata.get('spider', 'missing') + missing + +By default ``jobq.list()`` returns maximum last 1000 results. Pagination is available using the ``start`` parameter:: + + >>> jobs_metadata = project.jobq.list(start=1000) + +There are several filters like spider, state, has_tag, lacks_tag, startts and endts. +To get jobs filtered by tags:: + + >>> jobs_metadata = project.jobq.list(has_tag=['new', 'verified'], lacks_tag='obsolete') + +List of tags has ``OR`` power, so in the case above jobs with 'new' or 'verified' tag are expected. + +To get certain number of last finished jobs per some spider:: + + >>> jobs_metadata = project.jobq.list(spider='foo', state='finished', count=3) + +There are 4 possible job states, which can be used as values for filtering by state: + +- pending +- running +- finished +- deleted + + +Items +^^^^^ + +To iterate through items:: + + >>> items = job.items.iter_values() + >>> for item in items: + ... # do something, item is just a dict + +Logs +^^^^ + +To iterate through 10 first logs for example:: + + >>> logs = job.logs.iter_values(count=10) + >>> for log in logs: + ... # do something, log is a dict with log level, message and time keys + +Collections +^^^^^^^^^^^ + +Let's store hash and timestamp pair for foo spider. Usual workflow with `Collections`_ would be:: + + >>> collections = project.collections + >>> foo_store = collections.new_store('foo_store') + >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) + >>> foo_store.count() + 1 + >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') + {u'value': u'1447221694537'} + >>> # iterate over _key & value pair + ... list(foo_store.iter_values()) + [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] + >>> # filter by multiple keys - only values for keys that exist will be returned + ... 
list(foo_store.iter_values(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])) + [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] + >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') + >>> foo_store.count() + 0 + +Frontier +^^^^^^^^ + +Typical workflow with `Frontier`_:: + + >>> frontier = project.frontier + +Add a request to the frontier:: + + >>> frontier.add('test', 'example.com', [{'fp': '/some/path.html'}]) + >>> frontier.flush() + >>> frontier.newcount + 1 + +Add requests with additional parameters:: + + >>> frontier.add('test', 'example.com', [{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) + >>> frontier.flush() + >>> frontier.newcount + 2 + +To delete the slot ``example.com`` from the frontier:: + + >>> frontier.delete_slot('test', 'example.com') + +To retrieve requests for a given slot:: + + >>> reqs = frontier.read('test', 'example.com') + +To delete a batch of requests:: + + >>> frontier.delete('test', 'example.com', '00013967d8af7b0001') + +To retrieve fingerprints for a given slot:: + + >>> fps = [req['requests'] for req in frontier.read('test', 'example.com')] + + +Module contents +--------------- + +.. automodule:: scrapinghub.hubstorage + :members: + :undoc-members: + :show-inheritance: + +.. _scrapinghub.ScrapinghubClient: ../client/overview.html diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..f7dfb3e2 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=python-scrapinghub + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 00000000..426e0475 --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,85 @@ +Quickstart +========== + +Requirements +------------ + +* Python 2.7 or above + + +Installation +------------ + +The quick way:: + + pip install scrapinghub + +You can also install the library with `MessagePack`_ support, it provides better +response time and improved bandwidth usage:: + + pip install scrapinghub[msgpack] + + +Basic usage +----------- + +Instantiate new client:: + + >>> from scrapinghub import ScrapinghubClient + >>> client = ScrapinghubClient('APIKEY') + +Work with your projects:: + + >>> client.projects.list() + [123, 456] + +Run new jobs from the client:: + + >>> project = client.get_project(123) + >>> project.jobs.run('spider1', job_args={'arg1': 'val1'}) + > + +Access your jobs data:: + + >>> job = client.get_job('123/1/2') + >>> for item in job.items(): + ... print(item) + { + 'name': ['Some other item'], + 'url': 'http://some-url/other-item.html', + 'size': 35000, + } + +Many more features `are awaiting`_ for you. 
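+
+Before diving deeper, here is one more illustrative sketch. It reuses the
+``client`` instance and the job key from the examples above (the key itself
+is just an example) and shows that job metadata and logs are reachable
+through the same interface::
+
+    >>> job = client.get_job('123/1/2')
+    >>> job.metadata.get('state')
+    'finished'
+    >>> for logitem in job.logs.iter(count=10):
+    ...     print(logitem['message'])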
+ + +Tests +----- + +The package is covered with integration tests based on `VCR.py`_ library: there +are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP +requests to real services, it helps to simplify and speed up development. + +By default, tests use VCR.py ``once`` mode to: + +- replay previously recorded interactions. +- record new interactions if there is no cassette file. +- cause an error to be raised for new requests if there is a cassette file. + +It means that if you add new integration tests and run all tests as usual, +only new cassettes will be created, all existing cassettes will stay unmodified. + +To ignore existing cassettes and use real services, please provide a flag:: + + py.test --ignore-cassettes + +If you want to update/recreate all the cassettes from scratch, please use:: + + py.test --update-cassettes + +Note that internally the above command erases the whole folder with cassettes. + + +.. _MessagePack: https://en.wikipedia.org/wiki/MessagePack +.. _are awaiting: client/overview.html +.. _VCR.py: https://pypi.python.org/pypi/vcrpy diff --git a/requirements-docs.txt b/requirements-docs.txt new file mode 100644 index 00000000..8284e0b7 --- /dev/null +++ b/requirements-docs.txt @@ -0,0 +1,3 @@ +-r requirements.txt +sphinx==1.5.3 +sphinx_rtd_theme==0.2.4 diff --git a/scrapinghub/__init__.py b/scrapinghub/__init__.py index b1b67292..c1240ce7 100644 --- a/scrapinghub/__init__.py +++ b/scrapinghub/__init__.py @@ -1,7 +1,7 @@ __all__ = ["APIError", "Connection", "HubstorageClient", "ScrapinghubClient", "ScrapinghubAPIError", "DuplicateJobError", "BadRequest", "NotFound", - "Unauthorized", "ValueTooLarge"] + "Unauthorized", "ValueTooLarge", "ServerError"] import pkgutil __version__ = pkgutil.get_data(__package__, 'VERSION') @@ -19,4 +19,5 @@ NotFound, Unauthorized, ValueTooLarge, + ServerError, ) diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index b201b12b..5e9fbafa 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -1,9 +1,8 @@ from scrapinghub import Connection as _Connection from scrapinghub import HubstorageClient as _HubstorageClient +from .exceptions import _wrap_http_errors from .projects import Projects -from .exceptions import wrap_http_errors - from .utils import parse_auth from .utils import parse_project_id, parse_job_key @@ -13,14 +12,14 @@ class Connection(_Connection): - @wrap_http_errors + @_wrap_http_errors def _request(self, *args, **kwargs): return super(Connection, self)._request(*args, **kwargs) class HubstorageClient(_HubstorageClient): - @wrap_http_errors + @_wrap_http_errors def request(self, *args, **kwargs): return super(HubstorageClient, self).request(*args, **kwargs) @@ -31,9 +30,10 @@ class ScrapinghubClient(object): :param auth: Scrapinghub APIKEY or other SH auth credentials. :param dash_endpoint: (optional) Scrapinghub Dash panel url. :param \*\*kwargs: (optional) Additional arguments for - :class:`scrapinghub.hubstorage.HubstorageClient` constructor. + :class:`~scrapinghub.hubstorage.HubstorageClient` constructor. - :ivar projects: projects collection, :class:`Projects` instance. + :ivar projects: projects collection, + :class:`~scrapinghub.client.projects.Projects` instance. Usage:: @@ -52,13 +52,14 @@ def __init__(self, auth=None, dash_endpoint=None, **kwargs): self._hsclient = HubstorageClient(auth=(login, password), **kwargs) def get_project(self, project_id): - """Get :class:`Project` instance with a given project id. 
+ """Get :class:`scrapinghub.client.projects.Project` instance with + a given project id. The method is a shortcut for client.projects.get(). :param project_id: integer or string numeric project id. - :return: :class:`Project` object. - :rtype: scrapinghub.client.projects.Project + :return: a project instance. + :rtype: :class:`~scrapinghub.client.projects.Project` Usage:: @@ -69,12 +70,12 @@ def get_project(self, project_id): return self.projects.get(parse_project_id(project_id)) def get_job(self, job_key): - """Get Job with a given job key. + """Get :class:`~scrapinghub.client.jobs.Job` with a given job key. - :param job_key: job key string in format 'project_id/spider_id/job_id', + :param job_key: job key string in format ``project_id/spider_id/job_id``, where all the components are integers. - :return: :class:`Job` object. - :rtype: scrapinghub.client.jobs.Job + :return: a job instance. + :rtype: :class:`~scrapinghub.client.jobs.Job` Usage:: diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 40f02fc1..b5d1777f 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,17 +1,18 @@ from __future__ import absolute_import -from .utils import _Proxy -from .utils import parse_job_key +from .proxy import _Proxy +from .utils import parse_job_key, update_kwargs class Activity(_Proxy): """Representation of collection of job activity events. - Not a public constructor: use :class:`Project` instance to get a - :class:`Activity` instance. See :attr:`Project.activity` attribute. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance to get a :class:`~scrapinghub.client.activity.Activity` instance. + See :attr:`~scrapinghub.client.projects.Project.activity` attribute. - Please note that list() method can use a lot of memory and for a large - amount of activities it's recommended to iterate through it via iter() + Please note that :meth:`list` method can use a lot of memory and for a large + amount of activities it's recommended to iterate through it via :meth:`iter` method (all params and available filters are same for both methods). Usage: @@ -30,23 +31,29 @@ class Activity(_Proxy): - post a new event:: >>> event = {'event': 'job:completed', - 'job': '123/2/4', - 'user': 'jobrunner'} + ... 'job': '123/2/4', + ... 'user': 'jobrunner'} >>> project.activity.add(event) - post multiple events at once:: >>> events = [ - {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'}, - {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, - ] + ... {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'}, + ... {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, + ... ] >>> project.activity.add(events) """ - def __init__(self, *args, **kwargs): - super(Activity, self).__init__(*args, **kwargs) - self._proxy_methods([('iter', 'list')]) - self._wrap_iter_methods(['iter']) + def iter(self, count=None, **params): + """Iterate over activity events. + + :param count: limit amount of elements. + :return: a generator object over a list of activity event dicts. + :rtype: :class:`types.GeneratorType[dict]` + """ + update_kwargs(params, count=count) + params = self._modify_iter_params(params) + return self._origin.list(**params) def add(self, values, **kwargs): """Add new event to the project activity. 
diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index a2d5e22f..10f78e2e 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,16 +5,16 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .utils import ( - _Proxy, format_iter_filters, proxy_methods, wrap_kwargs, update_kwargs, -) +from .proxy import _Proxy, _DownloadableProxyMixin +from .utils import update_kwargs -class Collections(_Proxy): +class Collections(_Proxy, _DownloadableProxyMixin): """Access to project collections. - Not a public constructor: use :class:`Project` instance to get a - :class:`Collections` instance. See :attr:`Project.collections` attribute. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance to get a :class:`Collections` instance. + See :attr:`~scrapinghub.client.projects.Project.collections` attribute. Usage:: @@ -27,10 +27,10 @@ class Collections(_Proxy): def get(self, type_, name): """Base method to get a collection with a given type and name. - :param type_: a collection type string. + :param `type_`: a collection type string. :param name: a collection name string. - :return: :class:`Collection` object. - :rtype: Collection + :return: a collection object. + :rtype: :class:`Collection` """ self._origin._validate_collection(type_, name) return Collection(self._client, self, type_, name) @@ -39,8 +39,8 @@ def get_store(self, name): """Method to get a store collection by name. :param name: a collection name string. - :return: :class:`Collection` object. - :rtype: Collection + :return: a collection object. + :rtype: :class:`Collection` """ return self.get('s', name) @@ -50,8 +50,8 @@ def get_cached_store(self, name): The collection type means that items expire after a month. :param name: a collection name string. - :return: :class:`Collection` object. - :rtype: Collection + :return: a collection object. + :rtype: :class:`Collection` """ return self.get('cs', name) @@ -61,8 +61,8 @@ def get_versioned_store(self, name): The collection type retains up to 3 copies of each item. :param name: a collection name string. - :return: :class:`Collection` object. - :rtype: Collection + :return: a collection object. + :rtype: :class:`Collection` """ return self.get('vs', name) @@ -72,8 +72,8 @@ def get_versioned_cached_store(self, name): Multiple copies are retained, and each one expires after a month. :param name: a collection name string. - :return: :class:`Collection` object. - :rtype: Collection + :return: a collection object. + :rtype: :class:`Collection` """ return self.get('vcs', name) @@ -82,7 +82,7 @@ def iter(self): :return: an iterator over collections list where each collection is represented by a dictionary with ('name','type') fields. - :rtype: collections.Iterable[dict] + :rtype: :class:`collections.Iterable[dict]` """ return self._origin.apiget('list') @@ -91,7 +91,7 @@ def list(self): :return: a list of collections where each collection is represented by a dictionary with ('name','type') fields. - :rtype: list[dict] + :rtype: :class:`list[dict]` """ return list(self.iter()) @@ -108,7 +108,7 @@ class Collection(object): - add a new item to collection:: >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', - 'value': '1447221694537'}) + ... 'value': '1447221694537'}) - count items in collection:: @@ -128,7 +128,7 @@ class Collection(object): - iterate iterate over _key & value pair:: >>> for elem in foo_store.iter(count=1)): - >>> ... print(elem) + ... 
print(elem) [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}] - filter by multiple keys, only values for keys that exist will be returned:: @@ -143,43 +143,8 @@ class Collection(object): def __init__(self, client, collections, type_, name): self._client = client + self._collections = collections self._origin = _Collection(type_, name, collections._origin) - proxy_methods(self._origin, self, [ - 'create_writer', 'count', - ('iter', 'iter_values'), - ('iter_raw_json', 'iter_json'), - ]) - # simplified version of _Proxy._wrap_iter_methods logic - # to provide better support for filter param in iter methods - for method in ['iter', 'iter_raw_json']: - wrapped = wrap_kwargs(getattr(self, method), format_iter_filters) - setattr(self, method, wrapped) - - def list(self, key=None, prefix=None, prefixcount=None, startts=None, - endts=None, requests_params=None, **params): - """Convenient shortcut to list iter results. - - Please note that list() method can use a lot of memory and for a large - amount of elements it's recommended to iterate through it via iter() - method (all params and available filters are same for both methods). - - :param key: a string key or a list of keys to filter with. - :param prefix: a string prefix to filter items. - :param prefixcount: maximum number of values to return per prefix. - :param startts: UNIX timestamp at which to begin results. - :param endts: UNIX timestamp at which to end results. - :param requests_params: (optional) a dict with optional requests params. - :param \*\*params: (optional) additional query params for the request. - :return: a list of items where each item is represented with a dict. - :rtype: list[dict] - - # FIXME there should be similar docstrings for iter/iter_raw_json - # but as we proxy them as-is, it's not in place, should be improved - """ - update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, - startts=startts, endts=endts, - requests_params=requests_params) - return list(self.iter(requests_params=None, **params)) def get(self, key, **params): """Get item from collection by key. @@ -187,7 +152,7 @@ def get(self, key, **params): :param key: string item key. :param \*\*params: (optional) additional query params for the request. :return: an item dictionary if exists. - :rtype: dict + :rtype: :class:`dict` """ if key is None: raise ValueError("key cannot be None") @@ -198,7 +163,7 @@ def set(self, value): :param value: a dict representing a collection item. - The method returns None (original method returns an empty generator). + The method returns ``None`` (original method returns an empty generator). """ self._origin.set(value) @@ -207,7 +172,7 @@ def delete(self, keys): :param keys: a single key or a list of keys. - The method returns None (original method returns an empty generator). + The method returns ``None`` (original method returns an empty generator). """ if (not isinstance(keys, string_types) and not isinstance(keys, collections.Iterable)): @@ -215,11 +180,19 @@ def delete(self, keys): "object providing string keys") self._origin.delete(keys) - def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, - startts=None, endts=None, requests_params=None, - **params): - """A method to iterate through raw msgpack-ed items. - Can be convenient if data is needed in same msgpack format. + def count(self, *args, **kwargs): + """Count collection items with a given filters. + + :return: amount of elements in collection. 
+ :rtype: :class:`int` + """ + # TODO describe allowable params + return self._origin._collections.count( + self._origin.coltype, self._origin.colname, *args, **kwargs) + + def iter(self, key=None, prefix=None, prefixcount=None, startts=None, + endts=None, requests_params=None, **params): + """A method to iterate through collection items. :param key: a string key or a list of keys to filter with. :param prefix: a string prefix to filter items. @@ -228,11 +201,61 @@ def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, :param endts: UNIX timestamp at which to end results. :param requests_params: (optional) a dict with optional requests params. :param \*\*params: (optional) additional query params for the request. - :return: an iterator over items list packed with msgpack. - :rtype: collections.Iterable[bytes] + :return: an iterator over items list. + :rtype: :class:`collections.Iterable[dict]` """ update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) - return self._origin._collections.iter_msgpack( + params = self._collections._modify_iter_params(params) + return self._origin._collections.iter_values( self._origin.coltype, self._origin.colname, **params) + + def list(self, key=None, prefix=None, prefixcount=None, startts=None, + endts=None, requests_params=None, **params): + """Convenient shortcut to list iter results. + + Please note that :meth:`list` method can use a lot of memory and for a + large amount of logs it's recommended to iterate through it + via :meth:`iter` method (all params and available filters are same for + both methods). + + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. + :return: a list of items where each item is represented with a dict. + :rtype: :class:`list[dict]` + """ + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts) + return list(self.iter(requests_params=requests_params, **params)) + + def create_writer(self, start=0, auth=None, size=1000, interval=15, + qsize=None, content_encoding='identity', + maxitemsize=1024 ** 2, callback=None): + """Create a new writer for a collection. + + :param start: (optional) initial offset for writer thread. + :param auth: (optional) set auth credentials for the request. + :param size: (optional) set initial queue size. + :param interval: (optional) set interval for writer thread. + :param qsize: (optional) setup max queue size for the writer. + :param content_encoding: (optional) set different Content-Encoding header. + :param maxitemsize: (optional) max item size in bytes. + :param callback: (optional) some callback function. + :return: a new writer object. + :rtype: :class:`scrapinghub.hubstorage.batchuploader._BatchWriter` + + If provided - calllback shouldn't try to inject more items in the queue, + otherwise it can lead to deadlocks. 
+ """ + kwargs = {} + update_kwargs(kwargs, start=start, auth=auth, size=size, interval=interval, + qsize=qsize, content_encoding=content_encoding, + maxitemsize=maxitemsize, callback=callback) + return self._origin._collections.create_writer( + self._origin.coltype, self._origin.colname, **kwargs) diff --git a/scrapinghub/client/exceptions.py b/scrapinghub/client/exceptions.py index 4c014d35..6a4b405c 100644 --- a/scrapinghub/client/exceptions.py +++ b/scrapinghub/client/exceptions.py @@ -5,7 +5,6 @@ from requests import HTTPError from ..legacy import APIError -from ..hubstorage import ValueTooLarge as _ValueTooLarge def _get_http_error_msg(exc): @@ -24,6 +23,7 @@ def _get_http_error_msg(exc): class ScrapinghubAPIError(Exception): + """Base exception class.""" def __init__(self, message=None, http_error=None): self.http_error = http_error @@ -33,30 +33,31 @@ def __init__(self, message=None, http_error=None): class BadRequest(ScrapinghubAPIError): - pass + """Usually raised in case of 400 response from API.""" class Unauthorized(ScrapinghubAPIError): - pass + """Request lacks valid authentication credentials for the target resource.""" class NotFound(ScrapinghubAPIError): - pass + """Entity doesn't exist (e.g. spider or project).""" class ValueTooLarge(ScrapinghubAPIError): - pass + """Value cannot be writtent because it exceeds size limits.""" class DuplicateJobError(ScrapinghubAPIError): - pass + """Job for given spider with given arguments is already scheduled or running.""" class ServerError(ScrapinghubAPIError): - pass + """Indicates some server error: something unexpected has happened.""" -def wrap_http_errors(method): +def _wrap_http_errors(method): + """Internal helper to handle exceptions gracefully.""" @wraps(method) def wrapped(*args, **kwargs): try: @@ -90,13 +91,3 @@ def wrapped(*args, **kwargs): raise ServerError(http_error=exc) raise ScrapinghubAPIError(msg) return wrapped - - -def wrap_value_too_large(method): - @wraps(method) - def wrapped(*args, **kwargs): - try: - return method(*args, **kwargs) - except _ValueTooLarge as exc: - raise ValueTooLarge(str(exc)) - return wrapped diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index bf2b460b..72f4edd4 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -7,7 +7,8 @@ from ..hubstorage.frontier import Frontier as _Frontier from ..hubstorage.utils import urlpathjoin -from .utils import _Proxy, update_kwargs +from .proxy import _Proxy +from .utils import update_kwargs class _HSFrontier(_Frontier): @@ -22,7 +23,7 @@ def _get_writer(self, frontier, slot): callback to write newcount data per slot. :return: a batchuploader writer instance. - :rtype: scrapinghub.hubstorage.batchuploader._BatchWriter + :rtype: :class:`~scrapinghub.hubstorage.batchuploader._BatchWriter` """ key = (frontier, slot) writer = self._writers.get(key) @@ -41,14 +42,16 @@ def _get_writer(self, frontier, slot): return writer def _writer_callback(self, key, response): + """Writer callback function when new batch is added.""" self.newcount[key] += response.json()["newcount"] class Frontiers(_Proxy): """Frontiers collection for a project. - Not a public constructor: use :class:`Project` instance to get a - :class:`Frontiers` instance. See :attr:`Project.frontiers` attribute. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance to get a :class:`Frontiers` instance. + See :attr:`~scrapinghub.client.Project.frontiers` attribute. 
Usage: @@ -82,14 +85,13 @@ class Frontiers(_Proxy): """ def __init__(self, *args, **kwargs): super(Frontiers, self).__init__(*args, **kwargs) - self._proxy_methods(['close', 'flush']) def get(self, name): """Get a frontier by name. :param name: a frontier name string. - :return: class:`Frontier` instance. - :rtype: Frontier + :return: a frontier instance. + :rtype: :class:`Frontier` """ return Frontier(self._client, self, name) @@ -97,7 +99,7 @@ def iter(self): """Iterate through frontiers. :return: an iterator over frontiers names. - :rtype: collections.Iterable[str] + :rtype: :class:`collections.Iterable[str]` """ return iter(self.list()) @@ -105,14 +107,23 @@ def list(self): """List frontiers names. :return: a list of frontiers names. - :rtype: list[str] + :rtype: :class:`list[str]` """ return next(self._origin.apiget('list')) @property def newcount(self): + """Integer amount of new entries added to all frontiers.""" return sum(self._origin.newcount.values()) + def flush(self): + """Flush data in all frontiers writer threads.""" + self._origin.flush() + + def close(self): + """Close frontier writer threads one-by-one.""" + self._origin.close() + class Frontier(object): """Representation of a frontier object. @@ -154,8 +165,8 @@ def __init__(self, client, frontiers, name): def get(self, slot): """Get a slot by name. - :return: class:`FrontierSlot` instance. - :rtype: FrontierSlot + :return: a frontier slot instance. + :rtype: :class:`FrontierSlot` """ return FrontierSlot(self._client, self, slot) @@ -163,7 +174,7 @@ def iter(self): """Iterate through slots. :return: an iterator over frontier slots names. - :rtype: collections.Iterate[str] + :rtype: :class:`collections.Iterable[str]` """ return iter(self.list()) @@ -171,7 +182,7 @@ def list(self): """List all slots. :return: a list of frontier slots names. - :rtype: list[str] + :rtype: :class:`list[str]` """ return next(self._frontiers._origin.apiget((self.key, 'list'))) @@ -184,6 +195,7 @@ def flush(self): @property def newcount(self): + """Integer amount of new entries added to frontier.""" newcount_values = self._frontiers._origin.newcount return sum(v for (frontier, _), v in newcount_values.items() if frontier == self.key) @@ -249,8 +261,8 @@ def __init__(self, client, frontier, slot): def f(self): """Shortcut to have quick access to slot fingerprints. - :return: class:`FrontierSlotFingerprints` instance. - :rtype: FrontierSlotFingerprints + :return: fingerprints collection for the slot. + :rtype: :class:`FrontierSlotFingerprints` """ return self.fingerprints @@ -258,8 +270,8 @@ def f(self): def q(self): """Shortcut to have quick access to a slot queue. - :return: class:`FrontierSlotQueue` instance. - :rtype: FrontierSlotQueue + :return: queue instance for the slot. + :rtype: :class:`FrontierSlotQueue` """ return self.queue @@ -278,11 +290,13 @@ def flush(self): @property def newcount(self): + """Integer amount of new entries added to slot.""" newcount_values = self._frontier._frontiers._origin.newcount return newcount_values.get((self._frontier.key, self.key), 0) class FrontierSlotFingerprints(object): + """Representation of request fingerprints collection stored in slot.""" def __init__(self, slot): self.key = slot.key @@ -290,6 +304,10 @@ def __init__(self, slot): self._slot = slot def add(self, fps): + """Add new fingerprints to slot. + + :param fps: a list of string fingerprints to add. 
+ """ origin = self._frontier._frontiers._origin writer = origin._get_writer(self._frontier.key, self.key) fps = list(fps) if not isinstance(fps, list) else fps @@ -303,7 +321,7 @@ def iter(self, **params): :param \*\*params: (optional) additional query params for the request. :return: an iterator over fingerprints. - :rtype: collections.Iterable[str] + :rtype: :class:`collections.Iterable[str]` """ origin = self._frontier._frontiers._origin path = (self._frontier.key, 's', self.key, 'f') @@ -315,12 +333,13 @@ def list(self, **params): :param \*\*params: (optional) additional query params for the request. :return: a list of fingerprints. - :rtype: list[str] + :rtype: :class:`list[str]` """ return list(self.iter(**params)) class FrontierSlotQueue(object): + """Representation of request batches queue stored in slot.""" def __init__(self, slot): self.key = slot.key @@ -339,7 +358,7 @@ def iter(self, mincount=None, **params): :param \*\*params: (optional) additional query params for the request. :return: an iterator over request batches in the queue where each batch is represented with a dict with ('id', 'requests') field. - :rtype: collections.Iterable[dict] + :rtype: :class:`collections.Iterable[dict]` """ origin = self._frontier._frontiers._origin path = (self._frontier.key, 's', self.key, 'q') @@ -353,7 +372,7 @@ def list(self, mincount=None, **params): :param \*\*params: (optional) additional query params for the request. :return: a list of request batches in the queue where each batch is represented with a dict with ('id', 'requests') field. - :rtype: list[dict] + :rtype: :class:`list[dict]` """ return list(self.iter(mincount=mincount, **params)) diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index a127e2e0..c3d5828a 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -1,17 +1,19 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin -class Items(_Proxy): +class Items(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job items. - Not a public constructor: use :class:`Job` instance to get a :class:`Items` - instance. See :attr:`Job.items` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` + instance to get a :class:`Items` instance. See + :attr:`~scrapinghub.client.jobs.Job.items` attribute. - Please note that list() method can use a lot of memory and for a large - amount of items it's recommended to iterate through it via iter() method - (all params and available filters are same for both methods). + Please note that :meth:`list` method can use a lot of memory and for + a large amount of logs it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for + both methods). Usage: @@ -23,7 +25,7 @@ class Items(_Proxy): - iterate through first 100 items and print them:: >>> for log in job.logs.iter(count=100): - >>> ... print(log) + ... print(log) - retrieve items with timestamp greater or equal to given timestamp (item here is an arbitrary dictionary depending on your code):: @@ -50,7 +52,7 @@ def _modify_iter_params(self, params): """Modify iter filter to convert offset to start parameter. :return: a dict with updated set of params. 
- :rtype: dict + :rtype: :class:`dict` """ params = super(Items, self)._modify_iter_params(params) offset = params.pop('offset', None) diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index 43837216..f6813976 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -1,5 +1,4 @@ from __future__ import absolute_import -import json from ..hubstorage.job import JobMeta as _JobMeta from ..hubstorage.job import Items as _Items @@ -12,20 +11,20 @@ from .requests import Requests from .samples import Samples from .exceptions import NotFound, BadRequest, DuplicateJobError -from .utils import ( - _MappingProxy, get_tags_for_update, parse_job_key, update_kwargs, -) +from .proxy import _MappingProxy +from .utils import get_tags_for_update, parse_job_key, update_kwargs class Jobs(object): """Class representing a collection of jobs for a project/spider. - Not a public constructor: use :class:`Project` instance or :class:`Spider` - instance to get a :class:`Jobs` instance. See :attr:`Project.jobs` and - :attr:`Spider.jobs` attributes. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance or :class:`~scrapinghub.client.spiders.Spider` instance to get + a :class:`Jobs` instance. See :attr:`scrapinghub.client.projects.Project.jobs` + and :attr:`scrapinghub.client.spiders.Spider.jobs` attributes. :ivar project_id: a string project id. - :ivar spider: :class:`Spider` object if defined. + :ivar spider: :class:`~scrapinghub.client.spiders.Spider` object if defined. Usage:: @@ -59,7 +58,7 @@ def count(self, spider=None, state=None, has_tag=None, lacks_tag=None, :param \*\*params: (optional) other filter params. :return: jobs count. - :rtype: int + :rtype: :class:`int` Usage:: @@ -98,7 +97,7 @@ def iter(self, count=None, start=None, spider=None, state=None, :return: a generator object over a list of dictionaries of jobs summary for a given filter params. - :rtype: types.GeneratorType[dict] + :rtype: :class:`types.GeneratorType[dict]` Usage: @@ -113,16 +112,16 @@ def iter(self, count=None, start=None, spider=None, state=None, >>> [job['key'] for job in jobs_summary] ['123/1/3', '123/1/2', '123/1/1'] - - job summary fieldset is less detailed than job.metadata but - contains few new fields as well. Additional fields can be requested - using ``meta`` parameter. If it's used, then it's up to the user - to list all the required fields, so only few default fields would - be added except requested ones:: + - job summary fieldset is less detailed than :class:`JobMeta` but + contains a few new fields as well. Additional fields can be requested + using ``meta`` parameter. If it's used, then it's up to the user to + list all the required fields, so only few default fields would be + added except requested ones:: >>> jobs_summary = project.jobs.iter(meta=['scheduled_by', ]) - by default :meth:`Jobs.iter` returns maximum last 1000 results. - Pagination is available using start parameter:: + Pagination is available using start parameter:: >>> jobs_summary = spider.jobs.iter(start=1000) @@ -164,11 +163,11 @@ def list(self, count=None, start=None, spider=None, state=None, field name or a list of field names to return. :param \*\*params: (optional) other filter params. - :return: list of dictionaries of jobs summary for a given filter params - :rtype: list[dict] + :return: list of dictionaries of jobs summary for a given filter params. 
+ :rtype: :class:`list[dict]` - Please note that list() method can use a lot of memory and for a large - amount of jobs it's recommended to iterate through it via iter() + Please note that :meth:`list` can use a lot of memory and for a large + amount of logs it's recommended to iterate through it via :meth:`iter` method (all params and available filters are same for both methods). """ # FIXME we double-check the params here, is there a better way? @@ -196,7 +195,7 @@ def run(self, spider=None, units=None, priority=None, meta=None, :param \*\*params: (optional) additional keyword args. :return: a job key string pointing to the new job. - :rtype: str + :rtype: :class:`str` Usage:: @@ -228,16 +227,17 @@ def run(self, spider=None, units=None, priority=None, meta=None, return Job(self._client, response['jobid']) def get(self, job_key): - """Get a Job with a given job_key. + """Get a :class:`Job` with a given job_key. :param job_key: a string job key. job_key's project component should match the project used to get :class:`Jobs` instance, and job_key's spider component should match - the spider (if :attr:`Spider.jobs` was used). + the spider (if :class:`~scrapinghub.client.spiders.Spider` was used + to get :class:`Jobs` instance). - :return: :class:`Job` object. - :rtype: scrapinghub.client.jobs.Job + :return: a job object. + :rtype: :class:`Job` Usage:: @@ -256,12 +256,12 @@ def summary(self, state=None, spider=None, **params): """Get jobs summary (optionally by state). :param state: (optional) a string state to filter jobs. - :param spider: (optional) a spider name - (not needed if instantiated with :cls:`Spider`). + :param spider: (optional) a spider name (not needed if instantiated + with :class:`~scrapinghub.client.spiders.Spider`). :param \*\*params: (optional) additional keyword args. :return: a list of dictionaries of jobs summary for a given filter params grouped by job state. - :rtype: list[dict] + :rtype: :class:`list[dict]` Usage:: @@ -284,12 +284,12 @@ def iter_last(self, start=None, start_after=None, count=None, :param start: (optional) :param start_after: (optional) :param count: (optional) - :param spider: (optional) a spider name - (not needed if instantiated with :cls:`Spider`). + :param spider: (optional) a spider name (not needed if instantiated + with :class:`~scrapinghub.client.spiders.Spider`). :param \*\*params: (optional) additional keyword args. :return: a generator object over a list of dictionaries of jobs summary for a given filter params. - :rtype: types.GeneratorType[dict] + :rtype: :class:`types.GeneratorType[dict]` Usage: @@ -342,7 +342,7 @@ def update_tags(self, add=None, remove=None, spider=None): have to specify ``spider`` param when using :attr:`Project.jobs`). :return: amount of jobs that were updated. - :rtype: int + :rtype: :class:`int` Usage: @@ -372,21 +372,22 @@ def update_tags(self, add=None, remove=None, spider=None): class Job(object): """Class representing a job object. - Not a public constructor: use :class:`ScrapinghubClient` instance or - :class:`Jobs` instance to get a :class:`Job` instance. See - :meth:`ScrapinghubClient.get_job` and :meth:`Jobs.get` methods. + Not a public constructor: use :class:`~scrapinghub.client.ScrapinghubClient` + instance or :class:`Jobs` instance to get a :class:`Job` instance. See + :meth:`scrapinghub.client.ScrapinghubClient.get_job` and :meth:`Jobs.get` + methods. :ivar project_id: integer project id. :ivar key: a job key. - :ivar items: :class:`Items` resource object. - :ivar logs: :class:`Logs` resource object. 
- :ivar requests: :class:`Requests` resource object. - :ivar samples: :class:`Samples` resource object. - :ivar metadata: :class:`Metadata` resource. + :ivar items: :class:`~scrapinghub.client.items.Items` resource object. + :ivar logs: :class:`~scrapinghub.client.logs.Logs` resource object. + :ivar requests: :class:`~scrapinghub.client.requests.Requests` resource object. + :ivar samples: :class:`~scrapinghub.client.samples.Samples` resource object. + :ivar metadata: :class:`JobMeta` resource object. Usage:: - >>> job = project.job('123/1/2') + >>> job = project.jobs.get('123/1/2') >>> job.key '123/1/2' >>> job.metadata.get('state') @@ -437,7 +438,7 @@ def start(self, **params): :param \*\*params: (optional) keyword meta parameters to update. :return: a previous string job state. - :rtype: str + :rtype: :class:`str` Usage:: @@ -451,7 +452,7 @@ def finish(self, **params): :param \*\*params: (optional) keyword meta parameters to update. :return: a previous string job state. - :rtype: str + :rtype: :class:`str` Usage:: @@ -465,7 +466,7 @@ def delete(self, **params): :param \*\*params: (optional) keyword meta parameters to update. :return: a previous string job state. - :rtype: str + :rtype: :class:`str` Usage:: @@ -480,7 +481,7 @@ def update(self, state, **params): :param state: a new job state. :param \*\*params: (optional) keyword meta parameters to update. :return: a previous string job state. - :rtype: str + :rtype: :class:`str` Usage:: @@ -509,7 +510,7 @@ class JobMeta(_MappingProxy): """Class representing job metadata. Not a public constructor: use :class:`Job` instance to get a - :class:`Jobmeta` instance. See :attr:`Job.metadata` attribute. + :class:`JobMeta` instance. See :attr:`~Job.metadata` attribute. Usage: @@ -539,7 +540,7 @@ class JobMeta(_MappingProxy): - update multiple meta fields at once - >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2}) + >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2'}) - delete meta field by name:: diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index ebfdfde7..2c68d800 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -1,19 +1,22 @@ from __future__ import absolute_import + import json +import logging -from .utils import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin from .utils import LogLevel -class Logs(_Proxy): +class Logs(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job logs. - Not a public constructor: use :class:`Job` instance to get a :class:`Logs` - instance. See :attr:`Job.logs` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance + to get a :class:`Logs` instance. See :attr:`~scrapinghub.client.jobs.Job.logs` + attribute. - Please note that list() method can use a lot of memory and for a large - amount of logs it's recommended to iterate through it via iter() method - (all params and available filters are same for both methods). + Please note that :meth:`list` method can use a lot of memory and for a + large amount of logs it's recommended to iterate through it via :meth:`iter` + method (all params and available filters are same for both methods). Usage: @@ -25,7 +28,7 @@ class Logs(_Proxy): - iterate through first 100 log entries and print them:: >>> for log in job.logs.iter(count=100): - >>> ... print(log) + ... 
print(log) - retrieve a single log entry from a job:: @@ -46,11 +49,36 @@ class Logs(_Proxy): 'time': 1486375511188, }] """ + def log(self, message, level=logging.INFO, ts=None, **other): + """Base method to write a log entry. + + :param message: a string message. + :param level: (optional) logging level, default to INFO. + :param ts: (optional) UNIX timestamp in milliseconds. + :param \*\*other: other optional kwargs. + """ + self._origin.log(message, level=level, ts=ts, **other) + + def debug(self, message, **other): + """Log a message with DEBUG level.""" + self._origin.debug(message, **other) + + def info(self, message, **other): + """Log a message with INFO level.""" + self._origin.info(message, **other) + + def warn(self, message, **other): + """Log a message with WARN level.""" + self._origin.warn(message, **other) + warning = warn + + def error(self, message, **other): + """Log a message with ERROR level.""" + self._origin.error(message, **other) - def __init__(self, *args, **kwargs): - super(Logs, self).__init__(*args, **kwargs) - self._proxy_methods(['log', 'debug', 'info', 'warning', 'warn', - 'error', 'batch_write_start']) + def batch_write_start(self): + """Override to set a start parameter when commencing writing.""" + return self._origin.batch_write_start() def _modify_iter_params(self, params): """Modify iter() filters on-the-fly. @@ -60,7 +88,7 @@ def _modify_iter_params(self, params): :param params: an original dictionary with params. :return: a modified dictionary with params. - :rtype: dict + :rtype: :class:`dict` """ params = super(Logs, self)._modify_iter_params(params) offset = params.pop('offset', None) diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index d8fcbf9f..35f93eae 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -8,15 +8,17 @@ from .collections import Collections from .frontiers import _HSFrontier, Frontiers from .jobs import Jobs +from .proxy import _MappingProxy from .spiders import Spiders -from .utils import _MappingProxy, parse_project_id +from .utils import parse_project_id class Projects(object): """Collection of projects available to current user. - Not a public constructor: use :class:`Scrapinghub` client instance to get - a :class:`Projects` instance. See :attr:`Scrapinghub.projects` attribute. + Not a public constructor: use :class:`~scrapinghub.client.ScrapinghubClient` + client instance to get a :class:`Projects` instance. + See :attr:`scrapinghub.client.Scrapinghub.projects` attribute. Usage:: @@ -31,8 +33,8 @@ def get(self, project_id): """Get project for a given project id. :param project_id: integer or string numeric project id. - :return: :class:`Project` object. - :rtype: scrapinghub.client.projects.Project + :return: a project object. + :rtype: :class:`Project` Usage:: @@ -46,7 +48,7 @@ def list(self): """Get list of projects available to current user. :return: a list of project ids. - :rtype: list[int] + :rtype: :class:`list[int]` Usage:: @@ -61,7 +63,7 @@ def iter(self): Provided for the sake of API consistency. :return: an iterator over project ids list. - :rtype: collections.Iterable[int] + :rtype: :class:`collections.Iterable[int]` """ return iter(self.list()) @@ -72,7 +74,7 @@ def summary(self, state=None, **params): :return: a list of dictionaries: each dictionary represents a project summary (amount of pending/running/finished jobs and a flag if it has a capacity to run new jobs). 
- :rtype: list[dict] + :rtype: :class:`list[dict]` Usage:: @@ -96,17 +98,18 @@ def summary(self, state=None, **params): class Project(object): """Class representing a project object and its resources. - Not a public constructor: use :class:`ScrapinghubClient` instance or - :class:`Projects` instance to get a :class:`Project` instance. See - :meth:`Scrapinghub.get_project` or :meth:`Projects.get` methods. + Not a public constructor: use :class:`~scrapinghub.client.ScrapinghubClient` + instance or :class:`Projects` instance to get a :class:`Project` instance. + See :meth:`scrapinghub.client.ScrapinghubClient.get_project` or + :meth:`Projects.get` methods. :ivar key: string project id. - :ivar activity: :class:`Activity` resource object. - :ivar collections: :class:`Collections` resource object. - :ivar frontiers: :class:`Frontiers` resource object. - :ivar jobs: :class:`Jobs` resource object. - :ivar settings: :class:`Settings` resource object. - :ivar spiders: :class:`Spiders` resource object. + :ivar activity: :class:`~scrapinghub.client.activity.Activity` resource object. + :ivar collections: :class:`~scrapinghub.client.collections.Collections` resource object. + :ivar frontiers: :class:`~scrapinghub.client.frontiers.Frontiers` resource object. + :ivar jobs: :class:`~scrapinghub.client.jobs.Jobs` resource object. + :ivar settings: :class:`~scrapinghub.client.settings.Settings` resource object. + :ivar spiders: :class:`~scrapinghub.client.spiders.Spiders` resource object. Usage:: @@ -174,5 +177,10 @@ class Settings(_MappingProxy): >>> project.settings.delete('job_runtime_limit') """ def set(self, key, value): + """Update project setting value by key. + + :param key: a string setting key. + :param value: new setting value. + """ # FIXME drop the method when post-by-key is implemented on server side self.update({key: value}) diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py new file mode 100644 index 00000000..6f247b0d --- /dev/null +++ b/scrapinghub/client/proxy.py @@ -0,0 +1,185 @@ +from __future__ import absolute_import + +import six +import json + +from ..hubstorage import ValueTooLarge as _ValueTooLarge +from .utils import update_kwargs +from .exceptions import ValueTooLarge + + +class _Proxy(object): + """A helper to create a class instance and proxy its methods to origin. + + The internal proxy class is useful to link class attributes from its + origin depending on the origin base class as a part of init logic: + + - :class:`~scrapinghub.hubstorage.resourcetype.ItemsResourceType` provides + items-based attributes to access items in an arbitrary collection with + get/write/flush/close/stats/iter methods. + + - :class:`~scrapinghub.hubstorage.resourcetype.DownloadableResource` provides + download-based attributes to iter through collection with or without + msgpack support. + """ + + def __init__(self, cls, client, key): + self.key = key + self._client = client + self._origin = cls(client._hsclient, key) + + def list(self, *args, **kwargs): + """Convenient shortcut to list iter results. + + Please note that :meth:`list` method can use a lot of memory and for a + large amount of elements it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for both + methods). + """ + return list(self.iter(*args, **kwargs)) + + def _modify_iter_params(self, params): + """A helper to modify iter*() params on-the-fly. + + The method is internal and should be redefined in subclasses. 
+ + :param params: a dictionary with input parameters. + :return: an updated dictionary with parameters. + :rtype: :class:`dict` + """ + return _format_iter_filters(params) + + +class _ItemsResourceProxy(_Proxy): + + def get(self, key, **params): + """Get element from collection. + + :param key: element key. + :return: a dictionary with element data. + :rtype: :class:`dict` + """ + return self._origin.get(key, **params) + + def write(self, item): + """Write new element to collection. + + :param item: element data dict to write. + """ + try: + return self._origin.write(item) + except _ValueTooLarge as exc: + raise ValueTooLarge(str(exc)) + + def iter(self, _key=None, count=None, **params): + """Iterate over elements in collection. + + :param count: limit amount of elements. + :return: a generator object over a list of element dictionaries. + :rtype: :class:`types.GeneratorType[dict]` + """ + update_kwargs(params or {}, count=count) + params = self._modify_iter_params(params) + return self._origin.list(_key, **params) + + def flush(self): + """Flush data from writer threads.""" + self._origin.flush() + + def stats(self): + """Get resource stats. + + :return: a dictionary with stats data. + :rtype: :class:`dict` + """ + return self._origin.stats() + + def close(self, block=True): + """Close writers one-by-one.""" + self._origin.close(block) + + +class _DownloadableProxyMixin(object): + + def iter(self, _path=None, count=None, requests_params=None, **apiparams): + """A general method to iterate through elements. + + :param count: limit amount of elements. + :return: an iterator over elements list. + :rtype: :class:`collections.Iterable` + """ + update_kwargs(apiparams, count=count) + apiparams = self._modify_iter_params(apiparams) + return self._origin.iter_values(_path, requests_params, **apiparams) + + +class _MappingProxy(_Proxy): + """A helper class to support basic get/set interface for dict-like + collections of elements. + """ + + def get(self, key): + """Get element value by key. + + :param key: a string key + """ + return next(self._origin.apiget(key)) + + def set(self, key, value): + """Set element value. + + :param key: a string key + :param value: new value to set for the key + """ + self._origin.apipost(key, data=json.dumps(value), is_idempotent=True) + + def update(self, values): + """Update multiple elements at once. + + The method provides convenient interface for partial updates. + + :param values: a dictionary with key/values to update. + """ + if not isinstance(values, dict): + raise TypeError("values should be a dict") + data = next(self._origin.apiget()) + data.update(values) + self._origin.apipost(jl={k: v for k, v in six.iteritems(data) + if k not in self._origin.ignore_fields}, + is_idempotent=True) + + def delete(self, key): + """Delete element by key. + + :param key: a string key + """ + self._origin.apidelete(key) + + def iter(self): + """Iterate through key/value pairs. + + :return: an iterator over key/value pairs. + :rtype: :class:`collections.Iterable` + """ + return six.iteritems(next(self._origin.apiget())) + + +def _format_iter_filters(params): + """Format iter() filter param on-the-fly. + + Support passing multiple filters at once as a list with tuples. 
+ """ + filters = params.get('filter') + if filters and isinstance(filters, list): + filter_data = [] + for elem in params.pop('filter'): + if isinstance(elem, six.string_types): + filter_data.append(elem) + elif isinstance(elem, (list, tuple)): + filter_data.append(json.dumps(elem)) + else: + raise ValueError( + "Filter condition must be string, tuple or list") + if filter_data: + params['filter'] = filter_data + return params diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 06ee1125..7f5428ef 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -1,17 +1,19 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin -class Requests(_Proxy): +class Requests(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job requests. - Not a public constructor: use :class:`Job` instance to get a - :class:`Requests` instance. See :attr:`Job.requests` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance + to get a :class:`Requests` instance. + See :attr:`~scrapinghub.client.jobs.Job.requests` attribute. - Please note that list() method can use a lot of memory and for a large - amount of requests it's recommended to iterate through it via iter() - method (all params and available filters are same for both methods). + Please note that :meth:`list` method can use a lot of memory and for + a large amount of logs it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for + both methods). Usage: @@ -39,6 +41,17 @@ class Requests(_Proxy): 'url': 'https://example.com' }] """ - def __init__(self, *args, **kwargs): - super(Requests, self).__init__(*args, **kwargs) - self._proxy_methods(['add']) + def add(self, url, status, method, rs, parent, duration, ts, fp=None): + """ Add a new requests. + + :param url: string url for the request. + :param status: HTTP status of the request. + :param method: stringified request method. + :param rs: response body length. + :param parent: parent request id or ``None``. + :param duration: request duration in milliseconds. + :param ts: UNIX timestamp in milliseconds. + :param fp: (optional) string fingerprint for the request. + """ + return self._origin.add( + url, status, method, rs, parent, duration, ts, fp=None) diff --git a/scrapinghub/client/samples.py b/scrapinghub/client/samples.py index 581d0fd7..87a8e9bc 100644 --- a/scrapinghub/client/samples.py +++ b/scrapinghub/client/samples.py @@ -1,17 +1,19 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _ItemsResourceProxy -class Samples(_Proxy): +class Samples(_ItemsResourceProxy): """Representation of collection of job samples. - Not a public constructor: use :class:`Job` instance to get a - :class:`Samples` instance. See :attr:`Job.samples` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance + to get a :class:`Samples` instance. + See :attr:`~scrapinghub.client.jobs.Job.samples` attribute. - Please note that list() method can use a lot of memory and for a large - amount of samples it's recommended to iterate through it via iter() - method (all params and available filters are same for both methods). 
+ Please note that :meth:`list` method can use a lot of memory and for + a large amount of logs it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for + both methods). Usage: diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py index 522ecb05..1d665801 100644 --- a/scrapinghub/client/spiders.py +++ b/scrapinghub/client/spiders.py @@ -2,17 +2,17 @@ from requests.compat import urljoin +from .exceptions import NotFound, _wrap_http_errors from .jobs import Jobs -from .exceptions import NotFound -from .exceptions import wrap_http_errors from .utils import get_tags_for_update class Spiders(object): """Class to work with a collection of project spiders. - Not a public constructor: use :class:`Project` instance to get - a :class:`Spiders` instance. See :attr:`Project.spiders` attribute. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance to get a :class:`Spiders` instance. + See :attr:`~scrapinghub.client.projects.Project.spiders` attribute. :ivar project_id: string project id. @@ -32,8 +32,8 @@ def get(self, spider, **params): The method gets/sets spider id (and checks if spider exists). :param spider: a string spider name. - :return: :class:`Spider` object. - :rtype: scrapinghub.client.spiders.Spider + :return: a spider object. + :rtype: :class:`scrapinghub.client.spiders.Spider` Usage:: @@ -52,7 +52,7 @@ def list(self): """Get a list of spiders for a project. :return: a list of dictionaries with spiders metadata. - :rtype: list[dict] + :rtype: :class:`list[dict]` Usage:: @@ -68,7 +68,7 @@ def iter(self): :return: an iterator over spiders list where each spider is represented as a dict containing its metadata. - :rtype: collection.Iterable[dict] + :rtype: :class:`collection.Iterable[dict]` Provided for the sake of API consistency. """ @@ -84,7 +84,7 @@ class Spider(object): :ivar project_id: a string project id. :ivar key: a string key in format 'project_id/spider_id'. :ivar name: a spider name string. - :ivar jobs: a collection of jobs, :class:`Jobs` object. + :ivar jobs: a collection of jobs, :class:`~scrapinghub.client.jobs.Jobs` object. Usage:: @@ -103,7 +103,7 @@ def __init__(self, client, project_id, spider_id, spider): self.jobs = Jobs(client, project_id, self) self._client = client - @wrap_http_errors + @_wrap_http_errors def update_tags(self, add=None, remove=None): """Update tags for the spider. @@ -117,12 +117,12 @@ def update_tags(self, add=None, remove=None): response = self._client._connection._session.patch(url, json=params) response.raise_for_status() - @wrap_http_errors + @_wrap_http_errors def list_tags(self): """List spider tags. :return: a list of spider tags. - :rtype: list[str] + :rtype: :class:`list[str]` """ path = 'v2/projects/{}/spiders/{}'.format(self.project_id, self._id) url = urljoin(self._client._connection.url, path) diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index c0ec0496..78a51292 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -8,12 +8,6 @@ import six -from ..hubstorage.resourcetype import DownloadableResource -from ..hubstorage.resourcetype import ItemsResourceType -from ..hubstorage.collectionsrt import Collections - -from .exceptions import wrap_value_too_large - class LogLevel(object): DEBUG = logging.DEBUG @@ -36,6 +30,12 @@ def __str__(self): def parse_project_id(project_id): + """Simple check for project id. + + :param project_id: a numeric project id, int or string. 
+ :return: a unified project id. + :rtype: :class:`str` + """ try: int(project_id) except ValueError: @@ -44,6 +44,12 @@ def parse_project_id(project_id): def parse_job_key(job_key): + """Inner helper to parse job key. + + :param job_key: a job key (str or tuple of 3 ints). + :return: parsed job key. + :rtype: :class:`JobKey` + """ if isinstance(job_key, tuple): parts = job_key elif isinstance(job_key, six.string_types): @@ -72,173 +78,8 @@ def get_tags_for_update(**kwargs): return params -class _Proxy(object): - """A helper to create a class instance and proxy its methods to origin. - - The internal proxy class is useful to link class attributes from its - origin depending on the origin base class as a part of init logic: - - - :class:`ItemsResourceType` provides items-based attributes to access - items in an arbitrary collection with get/write/flush/close/stats/ - iter methods. - - - :class:`DownloadableResource` provides download-based attributes to - iter through collection with or without msgpack support. - """ - - def __init__(self, cls, client, key): - self.key = key - self._client = client - self._origin = cls(client._hsclient, key) - - if issubclass(cls, ItemsResourceType): - self._proxy_methods(['get', 'write', 'flush', 'close', - 'stats', ('iter', 'list')]) - # redefine write method to wrap hubstorage.ValueTooLarge error - origin_method = getattr(self, 'write') - setattr(self, 'write', wrap_value_too_large(origin_method)) - - # DType iter_values() has more priority than IType list() - # plus Collections interface doesn't need the iter methods - if issubclass(cls, DownloadableResource) and cls is not Collections: - methods = [('iter', 'iter_values'), - ('iter_raw_msgpack', 'iter_msgpack'), - ('iter_raw_json', 'iter_json')] - self._proxy_methods(methods) - self._wrap_iter_methods([method[0] for method in methods]) - - def _proxy_methods(self, methods): - """A little helper for cleaner interface.""" - proxy_methods(self._origin, self, methods) - - def _wrap_iter_methods(self, methods): - """Modify kwargs for all passed self.iter* methods.""" - for method in methods: - wrapped = wrap_kwargs(getattr(self, method), - self._modify_iter_params) - setattr(self, method, wrapped) - - def _modify_iter_params(self, params): - """A helper to modify iter() params on-the-fly. - - The method is internal and should be redefined in subclasses. - - :param params: a dictionary with input parameters. - :return: an updated dictionary with parameters. - :rtype: dict - """ - return format_iter_filters(params) - - def list(self, *args, **kwargs): - """Convenient shortcut to list iter results. - - Please note that list() method can use a lot of memory and for a large - amount of elements it's recommended to iterate through it via iter() - method (all params and available filters are same for both methods). - """ - return list(self.iter(*args, **kwargs)) - - -class _MappingProxy(_Proxy): - """A helper class to support basic get/set interface for dict-like - collections of elements. - """ - - def get(self, key): - """Get element value by key. - - :param key: a string key - """ - return next(self._origin.apiget(key)) - - def set(self, key, value): - """Set element value. - - :param key: a string key - :param value: new value to set for the key - """ - self._origin.apipost(key, data=json.dumps(value), is_idempotent=True) - - def update(self, values): - """Update multiple elements at once. - - The method provides convenient interface for partial updates. 
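
Looping back to the two parsing helpers documented near the top of this utils.py hunk, a minimal interactive sketch — assuming, as the docstrings state, that ``parse_project_id`` normalizes the id to a string and that the returned ``JobKey`` stringifies back to the ``project/spider/job`` form::

    >>> from scrapinghub.client.utils import parse_project_id, parse_job_key
    >>> parse_project_id(123)        # also accepts '123'
    '123'
    >>> job_key = parse_job_key('123/1/2')   # a (123, 1, 2) tuple works too
    >>> str(job_key)
    '123/1/2'
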
- - :param values: a dictionary with key/values to update. - """ - if not isinstance(values, dict): - raise TypeError("values should be a dict") - data = next(self._origin.apiget()) - data.update(values) - self._origin.apipost(jl={k: v for k, v in six.iteritems(data) - if k not in self._origin.ignore_fields}, - is_idempotent=True) - - def delete(self, key): - """Delete element by key. - - :param key: a string key - """ - self._origin.apidelete(key) - - def iter(self): - """Iterate through key/value pairs. - - :return: an iterator over key/value pairs. - :rtype: collections.Iterable - """ - return six.iteritems(next(self._origin.apiget())) - - -def wrap_kwargs(fn, kwargs_fn): - """Tiny wrapper to prepare modified version of function kwargs""" - def wrapped(*args, **kwargs): - kwargs = kwargs_fn(kwargs) - return fn(*args, **kwargs) - return wrapped - - -def proxy_methods(origin, successor, methods): - """A helper to proxy methods from origin to successor. - - Accepts a list with strings and tuples: - - - each string defines: - a successor method name to proxy 1:1 with origin method - - each tuple should consist of 2 strings: - a successor method name and an origin method name - """ - for method in methods: - if isinstance(method, tuple): - successor_name, origin_name = method - else: - successor_name, origin_name = method, method - if not hasattr(successor, successor_name): - setattr(successor, successor_name, getattr(origin, origin_name)) - - -def format_iter_filters(params): - """Format iter() filter param on-the-fly. - - Support passing multiple filters at once as a list with tuples. - """ - filters = params.get('filter') - if filters and isinstance(filters, list): - filter_data = [] - for elem in params.pop('filter'): - if isinstance(elem, six.string_types): - filter_data.append(elem) - elif isinstance(elem, (list, tuple)): - filter_data.append(json.dumps(elem)) - else: - raise ValueError( - "Filter condition must be string, tuple or list") - if filter_data: - params['filter'] = filter_data - return params - - def update_kwargs(kwargs, **params): + """Update kwargs dict with non-empty params with json-encoded values.""" kwargs.update({k: json.dumps(v) if isinstance(v, dict) else v for k, v in params.items() if v is not None}) diff --git a/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz b/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz deleted file mode 100644 index bce71e44..00000000 --- a/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz +++ /dev/null @@ -1 +0,0 @@ 
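
To make the terse ``update_kwargs`` docstring above concrete — it drops ``None`` values and JSON-encodes dict values before updating the target dict in place — here is a small sketch based on the code shown in that hunk::

    >>> from scrapinghub.client.utils import update_kwargs
    >>> kwargs = {'state': 'running'}
    >>> update_kwargs(kwargs, count=10, spider=None, meta={'k': 'v'})
    >>> kwargs == {'state': 'running', 'count': 10, 'meta': '{"k": "v"}'}
    True
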
-eJyll/1XE2cWx0HrG+tbt12VVbcBDY1K3g1ENCoQ3kQEJOhoO7pD8pAZCJncmQkvulGKFS2K5VWsrgiV2tquortlPSurnnOfX/av2f9gz9lnngSxLnvadZNMcu7NnTz3uffzvTPpXZaCLJuwNisrS4kbRJPChqLGdcgWYZktBcttwmr2lUb0BPMSeC8FK2zCKuaSiRQhmg4rU7DKJrzHPEHJILBahDXCdmZVakqhxbPXUidpFo/LXWxxe0u8/hKXy1JVF4IcSVjJgpqI1kk0+JUIa4VtzD7ZVFWT9jldDo+lvKHHkNW40+vY6yiCdRJf5oSk9cB6ETYI65lVGg6ThFFoKVfVdoXARklYYXpjMbUL3hfh18I6ZjbUN4UKLfUNoZr6Y03wgcS3W66y/cYNe6gnQeBDEX4jbGReKZGIKWHJLIKzTVfjsElK8mVb1EgPbE7BFp64bmhKPAq5wsfMOJ+vG5KR1PNL8tX2/ML8NrVFiTDDk3443U5fkS8/Bb/VM+eawbA1BdvSpewgui5FCWwXljGrvhZ+x1cMqxECH9X+I5nkURqBJNENsKQgzyZvFqzMl9DUNhI2ApmVCvSEwpoSkHW7wULtaRPyZdakHTZeitIkq6imnOM7hJ0iWAUPc5dJuhK2nO74pc9TRVAgCTnszGadaPbSKKskfCyCjfc+wdtm18OalGBlkpMtTo/DxZ4R0umCXZKw4XXn7BVxtk+zlrtF2MNTjJ5TEoWWCGmNmUAVpknJ9NcugkOoNYuja62G2k7inkBLUArWdRc1x6uUruLiU+XdSijS1tJVU+U2fDHDr4aPlEKoo12PNJeHaoqrG4/G1NZOzZWoIKQoFPM3gHMJIFwiuIW8t4Dotnd1ddlbVa3DntRixMycRMCTLgQ7PU64esArwl7uayckYZdiSicBn8R5XVjkKIlHDRmKRCjmXfcWgz+903RdYJ8IJcJyZu927ob9DMKVnBRW2Agc4HyYVEOAxyQ1BQ4KO01ZGkaixOn0eh3pl89d4nf5XU7WCKeWjDs404eSyRQctslMzaU2E44y21LJlYsQFLKZ3wUVkrA1k5uu2804TY3Zuczs9ZoSVeJQKUIVj94N1UsUtEaEI0Iu8xqk23AmYpIS328Jy5KmEyPQHKq0+6FWSspMYUflXIHFZUGdLjORHLPJr4VRL3M9yEwDDUwD4bNmB8J6TroVOdAo/JMFfoTXVmfty6PjwjL6Cr+hw3Qwh/a51mbRKTqPky105gBOXzToOD7Dr7bU4jR9hDP4iP7FRf9KH3fiBD4kZXQ4fOEgjuBQLo7AavwWb+LADjqNIzUVESfObyo8Jus4icPb/Hgd6FD71iC9H9+0wdGWU+vFV914bxd+W1aDj/F7P52nM+wYpQ9P0hsX6fTmA/QencPv2xP4N3xOn9IJfEpv0j/TvoPYT6dwkE7QV/TJ6T05eC0Pr0v0koqX8XN3aQ+dpZP7cYI+WIMT2bS/oYDVBY5zNJhYlLgbmvohdByazaaesL0t0JMiCL9AoKckmQ2G0+86GD6RZKblT5fQsijJTBpnZJ8kM/DPcnTdvn3we4nP3nCGF7IwEiQRWjjq5s9AWJKZKCIyE4P8hgKIfFA4sEi+bqgaG6aOzg7HG9tzhNUOZ0yN6s6fDOVDbBZrRsAFraYiolwRMleEsqQi2kRoT2dd7IZYOuuFkNeDrEOE+GLW6hJaSIgAQsF/DBdTmzElTvS3haH9vPp0EQy5Oi2gpJwrNwr/WlBC+oHjdKjOlMHGFQyxF/QK3kpalQv4AJ8y4IdxILymCB/tbMRhOoLzn0CYflZgMOi/pFfolXCl/wx9uMKHV+ksix9R6NjRw9l0uAT78HlolaMRp+i0IGVvD+B3eKfpDPbi/Y14uxMfH8Yn9DGDe5hO4uUTtA9/3Mo0Oap90IMP6CjL4wX7zR/Y6QPY25JNb9OXHpwJ4F2mihnad74IX+KzVGt+vICOnmFbeErHaN8OOrsK762kgxsEOoi3CnG8VDlxYcuuIN6ik3Qg2oWPvKYwOmWmhq7j0M3nSM8bc+Tc4hw5z+bIMbPjf7CZcKZMOJlOLshpFVx8VxX0chV8Jouc2r4MtSY7VRUhuMSgPf3/QZvxWT2VbnYwP3v3FLArhBQ420564HMT6csc6X6O9JUlkb4qwhe8LPu8MPBfiL4mwvVFogeXIPqGCF/+L0QP/TzRwyKMLBA9yonu/ynRVjpQmoO9Fgb0EM7Tv+McnZZLGLIM5014KXgEp94PGvQZ9jEqntO71XjnU7yKTw514I9RnF3+Id7AuRz8Grw4GOvJX49/aqAzOMbCv6vG6SN0tpa+zGEXhjl8sQ7nogeceQE6htMN5qVpzERr/Djc5GhNvIHWrUW0vlpA6zZH604GrT9m0Lr7rmhNcrTuZdCaegutaYaW/q5ovcanoFWJsb8DAauvzOrxxEgniVlN2Mr3mB/eCqs3uGD7yva6rL4ge8HXySThN6vsDl4374Tu12YnHf8GcstbzQ== \ No newline at end of file diff --git a/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz b/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz deleted file mode 100644 index dc115941..00000000 --- a/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz +++ /dev/null @@ -1 +0,0 @@ 
-eJyllv1TFGcSx5GoIfieixoNXlaSxT1l32UhGEyQd1F5W5LRc9Rh92FnYNnZnpflRTeGGGIiEhNEPSWK5IQ7zouVGMvEnFWp6qeu6v6i+/l6ngVBi1Qsb3Z2t7q3Z55+uj/f3vk4Pwt5HmltXl6elrKYocQsTU+ZsEKGfE8WXvJIBfSTwcw0eRmszMIqj/QyuVSmxJlhwuosvOyRVpKnVrEYFMjwirSDrHpDK3WF9roOK4YrFAiWu4LhynBFZbDM1XA4CoWKtJqCOpiRYQaskWGtVET2hx0NTTmfP+ALuWpaBy1VT/nDvr2+CKxTxDIfKMYgrJdhg7SerOpYjKWtUleNrvdqDDYq0irHm0zq/bBJhleldWS2tnRES10trdGmliMd8AdFbLdGp/2mLG90MM3gNRk2SxvJq6TTSS2mOEXw95h6CrYotli2S48PwtYsvC4SNy1DSyVgm7SLjNPFpqVYtllcWaz3FpcW9+hdWpyMUO7wB/1lkUhxFrab89c6wfBGFopypexjpqkkGOyQ8slqaYY/ihVjepzBm83/sW0RZTCwmWmBKws7PepWyU2+tKH3sJhVNb9SiZnWqClVqum1KNSbM6FYpSa95RGlqLapooY2JHYIb8vglkLkPqCYWsx1rO95X0cjUKJIhXRlp8kMb3WCKgm7ZPCI3qdF27xmzFDSVCbV7vKHfAF6xVkmAH9SpA1POuetS9E+nVrulmGPSDExpKVLXXHWnXSAKs2RMt9frww+qdkpjml0W3ovS4WqumqV2sMDkc5Ug9ZfXn60ZkCLxnu6+psaglZZ0qrQYwerIdrXa8Y7a6JN5Y1th5J6d8YIpOsYi0STFa3gXwaIgAxBaeczQAx4+/v7vd260ee1jSRzMmdxCOUKQZenmFAPhGXYK3y9jKW9SlLLMChTBK8LixxiqYSlQkSGctH1cDlU5Haaqwu8I0Ol9BLZu/27YR9BuFqQQoWNw7uCD4dqqBIxtqHBfultR5aWla70+8NhX+4sC1ZWBCoCfmqE37BTPsH0e7adhfc9Kqm52uPAccCzXHI1MtRKK8gfgDpFemM+N9P0OnGGnvQKmXlbDC2hpaBehgYRvRsalylokwwHpW3ktdiA5U8nFS21zxVTFcNkVlVntN5bAc2KrZLCDqnbJIrLg8OmSiI54lGfCKNFFXpQSQOtpIHYSacDMbMw14pCaJP+S4Fv4mhBXuNOfkXK57/iDP+ajxXyc661efz2aZzCb/kdfIzD2/ld/k9+C2djeL2gB89n+CRO8wf8Jr92gj98i3+3McMv8ksNr4b4J6eq+DD+wGfxPsO/46c9OFnPp/He6oZ+wzyB37/Dv8eJ9gRejTTyqyd38O+yoSKc7TnDf8RzfHwtPuIThfamrXzqTLaST9HyP/MRPrPhI5zll/lYKf6ykv8N7/LJNJ9r9nbzh5TxyCm69BGex2vH8PpRvIA/4CU/f9z2umcdfsWn8F8U9SXODeWfLcGRTAnVB9oFIiQaLRWEjs8g2g6dTnM/8Dwr1A9lkJ5DqEcVlQbEsRcdEH9WVNL08WU0LSsqSeSEWqaoJICTAuFgJAynFDGDY/PcsIXRoMjQJZB3bgMxRSVxxFUShbpECUzdL727qADT0g0aqr5Mn2/J9nwxvc+f1BOm/6nh/B7NZMOqCkC3o4yEUIYqlKEtq4weGXpzWYeDkMxlvRDyZKD1yZBazFpfRhNpGUCo9ukh02cm0kqs91l5GL+vQVMGS23MychWt6lt0r8X9JA7ek+04zhex8khvIXDOBzEGwT9j2f5T3wcv+ETeJ9PFOE9PoOPcI4gHcEv+BzO4V0cDZtrjHbbxFF+Gcdwsi3KH+JtnDtOdxvHG1G8sb4dP+/EmwWH+E28wC+Sxh6coXte4Hf47S14bdcWPsP4LJ+ObMR7xqpG9xG2Pc1/wpktm/nMVpzGf6zBXzfh433r+Vi2+WA7jjbn0boPPHhlM40VyKjEdH87DIipMLhkKgwtToXTNBWOOH0743EQyzqIEe0fqTmWz74oyx8LlodVWbD3ibT5t5oG5wSVDhsNdVH4lKA89v9BOe9zh+qD9CY/fYZK6J9AqTrZywZhxEH2M4HseYHs58si+4UMF0TBIgEY/Q1iL8owtkjsl8sQe0mGr56f2K9/n9hxGS4vEDshiD3+nMReWkrs2LPE7unAUaLwPo5aAtkV+4v4yDqP87dyxQHpajtcEyD9ZQlI1xdBurEA0qQA6Zt5kG7Og3TrRUGaEiDdngdpWn0al28JF/NFcXmCREm3lqRH+Sp32QF3KJRkGZZ0OwDV7HG+wnXucO2CXXZgb8BdVksn/NW2mXjQpKdv03mKudO8wvb9D/2gOpI= \ No newline at end of file diff --git a/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz b/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz deleted file mode 100644 index 85e51ebd..00000000 --- a/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz +++ /dev/null @@ -1 +0,0 @@ 
-eJyllvt3FFUSx4cgD2PAALKIBpgMCEPMPPMaw0bJOyGEBDLBDtILnZmbdCeT7ql+5AGOxABRBOUVyHFXObIGc6K7sj4OKiCeU/WLf9ZW94THYs7R407PD1M11ffWrfp+qnuyIAe+oFTk8/k03RamkrI1Q7dgmQwFwRwsD0qr+S9TWFn2CngmByuC0ip2qUJJC9OClTlYFZSeYU+TYgtYLcOz0ha2Wkyt3B+v9Hcqpj8ejdX4YxW1Fa/VxmL+1s4kFCrSSg7qEeaoMOE5GYqkErbf7Gltz/si0XDc39g9YauGHqkIV4arYY3ibXNYMSdgrQzPS2vZqk+lRNYu9zcaxrAmoFiRVrjeTMYYg3UyrJfWsNnd1ZMs93d1J9u7DvTABsU7bqPB59XtUHIiK+AFGTZKxexVstmMllLcIkSGLEOHvyiOt22/kZ6ATTl40Uvcsk1NH4TN0i42TgYsW7EdK1AbMIYD5YEho19LsxHPfyKxSFUiEcjBS9bivW4wvJyDknwpR4RlKYMCtkgFbHV1wFZvx5SRFrCt41fH8aJMAY6wbPDnoDSobpJeYV/WNIZEyq5b3GmnldW4KXWqFbI5NJQ3IaByk7YHvVLUO1xRUzvhnRB2yPCKFGd3g2JpKf+RkT969VXDTkUq5Dt7LWGG6ge5krBLhqDX+6zXtpCVMpUsl0l1+iPxcJSvtBiNwm5Fev5R50LNOp/TrWWZDK96KQ6e0LLl/rQYyLiCKs8rZbG/IRnCUodbHMscsI1hocfr+puUps7x6l69VRurqelrHNeS6aH+sfbWmF2VsRNGal89JEeGrXRvY7K9pu3g/owxMGpGs81CVCcziW6ILCGIqAwxqfQpQYyHxsbGQgOGORJyzIxwMxdpiOcLwbfrwqMHKmSo9HzDQmRDSkYbFVCleHp9uMl+oQ/aKlTLUON1vaIGEvmT5usCr8lQKy1nuyxSBntYhCs9pXBh0/BXTx+uqqHOi3FMDV6XdrhY2na2NhKpqAjnv1Wx2kQ0EY1wIyKmo4c9Tb/hODnYG1SZ5vqgK46G4FLJNcrQJC1jfxSaFenlxdwsK+TGmUYm5GEW6jK1QU2HFhlavegyaFuioO0y7JM2s9cW43Ykm1E0fY8/pSqmJey63mRLKAEdiqMyYfvVzRLH+aDTUhmSA0H1ERhdqseDygx0MwOpY24HUlZhvhWFcFB6jjPYhudX+3CylK5JBfQLXqOPOtbiVLR4JX2GP+O8H7+rjuMt9s/SDfyRZprqfHj5pTa1VODdAN2gT2mhBc/tw3/iPfqJ5vGaUTCEH+J0fH1Zh4UzDXQLF+gCza4r4jWu4vXx4zmaBbqDX9PnvjV0uZumY7SAl6vdDegMTW6vwemTuyvwa4NO01c454by6nfwB/qEPjZLcCGw7fCGJvyopYeu8k03af6dMZyme0N0d+du+mL90YEdXavC2+lMCd6kG8X0Hp3W2vnnf1rxLN1P4mQbzuHUqRKcpclxH02KU/Q5znIp4JCnGyZJ02PQMw3JQ9Drdvxw8Gl635RB+gP09ikqT40jf3ZqvKWoDPrRJUCXFZW5+ZtapahMxTFP17GaBBxXvMGcWhSTeDgvFBn6PQ7cZSClqExMWmVS1CfwEOrr0t7HWFi2YfKkDY+OhJ84XjhljEQWx6sV+Z+p/QYPa9Oui8KAi8ygh4zqIaMticyQDMP5zBNVkMln/jDk0aQbkUF/nLmxBCxZGUDa+Zvp48Kb0XRhPU2O+ft4WjLYalueMEfdrB6Utj5EJf9xOaHzhTi19wUGJYcX6Ru8je/RF/jlAfwOL+L1FXStdscunKFvcWrrplJ8cHL5uo1ButxHD2ie7qxJyPTzlrX0Zd9R+pbePxKmyb3NtEA3ca69/1X8vrKaruP1UvqK/oEz+AHexSk8/2LtPobpNpM5U1zEUM7jfU5jCm/Rh3S3GO9w7KUO/PcGnMYHDO/Zyl1DJbzZ3J4jeINRupI6WsQI/kK3aI5ub2zEq0zVg0CCvqFzeMlghn7qp3/xMvdwtpEulNIM3m/o3I0X6Me3gO7jObqk0Gd0Bc/gdJL+fvxtH10cxnN4Fq+49IyqjMzYIRj3JtHEE5PoxONJdJIn0QFXEm8HXQXnXAUzTO+oeVRO/VlUJj1U3lVlT9pTi9J2xdXanITTrOyG/0fZ/CxR6o4Niwk44zjCe8Xg9y7LfX6d7VjmhP8LWciWDQ== \ No newline at end of file diff --git a/tests/client/test_items.py b/tests/client/test_items.py index 38af756a..1dfaeaf1 100644 --- a/tests/client/test_items.py +++ b/tests/client/test_items.py @@ -1,10 +1,6 @@ -import json - import pytest from six.moves import range -from scrapinghub.hubstorage.serialization import mpdecode - def _add_test_items(job): for i in range(3): @@ -29,18 +25,6 @@ def test_items_iter(spider): with pytest.raises(StopIteration): next(o) - o = job.items.iter_raw_json(offset=2) - item = json.loads(next(o)) - assert item['id'] == 2 - assert item['data'] == 'data2' - with pytest.raises(StopIteration): - next(o) - - msgpacked_o = job.items.iter_raw_msgpack(offset=2) - o = mpdecode(msgpacked_o) - assert item['id'] == 2 - assert item['data'] == 'data2' - def test_items_list(spider): job = spider.jobs.run(meta={'state': 'running'}) diff --git a/tests/client/test_logs.py b/tests/client/test_logs.py index 52b42ec3..88cb4b85 100644 --- a/tests/client/test_logs.py +++ b/tests/client/test_logs.py @@ -1,11 +1,9 @@ -import json import types from numbers import Integral import pytest from scrapinghub.client.utils import LogLevel -from scrapinghub.hubstorage.serialization import mpdecode from .conftest import TEST_TS @@ -103,35 +101,3 @@ def test_logs_list_filter(spider): logs3 = 
job.logs.list(filter=[('message', 'contains', ['simple'])]) assert len(logs3) == 3 - - -def test_logs_iter_raw_json(spider): - job = spider.jobs.run() - _add_test_logs(job) - - logs0 = job.logs.iter_raw_json(offset=2) - raw_log0 = next(logs0) - log0 = json.loads(raw_log0) - assert log0.get('message') == 'simple-msg3' - assert log0.get('_key') - assert isinstance(log0.get('time'), Integral) - assert log0.get('level') == 10 - - logs1 = job.logs.iter_raw_json(level='ERROR') - raw_log1 = next(logs1) - log1 = json.loads(raw_log1) - assert log1.get('message') == 'error-msg' - - -def test_logs_iter_raw_msgpack(spider): - job = spider.jobs.run() - _add_test_logs(job) - - logs1 = job.logs.iter_raw_msgpack(offset=2) - assert isinstance(logs1, types.GeneratorType) - unpacked_logs1 = list(mpdecode(logs1)) - assert unpacked_logs1[0].get('message') == 'simple-msg3' - - logs2 = job.logs.iter_raw_msgpack(level='ERROR') - unpacked_logs2 = list(mpdecode(logs2)) - assert unpacked_logs2[0].get('message') == 'error-msg' diff --git a/tests/client/test_proxy.py b/tests/client/test_proxy.py new file mode 100644 index 00000000..7fd4f272 --- /dev/null +++ b/tests/client/test_proxy.py @@ -0,0 +1,38 @@ +import pytest + +from scrapinghub.client.proxy import _format_iter_filters + + +def test_format_iter_filters(): + # work with empty params + assert _format_iter_filters({}) == {} + + # doesn't affect other params + params = {'a': 123, 'b': 456} + assert _format_iter_filters(params) == params + + # pass filter as-is if not list + params = {'filter': 'some-string'} + assert _format_iter_filters(params) == params + + # work fine with empty filter + params = {'filter': []} + assert _format_iter_filters(params) == params + + # pass string filters as-is + params = {'filter': ['str1', 'str2']} + assert _format_iter_filters(params) == params + + # converts list-formatted filters + params = {'filter': [['field', '>=', ['val']], 'filter2']} + assert (_format_iter_filters(params) == + {'filter': ['["field", ">=", ["val"]]', 'filter2']}) + + # works the same with tuple entries + params = {'filter': [('field', '==', ['val'])]} + assert (_format_iter_filters(params) == + {'filter': ['["field", "==", ["val"]]']}) + + # exception if entry is not list/tuple or string + with pytest.raises(ValueError): + _format_iter_filters({'filter': ['test', 123]}) diff --git a/tests/client/test_requests.py b/tests/client/test_requests.py index 1d2e3bca..a71b3820 100644 --- a/tests/client/test_requests.py +++ b/tests/client/test_requests.py @@ -1,5 +1,3 @@ -import json - import pytest from .conftest import TEST_TS @@ -39,18 +37,3 @@ def test_requests_iter(spider): } with pytest.raises(StopIteration): next(rr) - - -def test_requests_iter_raw_json(spider): - job = spider.jobs.run() - _add_test_requests(job) - job.requests.close() - - rr = job.requests.iter_raw_json() - raw_req = next(rr) - req = json.loads(raw_req) - assert req.get('url') == 'http://test.com/' - assert req.get('status') == 200 - next(rr), next(rr) - with pytest.raises(StopIteration): - next(rr) diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 03e4362c..f109894c 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -5,42 +5,6 @@ import mock from scrapinghub.client.utils import parse_auth -from scrapinghub.client.utils import format_iter_filters - - -def test_format_iter_filters(): - # work with empty params - assert format_iter_filters({}) == {} - - # doesn't affect other params - params = {'a': 123, 'b': 456} - assert 
format_iter_filters(params) == params - - # pass filter as-is if not list - params = {'filter': 'some-string'} - assert format_iter_filters(params) == params - - # work fine with empty filter - params = {'filter': []} - assert format_iter_filters(params) == params - - # pass string filters as-is - params = {'filter': ['str1', 'str2']} - assert format_iter_filters(params) == params - - # converts list-formatted filters - params = {'filter': [['field', '>=', ['val']], 'filter2']} - assert (format_iter_filters(params) == - {'filter': ['["field", ">=", ["val"]]', 'filter2']}) - - # works the same with tuple entries - params = {'filter': [('field', '==', ['val'])]} - assert (format_iter_filters(params) == - {'filter': ['["field", "==", ["val"]]']}) - - # exception if entry is not list/tuple or string - with pytest.raises(ValueError): - format_iter_filters({'filter': ['test', 123]}) def test_parse_auth_none():